//===----------- VectorUtils.cpp - Vectorizer utility functions -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines vectorizer utilities.
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "vectorutils"

/// Maximum factor for an interleaved memory access.
static cl::opt<unsigned> MaxInterleaveGroupFactor(
    "max-interleave-group-factor", cl::Hidden,
    cl::desc("Maximum factor for an interleaved access group (default = 8)"),
    cl::init(8));
/// Return true if all of the intrinsic's arguments and return type are scalars
/// for the scalar form of the intrinsic, and vectors for the vector form of the
/// intrinsic (except operands that are marked as always being scalar by
/// isVectorIntrinsicWithScalarOpAtArg).

  case Intrinsic::abs:
  // Begin integer bit-manipulation.
  case Intrinsic::bswap:
  case Intrinsic::bitreverse:
  case Intrinsic::ctpop:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::smul_fix:
  case Intrinsic::smul_fix_sat:
  case Intrinsic::umul_fix:
  case Intrinsic::umul_fix_sat:
  case Intrinsic::sqrt:
  // Begin floating-point.
  case Intrinsic::atan2:
  case Intrinsic::sincos:
  case Intrinsic::sincospi:
  case Intrinsic::exp10:
  case Intrinsic::ldexp:
  case Intrinsic::log10:
  case Intrinsic::minnum:
  case Intrinsic::maxnum:
  case Intrinsic::minimum:
  case Intrinsic::maximum:
  case Intrinsic::minimumnum:
  case Intrinsic::maximumnum:
  case Intrinsic::copysign:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::nearbyint:
  case Intrinsic::round:
  case Intrinsic::roundeven:
  case Intrinsic::fmuladd:
  case Intrinsic::is_fpclass:
  case Intrinsic::powi:
  case Intrinsic::canonicalize:
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat:
  case Intrinsic::lround:
  case Intrinsic::llround:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
  case Intrinsic::ucmp:
  case Intrinsic::scmp:

    return TTI->isTargetIntrinsicTriviallyScalarizable(ID);

  // TODO: Move frexp to isTriviallyVectorizable.
  // https://github.com/llvm/llvm-project/issues/112408
  case Intrinsic::frexp:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::umul_with_overflow:
  case Intrinsic::smul_with_overflow:
/// Identifies if the vector form of the intrinsic has a scalar operand.
bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
                                              unsigned ScalarOpdIdx,
                                              const TargetTransformInfo *TTI) {

    return TTI->isTargetIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx);

  // Vector predication intrinsics have the EVL as the last operand.

  case Intrinsic::vp_abs:
  case Intrinsic::ctlz:
  case Intrinsic::vp_ctlz:
  case Intrinsic::cttz:
  case Intrinsic::vp_cttz:
  case Intrinsic::is_fpclass:
  case Intrinsic::vp_is_fpclass:
  case Intrinsic::powi:
  case Intrinsic::vector_extract:
    return (ScalarOpdIdx == 1);
  case Intrinsic::smul_fix:
  case Intrinsic::smul_fix_sat:
  case Intrinsic::umul_fix:
  case Intrinsic::umul_fix_sat:
    return (ScalarOpdIdx == 2);
  case Intrinsic::experimental_vp_splice:
    return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
    return TTI->isTargetIntrinsicWithOverloadTypeAtArg(ID, OpdIdx);

    return OpdIdx == -1 || OpdIdx == 0;

  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat:
  case Intrinsic::lround:
  case Intrinsic::llround:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
  case Intrinsic::vp_lrint:
  case Intrinsic::vp_llrint:
  case Intrinsic::ucmp:
  case Intrinsic::scmp:
  case Intrinsic::vector_extract:
    return OpdIdx == -1 || OpdIdx == 0;
  case Intrinsic::modf:
  case Intrinsic::sincos:
  case Intrinsic::sincospi:
  case Intrinsic::is_fpclass:
  case Intrinsic::vp_is_fpclass:
  case Intrinsic::powi:
  case Intrinsic::ldexp:
    return OpdIdx == -1 || OpdIdx == 1;

    return TTI->isTargetIntrinsicWithStructReturnOverloadAtField(ID, RetIdx);

  case Intrinsic::frexp:
    return RetIdx == 0 || RetIdx == 1;
/// Returns intrinsic ID for call.
/// For the input call instruction it finds the mapped intrinsic and returns
/// its ID; if it cannot find one, it returns not_intrinsic.

  if (ID == Intrinsic::lifetime_start || ID == Intrinsic::lifetime_end ||
      ID == Intrinsic::assume ||
      ID == Intrinsic::experimental_noalias_scope_decl ||
      ID == Intrinsic::sideeffect || ID == Intrinsic::pseudoprobe)
    return Intrinsic::not_intrinsic;
  case Intrinsic::vector_interleave2:
  case Intrinsic::vector_interleave3:
  case Intrinsic::vector_interleave4:
  case Intrinsic::vector_interleave5:
  case Intrinsic::vector_interleave6:
  case Intrinsic::vector_interleave7:
  case Intrinsic::vector_interleave8:

  case Intrinsic::vector_deinterleave2:
  case Intrinsic::vector_deinterleave3:
  case Intrinsic::vector_deinterleave4:
  case Intrinsic::vector_deinterleave5:
  case Intrinsic::vector_deinterleave6:
  case Intrinsic::vector_deinterleave7:
  case Intrinsic::vector_deinterleave8:
  [[maybe_unused]] unsigned Factor =
      getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
  assert(Factor && Factor == DISubtypes.size() &&
         "unexpected deinterleave factor or result type");
/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then
/// extracted from the vector.
  assert(V->getType()->isVectorTy() && "Not looking at a vector?");

  // For fixed-length vector, return poison for out of range access.
  unsigned Width = FVTy->getNumElements();

    return C->getAggregateElement(EltNo);

    // If this is an insert to a variable element, we don't know what it is.

    // If this is an insert to the element we are looking for, return the
    // inserted value.
      return III->getOperand(1);

    // Guard against infinite loop on malformed, unreachable IR.
    if (III == III->getOperand(0))

    // Otherwise, the insertelement doesn't modify the value, recurse on its
    // vector operand.

    // Restrict the following transformation to fixed-length vector.

    if (InEl < (int)LHSWidth)

  // Extract a value from a vector add operation with a constant zero.
  // TODO: Use getBinOpIdentity() to generalize this.
      if (Constant *Elt = C->getAggregateElement(EltNo))
        if (Elt->isNullValue())

  // If the vector is a splat then we can trivially find the scalar element.
    if (EltNo < VTy->getElementCount().getKnownMinValue())

  // Otherwise, we don't know.
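// Illustrative example: given
//   %v = insertelement <4 x i32> %w, i32 %x, i32 2
// findScalarElement(%v, 2) returns %x, while findScalarElement(%v, 1)
// recurses into %w.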
  // Ignore invalid (undefined) mask elements.

  // There can be only 1 non-negative mask element value if this is a splat.
  if (SplatIndex != -1 && SplatIndex != M)

  // Initialize the splat index to the 1st non-negative mask element.

  assert((SplatIndex == -1 || SplatIndex >= 0) && "Negative index?");
/// Get splat value if the input is a splat vector or return nullptr.
/// This function is not fully general. It checks only 2 cases:
/// the input value is (1) a splat constant vector or (2) a sequence
/// of instructions that broadcasts a scalar at element 0.
    return C->getSplatValue();

  // shuf (inselt ?, Splat, 0), ?, <0, undef, 0, ...>

    // FIXME: We can allow undefs, but if Index was specified, we may want to
    // check that the constant is defined at that index.
      return C->getSplatValue() != nullptr;

    // FIXME: We can safely allow undefs here. If Index was specified, we will
    // check that the mask elt is defined at the required index.

    // Match a specific element. The mask should be defined at and match the
    // specified index.
    return Shuf->getMaskValue(Index) == Index;

  // The remaining tests are all recursive, so bail out if we hit the limit.

  // If both operands of a binop are splats, the result is a splat.

  // If all operands of a select are splats, the result is a splat.

  // TODO: Add support for unary ops (fneg), casts, intrinsics (overflow ops).
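// Illustrative example: a broadcast sequence such as
//   %ins   = insertelement <4 x float> poison, float %f, i64 0
//   %splat = shufflevector <4 x float> %ins, <4 x float> poison,
//                          <4 x i32> zeroinitializer
// is recognized here: getSplatValue returns %f and isSplatValue returns true.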
bool llvm::getShuffleDemandedElts(int SrcWidth, ArrayRef<int> Mask,
                                  const APInt &DemandedElts, APInt &DemandedLHS,
                                  APInt &DemandedRHS, bool AllowUndefElts) {

  // Early out if we don't demand any elements.
  if (DemandedElts.isZero())

  // Simple case of a shuffle with zeroinitializer.
  if (all_of(Mask, [](int Elt) { return Elt == 0; })) {

  for (unsigned I = 0, E = Mask.size(); I != E; ++I) {
    assert((-1 <= M) && (M < (SrcWidth * 2)) &&
           "Invalid shuffle mask constant");

    if (!DemandedElts[I] || (AllowUndefElts && (M < 0)))

    // For undef elements, we don't know anything about the common state of
    // the shuffle result.

      DemandedRHS.setBit(M - SrcWidth);
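// Worked example: with SrcWidth = 4, Mask = <0, 5, 2, 7> and all four result
// elements demanded, DemandedLHS ends up with bits {0, 2} set and DemandedRHS
// with bits {1, 3} set (mask values >= SrcWidth select from the second
// operand).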
bool llvm::isMaskedSlidePair(ArrayRef<int> Mask, int NumElts,
                             std::array<std::pair<int, int>, 2> &SrcInfo) {
  const int SignalValue = NumElts * 2;
  SrcInfo[0] = {-1, SignalValue};
  SrcInfo[1] = {-1, SignalValue};

    int Src = M >= (int)NumElts;
    int Diff = (int)i - (M % NumElts);

    for (int j = 0; j < 2; j++) {
      auto &[SrcE, DiffE] = SrcInfo[j];

        assert(DiffE == SignalValue);

      if (SrcE == Src && DiffE == Diff) {

  // Avoid all-undef masks.
  return SrcInfo[0].first != -1;
  assert(Scale > 0 && "Unexpected scaling factor");

  // Fast-path: if no scaling, then it is just a copy.
    ScaledMask.assign(Mask.begin(), Mask.end());

  for (int MaskElt : Mask) {
    assert(((uint64_t)Scale * MaskElt + (Scale - 1)) <= INT32_MAX &&
           "Overflowed 32-bits");

    for (int SliceElt = 0; SliceElt != Scale; ++SliceElt)
      ScaledMask.push_back(MaskElt < 0 ? MaskElt : Scale * MaskElt + SliceElt);
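// Worked example: narrowing each element by Scale = 2 turns the mask <0, 2>
// into <0, 1, 4, 5>; a negative (undef/poison) element such as -1 is simply
// replicated Scale times.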
  assert(Scale > 0 && "Unexpected scaling factor");

  // Fast-path: if no scaling, then it is just a copy.
    ScaledMask.assign(Mask.begin(), Mask.end());

  // We must map the original elements down evenly to a type with fewer
  // elements.
  int NumElts = Mask.size();
  if (NumElts % Scale != 0)

  ScaledMask.reserve(NumElts / Scale);

  // Step through the input mask by splitting into Scale-sized slices.
    assert((int)MaskSlice.size() == Scale && "Expected Scale-sized slice.");

    // The first element of the slice determines how we evaluate this slice.
    int SliceFront = MaskSlice.front();
    if (SliceFront < 0) {
      // Negative values (undef or other "sentinel" values) must be equal
      // across the entire slice.

      // A positive mask element must be cleanly divisible.
      if (SliceFront % Scale != 0)

      // Elements of the slice must be consecutive.
      for (int i = 1; i < Scale; ++i)
        if (MaskSlice[i] != SliceFront + i)

      ScaledMask.push_back(SliceFront / Scale);

    Mask = Mask.drop_front(Scale);
  } while (!Mask.empty());

  assert((int)ScaledMask.size() * Scale == NumElts && "Unexpected scaled mask");

  // All elements of the original mask can be scaled down to map to the elements
  // of a mask with wider elements.
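// Worked example: with Scale = 2 the mask <0, 1, 4, 5> widens to <0, 2>,
// whereas <0, 1, 4, 6> fails because the second slice is not consecutive.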
  unsigned NumElts = M.size();
  if (NumElts % 2 != 0)

  for (unsigned i = 0; i < NumElts; i += 2) {

    // If both elements are undef, new mask is undef too.
    if (M0 == -1 && M1 == -1) {

    if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {

    if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {

  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
  unsigned NumSrcElts = Mask.size();
  assert(NumSrcElts > 0 && NumDstElts > 0 && "Unexpected scaling factor");

  // Fast-path: if no scaling, then it is just a copy.
  if (NumSrcElts == NumDstElts) {
    ScaledMask.assign(Mask.begin(), Mask.end());

  // Ensure we can find a whole scale factor.
  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
         "Unexpected scaling factor");

  if (NumSrcElts > NumDstElts) {
    int Scale = NumSrcElts / NumDstElts;

    int Scale = NumDstElts / NumSrcElts;
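// Worked example: scaling the 4-element mask <0, 2, 1, 3> to NumDstElts = 8
// narrows it to <0, 1, 4, 5, 2, 3, 6, 7>; scaling an 8-element mask down to
// 4 destination elements only succeeds if adjacent elements can be paired up
// by the widening step above.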
  std::array<SmallVector<int, 16>, 2> TmpMasks;

  for (unsigned Scale = 2; Scale <= InputMask.size(); ++Scale) {
void llvm::processShuffleMasks(
    ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
    unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
    function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
    function_ref<void(ArrayRef<int>, unsigned, unsigned, bool)>
        ManyInputsAction) {

  // Try to perform better estimation of the permutation.
  // 1. Split the source/destination vectors into real registers.
  // 2. Do the mask analysis to identify which real registers are
  //    permuted.
  int Sz = Mask.size();
  unsigned SzDest = Sz / NumOfDestRegs;
  unsigned SzSrc = Sz / NumOfSrcRegs;
  for (unsigned I = 0; I < NumOfDestRegs; ++I) {
    auto &RegMasks = Res[I];
    RegMasks.assign(2 * NumOfSrcRegs, {});
    // Check that the values in dest registers are in the one src
    // register.
    for (unsigned K = 0; K < SzDest; ++K) {
      int Idx = I * SzDest + K;

      int MaskIdx = Mask[Idx] % Sz;
      int SrcRegIdx = MaskIdx / SzSrc + (Mask[Idx] >= Sz ? NumOfSrcRegs : 0);
      // Add a cost of PermuteTwoSrc for each new source register permute,
      // if we have more than one source register.
      if (RegMasks[SrcRegIdx].empty())

      RegMasks[SrcRegIdx][K] = MaskIdx % SzSrc;

  // Process split mask.

    switch (NumSrcRegs) {

      // No input vectors were used!

      // Find the only mask with at least a single undef mask element.

      unsigned SrcReg = std::distance(Dest.begin(), It);
      SingleInputAction(*It, SrcReg, I);

      // The first mask is a permutation of a single register. Since we have >2
      // input registers to shuffle, we merge the masks for the first 2
      // registers and generate a shuffle of 2 registers rather than reordering
      // the first register and then shuffling with the second register. Next,
      // generate the shuffles of the resulting register + the remaining
      // registers from the list.

        for (int Idx = 0, VF = FirstMask.size(); Idx < VF; ++Idx) {

                   "Expected undefined mask element.");
            FirstMask[Idx] = SecondMask[Idx] + VF;

        for (int Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {

          if (FirstIdx == SecondIdx) {

          SecondMask = RegMask;
          CombineMasks(FirstMask, SecondMask);
          ManyInputsAction(FirstMask, FirstIdx, SecondIdx, NewReg);

          NormalizeMask(FirstMask);

          SecondMask = FirstMask;
          SecondIdx = FirstIdx;

        if (FirstIdx != SecondIdx && SecondIdx >= 0) {
          CombineMasks(SecondMask, FirstMask);
          ManyInputsAction(SecondMask, SecondIdx, FirstIdx, NewReg);

          Dest[FirstIdx].clear();
          NormalizeMask(SecondMask);

      } while (SecondIdx >= 0);
void llvm::getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth,
                                               const APInt &DemandedElts,
                                               APInt &DemandedLHS,
                                               APInt &DemandedRHS) {
  assert(VectorBitWidth >= 128 && "Vectors smaller than 128 bit not supported");
  int NumLanes = VectorBitWidth / 128;

  int NumEltsPerLane = NumElts / NumLanes;
  int HalfEltsPerLane = NumEltsPerLane / 2;

  // Map DemandedElts to the horizontal operands.
  for (int Idx = 0; Idx != NumElts; ++Idx) {
    if (!DemandedElts[Idx])

    int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
    int LocalIdx = Idx % NumEltsPerLane;
    if (LocalIdx < HalfEltsPerLane) {
      DemandedLHS.setBit(LaneIdx + 2 * LocalIdx);

      LocalIdx -= HalfEltsPerLane;
      DemandedRHS.setBit(LaneIdx + 2 * LocalIdx);
  // DemandedBits will give us every value's live-out bits. But we want
  // to ensure no extra casts would need to be inserted, so every DAG
  // of connected values must have the same minimum bitwidth.

  // Determine the roots. We work bottom-up, from truncs or icmps.
  bool SeenExtFromIllegalType = false;
  for (auto *BB : Blocks)
    for (auto &I : *BB) {
      InstructionSet.insert(&I);

          !TTI->isTypeLegal(I.getOperand(0)->getType()))
        SeenExtFromIllegalType = true;

      // Only deal with non-vector integers up to 64-bits wide.
          !I.getType()->isVectorTy() &&
          I.getOperand(0)->getType()->getScalarSizeInBits() <= 64) {
        // Don't make work for ourselves. If we know the loaded type is legal,
        // don't add it to the worklist.

  if (Worklist.empty() || (TTI && !SeenExtFromIllegalType))

  // Now proceed breadth-first, unioning values together.
  while (!Worklist.empty()) {

    // If we encounter a type that is larger than 64 bits, we can't represent
    // it so bail out.
    if (DB.getDemandedBits(I).getBitWidth() > 64)

    uint64_t V = DB.getDemandedBits(I).getZExtValue();

    // Casts, loads and instructions outside of our range terminate a chain
    // successfully.
        !InstructionSet.count(I))

    // Unsafe casts terminate a chain unsuccessfully. We can't do anything
    // useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to
    // transform anything that relies on them.
        !I->getType()->isIntegerTy()) {
      DBits[Leader] |= ~0ULL;

    // We don't modify the types of PHIs. Reductions will already have been
    // truncated if possible, and inductions' sizes will have been chosen by
    // indvars.

    // Don't modify the types of operands of a call, as doing that would cause a
    // signature mismatch.

    if (DBits[Leader] == ~0ULL)
      // All bits demanded, no point continuing.

    for (Value *O : I->operands()) {

  // Now we've discovered all values, walk them to see if there are
  // any users we didn't see. If there are, we can't optimize that
  // chain of instructions.
  for (auto &I : DBits)
    for (auto *U : I.first->users())
      if (U->getType()->isIntegerTy() && DBits.count(U) == 0)

  for (const auto &E : ECs) {

      LeaderDemandedBits |= DBits[M];

    // Round up to a power of 2.

    // We don't modify the types of PHIs. Reductions will already have been
    // truncated if possible, and inductions' sizes will have been chosen by
    // indvars.
    // If we are required to shrink a PHI, abandon this entire equivalence
    // class.

      Type *Ty = M->getType();
        Ty = MI->getOperand(0)->getType();

      if (MinBW >= Ty->getScalarSizeInBits())

      // If any of M's operands demand more bits than MinBW then M cannot be
      // performed safely in MinBW.

        // For constant shift amounts, check if the shift would result in
            U.getOperandNo() == 1)
          return CI->uge(MinBW);
/// Add all access groups in @p AccGroups to @p List.
template <typename ListT>

  // Interpret an access group as a list containing itself.
    List.insert(AccGroups);

  for (const auto &AccGroupListOp : AccGroups->operands()) {

  if (AccGroups1 == AccGroups2)

  if (Union.size() == 0)

  if (Union.size() == 1)

  if (!MayAccessMem1 && !MayAccessMem2)

    return Inst2->getMetadata(LLVMContext::MD_access_group);

    return Inst1->getMetadata(LLVMContext::MD_access_group);

  // Use set for scalable 'contains' check.

    if (AccGroupSet2.count(MD1))

      if (AccGroupSet2.count(Item))

  if (Intersection.size() == 0)

  if (Intersection.size() == 1)
/// Add metadata from \p Inst to \p Metadata, if it can be preserved after
/// vectorization.

  static const unsigned SupportedIDs[] = {
      LLVMContext::MD_tbaa,         LLVMContext::MD_alias_scope,
      LLVMContext::MD_noalias,      LLVMContext::MD_fpmath,
      LLVMContext::MD_nontemporal,  LLVMContext::MD_invariant_load,
      LLVMContext::MD_access_group, LLVMContext::MD_mmra};

  // Remove any unsupported metadata kinds from Metadata.
  for (unsigned Idx = 0; Idx != Metadata.size();) {

      // Swap element to end and remove it.

/// \returns \p I after propagating metadata from \p VL.

  for (auto &[Kind, MD] : Metadata) {
    for (int J = 1, E = VL.size(); MD && J != E; ++J) {

      case LLVMContext::MD_mmra: {

      case LLVMContext::MD_tbaa:

      case LLVMContext::MD_alias_scope:

      case LLVMContext::MD_fpmath:

      case LLVMContext::MD_noalias:
      case LLVMContext::MD_nontemporal:
      case LLVMContext::MD_invariant_load:

      case LLVMContext::MD_access_group:
  // All 1's means mask is not needed.

  // TODO: support reversed access.

  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < Group.getFactor(); ++j) {
      unsigned HasMember = Group.getMember(j) ? 1 : 0;
      Mask.push_back(Builder.getInt1(HasMember));
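// Illustrative example (assumed semantics): for a group with a factor of 3
// that only has members at indices 0 and 2, and VF = 2, the gap mask is
// <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>.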
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < ReplicationFactor; j++)
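// For example, ReplicationFactor = 3 and VF = 4 produce the mask
// <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>.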
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < NumVecs; j++)
      Mask.push_back(j * VF + i);
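// For example, VF = 4 and NumVecs = 2 produce the mask <0, 4, 1, 5, 2, 6, 3, 7>.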
  for (unsigned i = 0; i < VF; i++)
    Mask.push_back(Start + i * Stride);
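// For example, Start = 0, Stride = 3 and VF = 4 produce the mask <0, 3, 6, 9>.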
llvm::SmallVector<int, 16> llvm::createSequentialMask(unsigned Start,
                                                      unsigned NumInts,
                                                      unsigned NumUndefs) {

  for (unsigned i = 0; i < NumInts; i++)
    Mask.push_back(Start + i);

  for (unsigned i = 0; i < NumUndefs; i++)
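// For example, Start = 0, NumInts = 4 and NumUndefs = 4 produce the mask
// <0, 1, 2, 3, undef, undef, undef, undef>.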
  // Avoid casts in the loop and make sure we have a reasonable number.
  int NumEltsSigned = NumElts;
  assert(NumEltsSigned > 0 && "Expected smaller or non-zero element count");

  // If the mask chooses an element from operand 1, reduce it to choose from the
  // corresponding element of operand 0. Undef mask elements are unchanged.
  for (int MaskElt : Mask) {
    assert((MaskElt < NumEltsSigned * 2) && "Expected valid shuffle mask");
    int UnaryElt = MaskElt >= NumEltsSigned ? MaskElt - NumEltsSigned : MaskElt;
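// Worked example: with NumElts = 4 the binary mask <0, 5, 2, 7> becomes the
// unary mask <0, 1, 2, 3>, which is equivalent when both shuffle operands are
// the same value.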
/// A helper function for concatenating vectors. This function concatenates two
/// vectors having the same element type. If the second vector has fewer
/// elements than the first, it is padded with undefs.
  assert(VecTy1 && VecTy2 &&
         VecTy1->getScalarType() == VecTy2->getScalarType() &&
         "Expect two vectors with the same element type");

  assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");

  if (NumElts1 > NumElts2) {
    // Extend with UNDEFs.
    V2 = Builder.CreateShuffleVector(

  return Builder.CreateShuffleVector(

  unsigned NumVecs = Vecs.size();
  assert(NumVecs > 1 && "Should be at least two vectors");

    for (unsigned i = 0; i < NumVecs - 1; i += 2) {
      Value *V0 = ResList[i], *V1 = ResList[i + 1];
      assert((V0->getType() == V1->getType() || i == NumVecs - 2) &&
             "Only the last vector may have a different type");

    // Push the last vector if the total number of vectors is odd.
    if (NumVecs % 2 != 0)
      TmpList.push_back(ResList[NumVecs - 1]);

    NumVecs = ResList.size();
  } while (NumVecs > 1);

         "Mask must be a vector of i1");

    if (auto *MaskElt = ConstMask->getAggregateElement(I))

         "Mask must be a vector of i1");

    if (auto *MaskElt = ConstMask->getAggregateElement(I))

         "Mask must be a vector of i1");

    if (auto *MaskElt = ConstMask->getAggregateElement(I))
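// Illustrative examples (assumed semantics): for <i1 1, i1 undef>
// maskIsAllOneOrUndef returns true; for <i1 0, i1 undef> maskIsAllZeroOrUndef
// returns true; for <i1 0, i1 1> only maskContainsAllOneOrUndef returns true.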
/// TODO: This is a lot like known bits, but for
/// vectors. Is there something we can common this with?

         "Mask must be a fixed width vector of i1");

  const unsigned VWidth =
      cast<FixedVectorType>(Mask->getType())->getNumElements();

    for (unsigned i = 0; i < VWidth; i++)
      if (CV->getAggregateElement(i)->isNullValue())

  return DemandedElts;
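// Worked example: for the constant mask <i1 1, i1 0, i1 undef, i1 1> the
// possibly-demanded lanes are {0, 2, 3}; only lanes whose mask element is a
// known zero are cleared.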
bool InterleavedAccessInfo::isStrided(int Stride) {
  unsigned Factor = std::abs(Stride);

void InterleavedAccessInfo::collectConstStrideAccesses(
    MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
    const DenseMap<Value *, const SCEV *> &Strides) {
  auto &DL = TheLoop->getHeader()->getDataLayout();

  // Since it's desired that the load/store instructions be maintained in
  // "program order" for the interleaved access analysis, we have to visit the
  // blocks in the loop in reverse postorder (i.e., in a topological order).
  // Such an ordering will ensure that any load/store that may be executed
  // before a second load/store will precede the second load/store in
  // AccessStrideInfo.
  LoopBlocksDFS DFS(TheLoop);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
    for (auto &I : *BB) {

      // Currently, codegen doesn't support cases where the type size doesn't
      // match the alloc size. Skip them for now.
      uint64_t Size = DL.getTypeAllocSize(ElementTy);
      if (Size * 8 != DL.getTypeSizeInBits(ElementTy))

      // We don't check wrapping here because we don't know yet if Ptr will be
      // part of a full group or a group with gaps. Checking wrapping for all
      // pointers (even those that end up in groups with no gaps) will be overly
      // conservative. For full groups, wrapping should be ok since if we would
      // wrap around the address space we would do a memory access at nullptr
      // even without the transformation. The wrapping checks are therefore
      // deferred until after we've formed the interleaved groups.
      int64_t Stride =
          getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides,
                       /*Assume=*/true, /*ShouldCheckWrap=*/false)

      AccessStrideInfo[&I] =
          StrideDescriptor(Stride, Scev, Size, getLoadStoreAlignment(&I));
// Analyze interleaved accesses and collect them into interleaved load and
// store groups.
//
// When generating code for an interleaved load group, we effectively hoist all
// loads in the group to the location of the first load in program order. When
// generating code for an interleaved store group, we sink all stores to the
// location of the last store. This code motion can change the order of load
// and store instructions and may break dependences.
//
// The code generation strategy mentioned above ensures that we won't violate
// any write-after-read (WAR) dependences.
//
// E.g., for the WAR dependence:  a = A[i];  // (1)
//                                A[i] = b;  // (2)
//
// The store group of (2) is always inserted at or below (2), and the load
// group of (1) is always inserted at or above (1). Thus, the instructions will
// never be reordered. All other dependences are checked to ensure the
// correctness of the instruction reordering.
//
// The algorithm visits all memory accesses in the loop in bottom-up program
// order. Program order is established by traversing the blocks in the loop in
// reverse postorder when collecting the accesses.
//
// We visit the memory accesses in bottom-up order because it can simplify the
// construction of store groups in the presence of write-after-write (WAW)
// dependences.
//
// E.g., for the WAW dependence:  A[i] = a;      // (1)
//                                A[i] = b;      // (2)
//                                A[i + 1] = c;  // (3)
//
// We will first create a store group with (3) and (2). (1) can't be added to
// this group because it and (2) are dependent. However, (1) can be grouped
// with other accesses that may precede it in program order. Note that a
// bottom-up order does not imply that WAW dependences should not be checked.
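// As an illustration, a loop such as
//
//   for (i = 0; i < N; i += 2) {
//     a = A[i];     // member 0
//     b = A[i + 1]; // member 1
//     ...
//   }
//
// yields a factor-2 interleaved load group for the two loads, which can later
// be lowered to one wide load plus shufflevectors.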
void InterleavedAccessInfo::analyzeInterleaving(
    bool EnablePredicatedInterleavedMemAccesses) {

  const auto &Strides = LAI->getSymbolicStrides();

  // Holds all accesses with a constant stride.
  collectConstStrideAccesses(AccessStrideInfo, Strides);

  if (AccessStrideInfo.empty())

  // Collect the dependences in the loop.
  collectDependences();

  // Holds all interleaved store groups temporarily.
  // Holds all interleaved load groups temporarily.
  // Groups added to this set cannot have new members added.

  // Search in bottom-up program order for pairs of accesses (A and B) that can
  // form interleaved load or store groups. In the algorithm below, access A
  // precedes access B in program order. We initialize a group for B in the
  // outer loop of the algorithm, and then in the inner loop, we attempt to
  // insert each A into B's group if:
  //
  //  1. A and B have the same stride,
  //  2. A and B have the same memory object size, and
  //  3. A belongs in B's group according to its distance from B.
  //
  // Special care is taken to ensure group formation will not break any
  // dependences.
  for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();

    StrideDescriptor DesB = BI->second;

    // Initialize a group for B if it has an allowable stride. Even if we don't
    // create a group for B, we continue with the bottom-up algorithm to ensure
    // we don't break any of B's dependences.
    if (isStrided(DesB.Stride) &&
        (!isPredicated(B->getParent()) ||
         EnablePredicatedInterleavedMemAccesses)) {

        GroupB = createInterleaveGroup(B, DesB.Stride, DesB.Alignment);
        if (B->mayWriteToMemory())
          StoreGroups.insert(GroupB);

          LoadGroups.insert(GroupB);
    for (auto AI = std::next(BI); AI != E; ++AI) {

      StrideDescriptor DesA = AI->second;

      // Our code motion strategy implies that we can't have dependences
      // between accesses in an interleaved group and other accesses located
      // between the first and last member of the group. Note that this also
      // means that a group can't have more than one member at a given offset.
      // The accesses in a group can have dependences with other accesses, but
      // we must ensure we don't extend the boundaries of the group such that
      // we encompass those dependent accesses.
      //
      // For example, assume we have the sequence of accesses shown below in a
      // stride-2 loop:
      //
      //  (1, 2) is a group | A[i]   = a; // (1)
      //                    | A[i-1] = b; // (2) |
      //                      A[i-3] = c; // (3)
      //                      A[i]   = d; // (4) | (2, 4) is not a group
      //
      // Because accesses (2) and (3) are dependent, we can group (2) with (1)
      // but not with (4). If we did, the dependent access (3) would be within
      // the boundaries of the (2, 4) group.

        if (MemberOfGroupB && !canReorderMemAccessesForInterleavedGroups(
                                  A, &*AccessStrideInfo.find(MemberOfGroupB)))
          return MemberOfGroupB;

      // If A is a load, dependencies are tolerable, there's nothing to do here.
      // If both A and B belong to the same (store) group, they are independent,
      // even if dependencies have not been recorded.
      // If both GroupA and GroupB are null, there's nothing to do here.
      if (A->mayWriteToMemory() && GroupA != GroupB) {

        // If GroupB is a load group, we have to compare AI against all
        // members of GroupB because if any load within GroupB has a dependency
        // on AI, we need to mark GroupB as complete and also release the
        // store GroupA (if A belongs to one). The former prevents incorrect
        // hoisting of load B above store A while the latter prevents incorrect
        // sinking of store A below load B.
        if (GroupB && LoadGroups.contains(GroupB))
          DependentInst = DependentMember(GroupB, &*AI);
        else if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI))
        if (DependentInst) {
          // A has a store dependence on B (or on some load within GroupB) and
          // is part of a store group. Release A's group to prevent illegal
          // sinking of A below B. A will then be free to form another group
          // with instructions that precede it.
          if (GroupA && StoreGroups.contains(GroupA)) {
                              "dependence between "
                       << *A << " and " << *DependentInst << '\n');
            StoreGroups.remove(GroupA);
            releaseGroup(GroupA);

          // If B is a load and part of an interleave group, no earlier loads
          // can be added to B's interleave group, because this would mean the
          // DependentInst would move across store A. Mark the interleave group
          // as complete.
          if (GroupB && LoadGroups.contains(GroupB)) {
                              << " as complete.\n");
            CompletedLoadGroups.insert(GroupB);

      if (CompletedLoadGroups.contains(GroupB)) {
        // Skip trying to add A to B, continue to look for other conflicting A's
        // in groups to be released.

      // At this point, we've checked for illegal code motion. If either A or B
      // isn't strided, there's nothing left to do.
      if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))

      // Ignore A if it's already in a group or isn't the same kind of memory
      // operation as B.
      // Note that mayReadFromMemory() isn't mutually exclusive to
      // mayWriteToMemory in the case of atomic loads. We shouldn't see those
      // here, canVectorizeMemory() should have returned false - except for the
      // case we asked for optimization remarks.
          (A->mayReadFromMemory() != B->mayReadFromMemory()) ||
          (A->mayWriteToMemory() != B->mayWriteToMemory()))

      // Check rules 1 and 2. Ignore A if its stride or size is different from
      // that of B.
      if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)

      // Ignore A if the memory object of A and B don't belong to the same
      // address space.

      // Calculate the distance from A to B.
          PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));

      // Check rule 3. Ignore A if its distance to B is not a multiple of the
      // size.
      if (DistanceToB % static_cast<int64_t>(DesB.Size))

      // All members of a predicated interleave-group must have the same
      // predicate, and currently must reside in the same BB.
      if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
          (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))

      // The index of A is the index of B plus A's distance to B in multiples
      // of the size.
          GroupB->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);

      // Try to insert A into B's group.
                          << " into the interleave group with" << *B

        InterleaveGroupMap[A] = GroupB;

        // Set the first load in program order as the insert position.
        if (A->mayReadFromMemory())

    } // Iteration over A accesses.
  }   // Iteration over B accesses.
  auto InvalidateGroupIfMemberMayWrap = [&](InterleaveGroup<Instruction> *Group,
                                            int Index,
                                            const char *FirstOrLast) -> bool {

    assert(Member && "Group member does not exist");

    if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides,
                     /*Assume=*/false, /*ShouldCheckWrap=*/true)

    LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
                      << FirstOrLast
                      << " group member potentially pointer-wrapping.\n");
    releaseGroup(Group);

  // Remove interleaved groups with gaps whose memory
  // accesses may wrap around. We have to revisit the getPtrStride analysis,
  // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
  // not check wrapping (see documentation there).
  // FORNOW we use Assume=false;
  // TODO: Change to Assume=true but making sure we don't exceed the threshold
  // of runtime SCEV assumptions checks (thereby potentially failing to
  // vectorize altogether).
  // Additional optional optimizations:
  // TODO: If we are peeling the loop and we know that the first pointer doesn't
  // wrap then we can deduce that all pointers in the group don't wrap.
  // This means that we can forcefully peel the loop in order to only have to
  // check the first pointer for no-wrap. When we'll change to use Assume=true
  // we'll only need at most one runtime check per interleaved group.
  for (auto *Group : LoadGroups) {
    // Case 1: A full group. Can skip the checks; For full groups, if the wide
    // load would wrap around the address space we would do a memory access at
    // nullptr even without the transformation.

    // Case 2: If first and last members of the group don't wrap this implies
    // that all the pointers in the group don't wrap.
    // So we check only group member 0 (which is always guaranteed to exist),
    // and group member Factor - 1; If the latter doesn't exist we rely on
    // peeling (if it is a non-reversed access -- see Case 3).
    if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))

    InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1, "last");

    // Case 3: A non-reversed interleaved load group with gaps: We need
    // to execute at least one scalar epilogue iteration. This will ensure
    // we don't speculatively access memory out-of-bounds. We only need
    // to look for a member at index factor - 1, since every group must have
    // a member at index zero.
      dbgs() << "LV: Invalidate candidate interleaved group due to "
                "a reverse access with gaps.\n");
      releaseGroup(Group);

    dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
    RequiresScalarEpilogue = true;
  for (auto *Group : StoreGroups) {
    // Case 1: A full group. Can skip the checks; For full groups, if the wide
    // store would wrap around the address space we would do a memory access at
    // nullptr even without the transformation.

    // Interleave-store-group with gaps is implemented using masked wide store.
    // Remove interleaved store groups with gaps if
    // masked-interleaved-accesses are not enabled by the target.
    if (!EnablePredicatedInterleavedMemAccesses) {
        dbgs() << "LV: Invalidate candidate interleaved store group due "
      releaseGroup(Group);

    // Case 2: If first and last members of the group don't wrap this implies
    // that all the pointers in the group don't wrap.
    // So we check only group member 0 (which is always guaranteed to exist),
    // and the last group member. Case 3 (scalar epilog) is not relevant for
    // stores with gaps, which are implemented with masked-store (rather than
    // speculative access, as in loads).
    if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))

    for (int Index = Group->getFactor() - 1; Index > 0; Index--)

        InvalidateGroupIfMemberMayWrap(Group, Index, "last");

  // If no group had triggered the requirement to create an epilogue loop,
  // there is nothing to do.

  // Release groups requiring scalar epilogues. Note that this also removes them
  // from InterleaveGroups.
  bool ReleasedGroup = InterleaveGroups.remove_if([&](auto *Group) {
    if (!Group->requiresScalarEpilogue())

        << "LV: Invalidate candidate interleaved group due to gaps that "
           "require a scalar epilogue (not allowed under optsize) and cannot "
           "be masked (not enabled). \n");
    releaseGroupWithoutRemovingFromSet(Group);

  assert(ReleasedGroup && "At least one group must be invalidated, as a "
                          "scalar epilogue was required");
  (void)ReleasedGroup;
  RequiresScalarEpilogue = false;

template <typename InstT>