//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableSpillVGPRToAGPR(
    "amdgpu-spill-vgpr-to-agpr",
    cl::desc("Enable spilling VGPRs to AGPRs"),
    cl::ReallyHidden, cl::init(true));
// Find a register matching \p RC from \p LiveUnits which is unused and
// available throughout the function. On failure, returns AMDGPU::NoRegister.
// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
// MCRegisters. This should reduce the number of iterations and avoid redundant
// checking.
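// A minimal sketch of what such a lookup can look like (illustrative only;
// the name findUnusedRegisterSketch and the exact set of checks are
// assumptions, not necessarily the implementation elided here): walk the
// class and return the first register that is allocatable, unreferenced in
// the function, and fully available.
static MCRegister findUnusedRegisterSketch(MachineRegisterInfo &MRI,
                                           const LiveRegUnits &LiveUnits,
                                           const TargetRegisterClass &RC) {
  for (MCRegister Reg : RC) {
    if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
        LiveUnits.available(Reg))
      return Reg;
  }
  return MCRegister(); // AMDGPU::NoRegister
}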
// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(
    MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
    const TargetRegisterClass &RC, bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveUnits.addReg(CSRegs[i]);

  // We are looking for a register that can be used throughout the entire
  // function, so any use is unacceptable.
  // ...
}
/// Query target location for spilling SGPRs
/// \p IncludeScratchCopy : Also look for free scratch SGPRs
static void getVGPRSpillLaneOrTempRegister(
    MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
    const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
    bool IncludeScratchCopy = true) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  // ...
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);

  // We need to save and restore the given SGPR.

  Register ScratchSGPR;
  // 1: Try to save the given register into an unused scratch SGPR. The
  // LiveUnits should have all the callee saved registers marked as used. For
  // certain cases we skip copy to scratch SGPR.
  if (IncludeScratchCopy)
    ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);

  if (!ScratchSGPR) {
    int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
                                         TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() &&
        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
                                         /*IsPrologEpilog=*/true)) {
      // 2: There's no free lane to spill, and no free register to save the
      // SGPR, so we're forced to take another VGPR to use for the spill.
      // ...
    } else {
      // Remove dead <FI> index.
      FrameInfo.RemoveStackObject(FI);
      // 3: If all else fails, spill the register to memory.
      FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
      // ...
    }
  } else {
    // ...
    LiveUnits.addReg(ScratchSGPR);
  }
}
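// The three cases above amount to a save-kind decision that later code
// dispatches on. A schematic sketch of the cascade on plain enums (the names
// here are illustrative, not the exact enumerators used by the builder
// below):
enum class SGPRSaveKindSketch { CopyToScratchSGPR, SpillToVGPRLane, SpillToMem };

static SGPRSaveKindSketch exampleChooseSaveKind(bool HasScratchSGPR,
                                                bool HasFreeVGPRLane) {
  if (HasScratchSGPR)  // 1: cheapest, a plain SGPR-to-SGPR copy.
    return SGPRSaveKindSketch::CopyToScratchSGPR;
  if (HasFreeVGPRLane) // 2: write the SGPR into a lane of a VGPR.
    return SGPRSaveKindSketch::SpillToVGPRLane;
  return SGPRSaveKindSketch::SpillToMem; // 3: last resort, scratch memory.
}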
// We need to specially emit stack operations here because a different frame
// register is used than in the rest of the function, as getFrameRegister would
// use.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LiveRegUnits &LiveUnits, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             Register SpillReg, int FI, Register FrameReg,
                             int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  // ...
  LiveUnits.addReg(SpillReg);
  bool IsKill = !MBB.isLiveIn(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
  // ...
}
static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LiveRegUnits &LiveUnits, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, Register SpillReg, int FI,
                               Register FrameReg, int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  // ...
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
}
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  // ...
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);
  // ...
  const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
  // ...
  MBB.addLiveIn(GitPtrLo);
}

static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
                          const SIMachineFunctionInfo *FuncInfo,
                          MachineFunction &MF, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveUnits.empty()) {
    // ...
  }
}
// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
// BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
class PrologEpilogSGPRSpillBuilder {
  // ...
  unsigned EltSize = 4;

  void saveToMemory(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);
    Register TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    // ...
    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SubReg);
      buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
                       FI, FrameReg, DwordOff);
      DwordOff += 4;
    }
  }

  void saveToVGPRLane(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));
    // ...
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
              Spill[I].VGPR)
          .addReg(SubReg)
          .addImm(Spill[I].Lane)
          .addReg(Spill[I].VGPR, RegState::Undef);
    }
  }

  void copyToScratchSGPR(Register DstReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(SuperReg);
  }

  void restoreFromMemory(const int FI) {
    // ...
    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
    Register TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    // ...
    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
                         TmpVGPR, FI, FrameReg, DwordOff);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
          .addReg(TmpVGPR, RegState::Kill);
      DwordOff += 4;
    }
  }

  void restoreFromVGPRLane(const int FI) {
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
          .addReg(Spill[I].VGPR)
          .addImm(Spill[I].Lane);
    }
  }

  void copyFromScratchSGPR(Register SrcReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
        .addReg(SrcReg);
  }

public:
  PrologEpilogSGPRSpillBuilder(Register Reg,
                               const PrologEpilogSGPRSaveRestoreInfo SI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, const SIInstrInfo *TII,
                               const SIRegisterInfo &TRI,
                               LiveRegUnits &LiveUnits, Register FrameReg)
      : MI(MI), MBB(MBB), MF(*MBB.getParent()),
        ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
        // ...
        SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
        FrameReg(FrameReg) {
    // ...
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  }

  void save() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return saveToMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return saveToVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyToScratchSGPR(SI.getReg());
    }
  }

  void restore() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return restoreFromMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return restoreFromVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyFromScratchSGPR(SI.getReg());
    }
  }
};
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.

  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  // ...
  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LiveRegUnits LiveUnits;
    // ...

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      // ...
      if (MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    // ...
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    // ...
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    // ...
        .addImm(EncodedOffset) // offset
    // ...

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
                   .addReg(FlatScrInitHi)
                   .addImm(0xffff);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
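    // Illustrative sketch on plain integers (not part of this file): the base
    // address occupies bits [47:0] of the descriptor, so the S_AND_B32 keeps
    // only the low 16 bits of the high dword:
    //   uint64_t exampleMaskTo48Bits(uint32_t Lo, uint32_t Hi) {
    //     Hi &= 0xffff; // keep bits [47:32], clear the flag bits above
    //     return (uint64_t(Hi) << 32) | Lo;
    //   }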
  } else {
    // ...
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
          .addReg(FlatScrInitLo)
          .addReg(ScratchWaveOffsetReg);
      // ...
      using namespace AMDGPU::Hwreg;
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitLo)
          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitHi)
          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
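      // Sketch of the 16-bit hwreg operand packing, assuming the usual SIMM16
      // layout (an assumption about HwregEncoding::encode, not a quote of it):
      //   uint16_t exampleEncodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
      //     // ID in bits [5:0], OFFSET in [10:6], WIDTH-1 in [15:11].
      //     return Id | (Offset << 6) | ((Width - 1) << 11);
      //   }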
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
    // ...
    return;
  }

  // Copy the size in bytes.
  // ...

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitLo, RegState::Kill)
      .addImm(8);
}
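// Illustrative sketch (not part of the original file): FLAT_SCR_HI is
// programmed in 256-byte granules, which is exactly what the S_LSHR_B32 by 8
// above computes.
static uint32_t exampleTo256ByteUnits(uint32_t ByteOffset) {
  return ByteOffset >> 8;
}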
// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
  // ...
}
// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.

  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      // ...
      return Reg;
    }
  }

  return ScratchRsrcReg;
}
static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}
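// Sketch of why the factor exists (illustrative): without flat scratch, MUBUF
// scratch accesses are swizzled per lane, so the wave-level SGPR offsets
// (SP/FP) are expressed in units of bytes-per-lane times the wavefront size.
static uint64_t exampleWaveLevelOffset(uint64_t PerLaneBytes,
                                       unsigned WavefrontSize) {
  return PerLaneBytes * WavefrontSize; // e.g. 16 B/lane * 64 lanes = 1024
}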
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  // ...

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB)
        OtherBB.addLiveIn(ScratchRsrcReg);
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found is clobbering with
  // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
  // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
  // wave offset to a free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        // ...
        break;
      }
    }

    // FIXME: We can spill incoming arguments and restore at the end of the
    // prolog.
    if (!ScratchWaveOffsetReg)
      report_fatal_error(
          "could not find temporary scratch offset register in prolog");
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  // ...

  // We need to check if we're on a compute queue - if we are, then the CWSR
  // trap handler may need to store some VGPRs on the stack. The first VGPR
  // block is saved separately, so we only need to allocate space for any
  // additional VGPR blocks used. For now, we will make sure there's enough
  // room for the theoretical maximum number of VGPRs that can be allocated.
  // FIXME: Figure out if the shader uses fewer VGPRs in practice.
  // ...

  // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
  // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
  // SCC, so we need to check for 0 manually.
  // ...

  // If at least one of the constants can be inlined, then we can use
  // s_cselect. Otherwise, use a mov and cmovk. (The inlinability check uses
  // ST.hasInv2PiInlineImm().)
  // ...

  bool NeedsFlatScratchInit =
      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit)
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);

  if (ScratchRsrcReg)
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
}
// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
  // ...

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    // ...
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
        .addReg(Rsrc01)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
        .addMemOperand(MMO);

    // The driver will always set the SRD for wave 64 (bits 118:117 of
    // descriptor / bits 22:21 of third sub-reg will be 0b11)
    // If the shader is actually wave32 we have to modify the const_index_stride
    // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
    // reason the driver does this is that there can be cases where it presents
    // 2 shaders with different wave size (e.g. VsFs).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
    if (ST.isWave32()) {
      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
          .addImm(21)
          .addReg(Rsrc03);
    }
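    // Sketch of the field update on plain integers (illustrative): clearing
    // bit 21 turns CONST_INDEX_STRIDE (bits 22:21) from 0b11 (stride 64) into
    // 0b10 (stride 32), which is what the S_BITSET0_B32 above does in place:
    //   uint32_t exampleSetWave32IndexStride(uint32_t Rsrc3) {
    //     return Rsrc3 & ~(1u << 21);
    //   }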
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
        // ...
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
        // ...
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
      // ...
    }
    // ...
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      // ...
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
                  .addReg(ScratchRsrcSub1)
                  .addImm(0)
                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
}
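// Sketch of the two-instruction 64-bit add on plain integers (illustrative):
// S_ADD_U32 produces the low half and a carry in SCC; S_ADDC_U32 folds the
// carry into the high half. Because base + wave offset stays within the
// 48-bit address space, the carry can never propagate out of bit 47 into the
// descriptor's flag bits.
static uint64_t exampleAdd64ViaHalves(uint32_t Lo, uint32_t Hi, uint32_t Off) {
  uint32_t NewLo = Lo + Off;
  uint32_t Carry = NewLo < Lo ? 1 : 0; // SCC
  uint32_t NewHi = Hi + Carry;         // S_ADDC_U32 consumes SCC
  return (static_cast<uint64_t>(NewHi) << 32) | NewLo;
}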
// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. It returns the saved exec.
static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const DebugLoc &DL, bool IsProlog,
                                     bool EnableInactiveLanes) {
  Register ScratchExecCopy;
  // ...
  if (FuncInfo->isWholeWaveFunction()) {
    // Whole wave functions already have a copy of the original EXEC mask that
    // we can use.
    assert(IsProlog && "Epilog should look at return, not setup");
    ScratchExecCopy =
        TII->getWholeWaveFunctionSetup(MF)->getOperand(0).getReg();
    assert(ScratchExecCopy && "Couldn't find copy of EXEC");
  } else {
    ScratchExecCopy = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, *TRI.getWaveMaskRegClass());
    if (!ScratchExecCopy)
      report_fatal_error("failed to find free scratch register");
  }

  LiveUnits.addReg(ScratchExecCopy);

  const unsigned SaveExecOpc =
      ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
                                           : AMDGPU::S_OR_SAVEEXEC_B32)
                    : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
                                           : AMDGPU::S_OR_SAVEEXEC_B64);
  // ...
  return ScratchExecCopy;
}
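// Sketch of the save-exec semantics on plain masks (illustrative): both forms
// first copy EXEC aside; with an all-ones source, OR_SAVEEXEC then activates
// every lane, while XOR_SAVEEXEC flips EXEC so that exactly the previously
// inactive lanes run.
static uint64_t exampleSaveExec(uint64_t &Exec, bool EnableInactiveLanes) {
  uint64_t Saved = Exec;              // copied into the scratch SGPR
  Exec = EnableInactiveLanes ? ~Saved // -1 ^ exec (XOR_SAVEEXEC)
                             : ~0ull; // -1 | exec (OR_SAVEEXEC)
  return Saved;
}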
void SIFrameLowering::emitCSRSpillStores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  // ...

  // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
  // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
  // might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ true, /*EnableInactiveLanes*/ true);

  auto StoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                           VGPR, FI, FrameReg);
        }
      };

  for (const Register Reg : make_first_range(WWMScratchRegs)) {
    if (!MRI.isReserved(Reg)) {
      MRI.addLiveIn(Reg);
      MBB.addLiveIn(Reg);
    }
  }
  StoreWWMRegisters(WWMScratchRegs);

  auto EnableAllLanes = [&]() {
    // ...
  };

  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      EnableAllLanes();
    } else {
      ScratchExecCopy =
          buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
                               /*EnableInactiveLanes*/ false);
    }
  }

  StoreWWMRegisters(WWMCalleeSavedRegs);
  if (FuncInfo->isWholeWaveFunction()) {
    // If we have already saved some WWM CSR registers, then the EXEC is
    // already -1 and we don't need to do anything else. Otherwise, set EXEC
    // to -1 here.
    if (!ScratchExecCopy)
      buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
                           /*EnableInactiveLanes*/ true);
    else if (WWMCalleeSavedRegs.empty())
      EnableAllLanes();
  } else if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    // ...
    LiveUnits.addReg(ScratchExecCopy);
  }

  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handle FP spill:
    // Skip if FP is saved to a scratch SGPR, the save has already been
    // emitted. Otherwise, FP has been moved to a temporary register and
    // spill it instead.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.save();
  }

  // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
  // such scratch registers live throughout the function.
  SmallVector<Register, 1> ScratchSGPRs;
  FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
  if (!ScratchSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (Register Reg : ScratchSGPRs)
        MBB.addLiveIn(Reg);
      MBB.sortUniqueLiveIns();
    }
    if (!LiveUnits.empty()) {
      for (Register Reg : ScratchSGPRs)
        LiveUnits.addReg(Reg);
    }
  }
}
void SIFrameLowering::emitCSRSpillRestores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  // ...
  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handle FP restore:
    // Skip if FP needs to be restored from the scratch SGPR. Otherwise,
    // restore the FP value to a temporary register. The frame pointer should
    // be overwritten only at the end when all other spills are restored from
    // the current frame.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.restore();
  }

  // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
  // scratch registers. However, restore all lanes of callee-saved VGPRs. Due
  // to this, we might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  auto RestoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                             VGPR, FI, FrameReg);
        }
      };

  if (FuncInfo->isWholeWaveFunction()) {
    // For whole wave functions, the EXEC is already -1 at this point.
    // Therefore, we can restore the CSR WWM registers right away.
    RestoreWWMRegisters(WWMCalleeSavedRegs);

    // The original EXEC is the first operand of the return instruction.
    MachineInstr &Return = *MBBI;
    unsigned Opcode = Return.getOpcode();
    switch (Opcode) {
    case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
      Opcode = AMDGPU::SI_RETURN;
      break;
    case AMDGPU::SI_TCRETURN_GFX_WholeWave:
      Opcode = AMDGPU::SI_TCRETURN_GFX;
      break;
    default:
      llvm_unreachable("Unexpected return inst");
    }
    Register OrigExec = Return.getOperand(0).getReg();

    if (!WWMScratchRegs.empty()) {
      // ...
      RestoreWWMRegisters(WWMScratchRegs);
    }

    // Restore original EXEC.
    // ...

    // Drop the first operand and update the opcode.
    Return.removeOperand(0);
    Return.setDesc(TII->get(Opcode));
    return;
  }

  if (!WWMScratchRegs.empty()) {
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog=*/false, /*EnableInactiveLanes=*/true);
  }
  RestoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      // ...
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ false,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  RestoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    // ...
  }
}
void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  // ...

  // DebugLoc must be unknown since the first instruction with DebugLoc is used
  // to determine the end of the prologue.
  DebugLoc DL;

  // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
  // are free to set one up if they need it.
  // ...
  assert(StackPtrReg != AMDGPU::SP_REG);

  // ...
  if (TRI.hasStackRealignment(MF))
    HasFP = true;

  Register FramePtrRegScratchCopy;
  if (!HasFP && !hasFP(MF)) {
    // Emit the CSR spill stores with SP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
                       FuncInfo->isChainFunction() ? Register() : StackPtrReg,
                       FramePtrRegScratchCopy);
  } else {
    // CSR spill stores will use FP as base register.
    Register SGPRForFPSaveRestoreCopy =
        FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
    // ...
    if (SGPRForFPSaveRestoreCopy) {
      // Copy FP to the scratch register now and emit the CFI entry. It avoids
      // the extra FP copy needed in the other two cases when FP is spilled to
      // memory or to a VGPR lane.
      PrologEpilogSGPRSpillBuilder SB(
          FramePtrReg,
          FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
          DL, TII, TRI, LiveUnits, FramePtrReg);
      SB.save();
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      // Copy FP into a new scratch register so that its previous value can be
      // spilled after setting up the new frame.
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
      // ...
    }
  }

  if (HasFP) {
    const unsigned Alignment = MFI.getMaxAlign().value();
    RoundedSize += Alignment;
    if (LiveUnits.empty()) {
      LiveUnits.init(TRI);
      LiveUnits.addLiveIns(MBB);
    }

    // s_add_i32 s33, s32, NumBytes
    // s_and_b32 s33, s33, 0b111...0000
    // ...
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
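    // Illustrative sketch of the rounding above on plain integers (with the
    // wave-level scale factor folded into Alignment for brevity):
    //   uint32_t exampleAlignFP(uint32_t SP, uint32_t Alignment) {
    //     return (SP + Alignment - 1) & ~(Alignment - 1); // round up
    //   }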
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    // If FP is used, emit the CSR spills with FP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                       FramePtrRegScratchCopy);
    if (FramePtrRegScratchCopy)
      LiveUnits.removeReg(FramePtrRegScratchCopy);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
                   .addReg(StackPtrReg)
                   .addImm(RoundedSize * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
  assert((!HasFP || FPSaved) &&
         "Needed to save FP but didn't save it anywhere");

  // If we allow spilling to AGPRs we may have saved FP but then spill
  // everything into AGPRs instead of the stack.
  assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
         "Saved FP but didn't need it");

  bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
  assert((!HasBP || BPSaved) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || !BPSaved) && "Saved BP but didn't need it");

  if (FuncInfo->isWholeWaveFunction()) {
    // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose.
    TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
  }
}
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  // ...

  // Get the insert location for the epilogue. If there were no terminators in
  // the block, get the last instruction.
  MachineBasicBlock::iterator MBBI = MBB.end();
  DebugLoc DL;
  if (!MBB.empty()) {
    MBBI = MBB.getLastNonDebugInstr();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    MBBI = MBB.getFirstTerminator();
  }

  // ...
  if (RoundedSize != 0) {
    if (TRI.hasBasePointer(MF)) {
      // ...
    } else if (hasFP(MF)) {
      // ...
    }
  }

  Register FramePtrRegScratchCopy;
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
  if (FPSaved) {
    // CSR spill restores should use FP as base register. If
    // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
    // into a new scratch register and copy to FP later when other registers
    // are restored from the current stack frame.
    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
    if (SGPRForFPSaveRestoreCopy) {
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
    }

    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                         FramePtrRegScratchCopy);

    // Insert the copy to restore FP.
    Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
                                               : FramePtrRegScratchCopy;
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(SrcReg);
    if (SGPRForFPSaveRestoreCopy)
      LiveUnits.removeReg(SGPRForFPSaveRestoreCopy);
  } else {
    // Insert the CSR spill restores with SP as the base register.
    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
                         FramePtrRegScratchCopy);
  }
}
void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF, RegScavenger *RS) const {
  // ...

  const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
                               && EnableSpillVGPRToAGPR;

  if (SpillVGPRToAGPR) {
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
    BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);

    bool SeenDbgInstr = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
        int FrameIndex;
        if (MI.isDebugInstr())
          SeenDbgInstr = true;

        if (TII->isVGPRSpill(MI)) {
          // Try to eliminate stack used by VGPR spills before frame
          // finalization.
          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::vaddr);
          int FI = MI.getOperand(FIOp).getIndex();
          Register VReg =
              TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                TRI->isAGPR(MRI, VReg))) {
            assert(RS != nullptr);
            RS->enterBasicBlockEnd(MBB);
            RS->backward(std::next(MI.getIterator()));
            TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
            SpillFIs.set(FI);
            continue;
          }
        } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
                   TII->isLoadFromStackSlot(MI, FrameIndex))
          if (!MFI.isFixedObjectIndex(FrameIndex))
            NonVGPRSpillFIs.set(FrameIndex);
      }
    }

    // Stack slot coloring may assign different objects to the same stack slot.
    // If not, then the VGPR to AGPR spill slot is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    for (MachineBasicBlock &MBB : MF) {
      // ...
      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead, update it with the
        // correct register value. But not sure the register value alone is
        // adequate to lower the DIExpression. It should be worked out later.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue()) {
            uint32_t StackOperandIdx = MI.isDebugValueList() ? 2 : 0;
            if (MI.getOperand(StackOperandIdx).isFI() &&
                !MFI.isFixedObjectIndex(
                    MI.getOperand(StackOperandIdx).getIndex()) &&
                SpillFIs[MI.getOperand(StackOperandIdx).getIndex()]) {
              MI.getOperand(StackOperandIdx)
                  .ChangeToRegister(Register(), false /*isDef*/);
            }
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateSpillStackObject(4, Align(4)));
    }
  }
}
void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  // ...
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we had initially reserved the highest available VGPR for AGPR
    // copy. Now since we are done with RA, check if there exists an unused
    // VGPR which is lower than the earlier reserved VGPR before RA. If one
    // exists, use it for the AGPR copy instead of the one reserved before RA.
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Reserve this newly identified VGPR (for AGPR copy);
      // reserved registers should already be frozen at this point
      // so we can avoid calling MRI.freezeReservedRegs and just use
      // MRI.reserveReg.
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.reserveReg(UnusedLowVGPR, TRI);
    }
  }

  // We initially reserved the highest available SGPR pair for long branches;
  // now, after RA, we shift down to a lower unused one if one exists.
  Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
  Register UnusedLowSGPR =
      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
  // If LongBranchReservedReg is null then we didn't find a long branch
  // and never reserved a register to begin with so there is nothing to
  // shift down. Then if UnusedLowSGPR is null, there isn't an available
  // lower register to use so just keep the original one we set.
  if (LongBranchReservedReg && UnusedLowSGPR) {
    FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
    MRI.reserveReg(UnusedLowSGPR, TRI);
  }
}
// The special SGPR spills like the one needed for FP, BP or any reserved
// registers delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs,
    bool NeedExecCopyReservedReg) const {
  // ...

  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveUnits.addReg(CSRegs[I]);

  const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();

  Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
  if (NeedExecCopyReservedReg ||
      (ReservedRegForExecCopy &&
       MRI.isPhysRegUsed(ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
    MRI.reserveReg(ReservedRegForExecCopy, TRI);
    Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
    if (UnusedScratchReg) {
      // If found any unused scratch SGPR, reserve the register itself for Exec
      // copy and there is no need for any spills in that case.
      MFI->setSGPRForEXECCopy(UnusedScratchReg);
      MRI.replaceRegWith(ReservedRegForExecCopy, UnusedScratchReg);
      LiveUnits.addReg(UnusedScratchReg);
    } else {
      // Needs spill.
      assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
             "Re-reserving spill slot for EXEC copy register");
      getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedRegForExecCopy, RC,
                                     /*IncludeScratchCopy=*/false);
    }
  } else if (ReservedRegForExecCopy) {
    // Reset it at this point. There are no whole-wave copies and spills
    // encountered.
    MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
  }

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
  }

  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
  }
}
// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If this is a function with the amdgpu_cs_chain[_preserve] calling
  // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
  // we don't need to save and restore anything.
  if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
    return;

  // ...

  bool NeedExecCopyReservedReg = false;

  MachineInstr *ReturnMI = nullptr;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // TODO: Walking through all MBBs here would be a bad heuristic. Better
      // handle them elsewhere.
      if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
        NeedExecCopyReservedReg = true;
      else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
               MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
               MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
               (MFI->isChainFunction() &&
                TII->isChainCallOpcode(MI.getOpcode()))) {
        // We expect all return to be the same size.
        assert(!ReturnMI ||
               (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
                count_if(ReturnMI->operands(),
                         [](auto Op) { return Op.isReg(); })));
        ReturnMI = &MI;
      }
    }
  }

  SmallVector<Register> SortedWWMVGPRs;
  for (Register Reg : MFI->getWWMReservedRegs()) {
    // The shift-back is needed only for the VGPRs used for SGPR spills and
    // they are of 32-bit size. SIPreAllocateWWMRegs pass can add tuples into
    // WWM reserved registers.
    const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
    if (TRI->getRegSizeInBits(*RC) != 32)
      continue;
    SortedWWMVGPRs.push_back(Reg);
  }

  sort(SortedWWMVGPRs, std::greater<Register>());
  MFI->shiftWwmVGPRsToLowestRange(MF, SortedWWMVGPRs, SavedVGPRs);

  if (MFI->isWholeWaveFunction()) {
    // In practice, all the VGPRs are WWM registers, and we will need to save
    // at least their inactive lanes. Add them to WWMReservedRegs.
    assert(!NeedExecCopyReservedReg &&
           "Whole wave functions can use the reg mapped for their i1 argument");

    // FIXME: Be more efficient!
    unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
    for (MCRegister Reg :
         AMDGPU::VGPR_32RegClass.getRegisters().take_front(NumArchVGPRs))
      if (MF.getRegInfo().isPhysRegModified(Reg)) {
        MFI->reserveWWMRegister(Reg);
        MF.begin()->addLiveIn(Reg);
      }
    MF.begin()->sortUniqueLiveIns();
  }

  // Remove any VGPRs used in the return value because these do not need to be
  // saved. This prevents CSR restore from clobbering return VGPRs.
  if (ReturnMI) {
    for (auto &Op : ReturnMI->operands()) {
      if (Op.isReg())
        SavedVGPRs.reset(Op.getReg());
    }
  }

  // Create the stack objects for WWM registers now.
  for (Register Reg : MFI->getWWMReservedRegs()) {
    const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
    MFI->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
                          TRI->getSpillAlign(*RC));
  }

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
  // In gfx908 there were no direct AGPR loads and stores, and thus spilling
  // also required a temporary VGPR.
  if (!ST.hasGFX90AInsts())
    SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());

  determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);

  // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
  // allow the default insertion to handle them.
  for (auto &Reg : MFI->getWWMSpills())
    SavedVGPRs.reset(Reg.first);
}
void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  // ...

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());

  const BitVector AllSavedRegs = SavedRegs;
  SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

  // We have to anticipate introducing CSR VGPR spills or spill of caller
  // save VGPR reserved for SGPR spills as we now always create stack entry
  // for it, if we don't have any stack objects already, since we require a FP
  // if there is a call and stack. We will allocate a VGPR for SGPR spills if
  // there are any SGPR spills. Whether they are CSR spills or otherwise.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const bool WillHaveFP =
      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

  // FP will be specially managed like SP.
  if (WillHaveFP || hasFP(MF))
    SavedRegs.reset(MFI->getFrameOffsetReg());

  // Return address use with return instruction is hidden through the SI_RETURN
  // pseudo. Given that and since the IPRA computes actual register usage and
  // does not use CSR list, the clobbering of return address by function calls
  // (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's register
  // usage collection. This will ensure save/restore of return address happens
  // in those scenarios.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  Register RetAddrReg = TRI->getReturnAddressReg(MF);
  if (!MFI->isEntryFunction() &&
      (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
  }
}
static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
                                       const GCNSubtarget &ST,
                                       std::vector<CalleeSavedInfo> &CSI,
                                       unsigned &MinCSFrameIndex,
                                       unsigned &MaxCSFrameIndex) {
  // ...
  assert(std::is_sorted(CSI.begin(), CSI.end(),
                        [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
                          return A.getReg() < B.getReg();
                        }) &&
         "Callee saved registers not sorted");

  auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
    return !CSI.isSpilledToReg() &&
           TRI->getPhysRegBaseClass(CSI.getReg()) ==
               &AMDGPU::VGPR_32RegClass &&
           !FuncInfo->isWWMReservedRegister(CSI.getReg());
  };

  auto CSEnd = CSI.end();
  for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) {
    Register Reg = CSIt->getReg();
    if (!CanUseBlockOps(*CSIt))
      continue;

    // Find all the regs that will fit in a 32-bit mask starting at the current
    // reg and build said mask. It should have 1 for every register that's
    // included, with the current register as the least significant bit.
    uint32_t Mask = 1;
    CSEnd = std::remove_if(
        CSIt + 1, CSEnd, [&](const CalleeSavedInfo &CSI) -> bool {
          if (CanUseBlockOps(CSI) && CSI.getReg() < Reg + 32) {
            Mask |= 1 << (CSI.getReg() - Reg);
            return true;
          }
          return false;
        });

    const TargetRegisterClass *BlockRegClass = TRI->getRegClassForBlockOp(MF);
    Register RegBlock =
        TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, BlockRegClass);
    if (!RegBlock) {
      // We couldn't find a super register for the block. This can happen if
      // the register we started with is too high (e.g. v232 if the maximum is
      // v255). We therefore try to get the last register block and figure out
      // the mask from there.
      Register LastBlockStart =
          AMDGPU::VGPR0 + alignDown(Reg - AMDGPU::VGPR0, 32);
      RegBlock =
          TRI->getMatchingSuperReg(LastBlockStart, AMDGPU::sub0, BlockRegClass);
      assert(RegBlock && TRI->isSubRegister(RegBlock, Reg) &&
             "Couldn't find super register");
      int RegDelta = Reg - LastBlockStart;
      assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta &&
             "Bad shift amount");
      Mask <<= RegDelta;
    }
    // The stack objects can be a bit smaller than the register block if we
    // know some of the high bits of Mask are 0. This may happen often with
    // calling conventions where the caller and callee-saved VGPRs are
    // interleaved at a small boundary (e.g. 8 or 16).
    int UnusedBits = llvm::countl_zero(Mask);
    unsigned BlockSize = TRI->getSpillSize(*BlockRegClass) - UnusedBits * 4;
    int FrameIdx =
        MFI.CreateStackObject(BlockSize, TRI->getSpillAlign(*BlockRegClass),
                              /*isSpillSlot=*/true);
    if ((unsigned)FrameIdx < MinCSFrameIndex)
      MinCSFrameIndex = FrameIdx;
    if ((unsigned)FrameIdx > MaxCSFrameIndex)
      MaxCSFrameIndex = FrameIdx;

    FuncInfo->setMaskForVGPRBlockOps(RegBlock, Mask);

    CSIt->setFrameIdx(FrameIdx);
    CSIt->setReg(RegBlock);
  }
  CSI.erase(CSEnd, CSI.end());
}
bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
    unsigned &MaxCSFrameIndex) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR();

  if (UseVGPRBlocks)
    assignSlotsUsingVGPRBlocks(MF, ST, CSI, MinCSFrameIndex, MaxCSFrameIndex);

  return assignCalleeSavedSpillSlots(MF, TRI, CSI) || UseVGPRBlocks;
}
bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg = RI->getBaseRegister();
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  Register SGPRForBPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
  if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
    return false;

  unsigned NumModifiedRegs = 0;

  if (SGPRForFPSaveRestoreCopy)
    NumModifiedRegs++;
  if (SGPRForBPSaveRestoreCopy)
    NumModifiedRegs++;

  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg.asMCReg() && SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(SGPRForFPSaveRestoreCopy);
      if (--NumModifiedRegs)
        break;
    } else if (CS.getReg() == BasePtrReg.asMCReg() &&
               SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs)
        break;
    }
  }

  return false;
}
bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
    const MachineFunction &MF) const {
  // ...
  uint64_t EstStackSize = MFI.estimateStackSize(MF);
  uint64_t MaxOffset = EstStackSize - 1;

  // We need the emergency stack slots to be allocated in range of the
  // MUBUF/flat scratch immediate offset from the base register, so assign
  // these first at the incoming SP position.
  //
  // TODO: We could try sorting the objects to find a hole in the first bytes
  // rather than allocating as close to possible. This could save a lot of
  // space on frames with alignment requirements.
  if (ST.enableFlatScratch()) {
    // ...
  } else {
    if (TII->isLegalMUBUFImmOffset(MaxOffset))
      return false;
  }

  return true;
}
bool SIFrameLowering::spillCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  if (!ST.useVGPRBlockOpsForCSR())
    return false;
  // ...
  for (const CalleeSavedInfo &CS : CSI) {
    Register Reg = CS.getReg();
    if (!BlockRegClass->contains(Reg) ||
        !FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
      spillCalleeSavedRegister(MBB, MI, CS, TII, TRI);
      continue;
    }

    // Build a scratch block store.
    int FrameIndex = CS.getFrameIdx();
    // ...
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
        // ...
        ;

    // Add the register to the liveins. This is necessary because if any of the
    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
    // then the whole block will be marked as reserved and `updateLiveness`
    // will skip it.
    MBB.addLiveIn(Reg);
  }

  MBB.sortUniqueLiveIns();
  return true;
}
bool SIFrameLowering::restoreCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  if (!ST.useVGPRBlockOpsForCSR())
    return false;
  // ...
  for (const CalleeSavedInfo &CS : CSI) {
    Register Reg = CS.getReg();
    if (!BlockRegClass->contains(Reg) ||
        !FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
      restoreCalleeSavedRegister(MBB, MI, CS, TII, TRI);
      continue;
    }

    // Build a scratch block load.
    int FrameIndex = CS.getFrameIdx();
    MachineMemOperand *MMO = MF->getMachineMemOperand(
        MachinePointerInfo::getFixedStack(*MF, FrameIndex),
        MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
        MFI.getObjectAlign(FrameIndex));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), Reg)
        // ...
        ;

    // Add the register to the liveins. This is necessary because if any of the
    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
    // then the whole block will be marked as reserved and `updateLiveness`
    // will skip it.
    MBB.addLiveIn(Reg);
  }

  MBB.sortUniqueLiveIns();
  return true;
}
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    // ...
    Amount *= getScratchScaleFactor(ST);
    if (IsDestroy)
      Amount = -Amount;
    auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
                   .addReg(SPReg)
                   .addImm(Amount);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("cannot handle call frames with callee pop");
  }

  return MBB.erase(I);
}
/// Returns true if the frame will require a reference to the stack pointer.
///
/// This is the set of conditions common to setting up the stack pointer in a
/// kernel, and for using a frame pointer in a callable function.
///
/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
/// references SP.
static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
  // ...
}

// The FP for kernels is always known 0, so we never really need to setup an
// explicit register for it. However, DisableFramePointerElim will force us to
// use a register for it.
bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry & chain functions we can use an immediate offset in most cases,
  // so the presence of calls doesn't imply we need a distinct frame pointer.
  // ...

  // All offsets are unsigned, so need to be addressed in the same direction
  // as stack growth.

  // FIXME: This function is pretty broken, since it can be called before the
  // frame layout is determined or CSR spills are inserted.
  // ...
}
// This is essentially a reduced version of hasFP for entry functions. Since
// the stack pointer is known 0 on entry to kernels, we never really need an
// FP register. We may need to initialize the stack pointer depending on the
// frame properties, which logically overlaps many of the cases where an
// ordinary function would require an FP.
// Also used for chain functions. While not technically entry functions, chain
// functions may need to set up a stack pointer in some situations.
bool SIFrameLowering::requiresStackPointerReference(
    const MachineFunction &MF) const {
  // Callable functions always require a stack pointer reference.
  assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
          MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
         "only expected to call this for entry points and chain functions");

  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Entry points ordinarily don't need to initialize SP. We have to set it up
  // for callees if there are any. Also note tail calls are impossible/don't
  // make any sense for kernels.
  if (MFI.hasCalls())
    return true;

  // We still need to initialize the SP if we're doing anything weird that
  // references the SP, like variable sized stack objects.
  return frameTriviallyRequiresSP(MFI);
}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static constexpr MCPhysReg FPReg
static constexpr MCPhysReg SPReg
This file declares the machine register scavenger class.
static void buildEpilogRestore(const GCNSubtarget &ST, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &FuncInfo, LiveRegUnits &LiveUnits, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SpillReg, int FI, Register FrameReg, int64_t DwordOff=0)
static cl::opt< bool > EnableSpillVGPRToAGPR("amdgpu-spill-vgpr-to-agpr", cl::desc("Enable spilling VGPRs to AGPRs"), cl::ReallyHidden, cl::init(true))
static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR, const TargetRegisterClass &RC=AMDGPU::SReg_32_XM0_XEXECRegClass, bool IncludeScratchCopy=true)
Query target location for spilling SGPRs IncludeScratchCopy : Also look for free scratch SGPRs.
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, const SIInstrInfo *TII, Register TargetReg)
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI)
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &FuncInfo, LiveRegUnits &LiveUnits, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SpillReg, int FI, Register FrameReg, int64_t DwordOff=0)
static Register buildScratchExecCopy(LiveRegUnits &LiveUnits, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool IsProlog, bool EnableInactiveLanes)
static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI)
Returns true if the frame will require a reference to the stack pointer.
static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, const GCNSubtarget &ST, std::vector< CalleeSavedInfo > &CSI, unsigned &MinCSFrameIndex, unsigned &MaxCSFrameIndex)
static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI, const SIMachineFunctionInfo *FuncInfo, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool IsProlog)
static bool allSGPRSpillsAreDead(const MachineFunction &MF)
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits, const TargetRegisterClass &RC, bool Unused=false)
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI, const LiveRegUnits &LiveUnits, const TargetRegisterClass &RC)
static unsigned getScratchScaleFactor(const GCNSubtarget &ST)
static const int BlockSize
bool isChainFunction() const
bool isEntryFunction() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
bool test(unsigned Idx) const
void clearBitsNotInMask(const uint32_t *Mask, unsigned MaskWords=~0u)
clearBitsNotInMask - Clear a bit in this vector for every '0' bit in Mask.
bool any() const
any - Returns true if any bit is set.
void clearBitsInMask(const uint32_t *Mask, unsigned MaskWords=~0u)
clearBitsInMask - Clear any bits in this vector that are set in Mask.
iterator_range< const_set_bits_iterator > set_bits() const
bool empty() const
empty - Tests whether there are no bits in this bitvector.
The CalleeSavedInfo class tracks the information need to locate where a callee saved register is in t...
MCRegister getReg() const
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
bool hasImplicitBufferPtr() const
bool hasFlatScratchInit() const
A set of register units used to track register liveness.
bool available(MCRegister Reg) const
Returns true if no part of physical register Reg is live.
void init(const TargetRegisterInfo &TRI)
Initialize and clear the set.
void addReg(MCRegister Reg)
Adds register units covered by physical register Reg.
LLVM_ABI void stepBackward(const MachineInstr &MI)
Updates liveness when stepping backwards over the instruction MI.
LLVM_ABI void addLiveOuts(const MachineBasicBlock &MBB)
Adds registers living out of block MBB.
void removeReg(MCRegister Reg)
Removes all register units covered by physical register Reg.
bool empty() const
Returns true if the set is empty.
LLVM_ABI void addLiveIns(const MachineBasicBlock &MBB)
Adds registers living into block MBB.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
bool hasVarSizedObjects() const
This method may be called any time after instruction selection is complete to determine if the stack ...
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
bool hasCalls() const
Return true if the current function has any function calls.
bool isFrameAddressTaken() const
This method may be called any time after instruction selection is complete to determine if there is a...
Align getMaxAlign() const
Return the alignment in bytes that this function must be aligned to, which is greater than the defaul...
bool hasPatchPoint() const
This method may be called any time after instruction selection is complete to determine if there is a...
LLVM_ABI int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
bool hasTailCall() const
Returns true if the function contains a tail call.
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasStackMap() const
This method may be called any time after instruction selection is complete to determine if there is a...
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getObjectIndexBegin() const
Return the minimum frame object index.
bool isDeadObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a dead object.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
void setIsDead(bool Val=true)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI const MCPhysReg * getCalleeSavedRegs() const
Returns list of callee saved registers.
void addLiveIn(MCRegister Reg, Register vreg=Register())
addLiveIn - Add the specified register as a live-in.
LLVM_ABI bool isPhysRegModified(MCRegister PhysReg, bool SkipNoReturnDef=false) const
Return true if the specified register is modified in this function.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
PrologEpilogSGPRSpillBuilder(Register Reg, const PrologEpilogSGPRSaveRestoreInfo SI, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, const SIInstrInfo *TII, const SIRegisterInfo &TRI, LiveRegUnits &LiveUnits, Register FrameReg)
Wrapper class representing virtual and physical registers.
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
void determinePrologEpilogSGPRSaves(MachineFunction &MF, BitVector &SavedRegs, bool NeedExecCopyReservedReg) const
StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, Register &FrameReg) const override
getFrameIndexReference - This method should return the base register and offset used to reference a f...
void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS=nullptr) const override
processFunctionBeforeFrameFinalized - This method is called immediately before the specified function...
bool mayReserveScratchForCWSR(const MachineFunction &MF) const
bool allocateScavengingFrameIndexesNearIncomingSP(const MachineFunction &MF) const override
Control the placement of special register scavenging spill slots when allocating a stack frame.
bool requiresStackPointerReference(const MachineFunction &MF) const
void emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS=nullptr) const override
This method determines which of the registers reported by TargetRegisterInfo::getCalleeSavedRegs() sh...
void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, Register FrameReg, Register FramePtrRegScratchCopy) const
bool hasFPImpl(const MachineFunction &MF) const override
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef< CalleeSavedInfo > CSI, const TargetRegisterInfo *TRI) const override
spillCalleeSavedRegisters - Issues instruction(s) to spill all callee saved registers and returns tru...
bool assignCalleeSavedSpillSlots(MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector< CalleeSavedInfo > &CSI) const override
void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS=nullptr) const
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override
void emitCSRSpillRestores(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, Register FrameReg, Register FramePtrRegScratchCopy) const
void processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, RegScavenger *RS=nullptr) const override
processFunctionBeforeFrameIndicesReplaced - This method is called immediately before MO_FrameIndex op...
bool isSupportedStackID(TargetStackID::Value ID) const override
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override
emitProlog/emitEpilog - These methods insert prolog and epilog code into the function.
MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
This method is called during prolog/epilog code insertion to eliminate call frame setup and destroy p...
bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, MutableArrayRef< CalleeSavedInfo > CSI, const TargetRegisterInfo *TRI) const override
restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee saved registers and returns...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
ArrayRef< PrologEpilogSGPRSpill > getPrologEpilogSGPRSpills() const
const WWMSpillsMap & getWWMSpills() const
void getAllScratchSGPRCopyDstRegs(SmallVectorImpl< Register > &Regs) const
ArrayRef< MCPhysReg > getAGPRSpillVGPRs() const
void setSGPRForEXECCopy(Register Reg)
unsigned getNumPreloadedSGPRs() const
void shiftWwmVGPRsToLowestRange(MachineFunction &MF, SmallVectorImpl< Register > &WWMVGPRs, BitVector &SavedVGPRs)
void setMaskForVGPRBlockOps(Register RegisterBlock, uint32_t Mask)
GCNUserSGPRUsageInfo & getUserSGPRInfo()
void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size=4, Align Alignment=Align(4))
Register getLongBranchReservedReg() const
unsigned getDynamicVGPRBlockSize() const
bool hasSpilledVGPRs() const
void setVGPRToAGPRSpillDead(int FrameIndex)
bool isWholeWaveFunction() const
Register getStackPtrOffsetReg() const
bool isStackRealigned() const
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
ArrayRef< MCPhysReg > getVGPRSpillAGPRs() const
int getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI)
uint32_t getMaskForVGPRBlockOps(Register RegisterBlock) const
bool hasMaskForVGPRBlockOps(Register RegisterBlock) const
bool hasPrologEpilogSGPRSpillEntry(Register Reg) const
Register getGITPtrLoReg(const MachineFunction &MF) const
void setVGPRForAGPRCopy(Register NewVGPRForAGPRCopy)
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR)
Reserve AGPRs or VGPRs to support spilling for FrameIndex FI.
void splitWWMSpillRegisters(MachineFunction &MF, SmallVectorImpl< std::pair< Register, int > > &CalleeSavedRegs, SmallVectorImpl< std::pair< Register, int > > &ScratchRegs) const
Register getSGPRForEXECCopy() const
bool isWWMReservedRegister(Register Reg) const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const
Register getVGPRForAGPRCopy() const
bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI, bool SpillToPhysVGPRLane=false, bool IsPrologEpilog=false)
Register getFrameOffsetReg() const
void setLongBranchReservedReg(Register Reg)
void setHasSpilledVGPRs(bool Spill=true)
bool removeDeadFrameIndices(MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs)
If ResetSGPRSpillStackIDs is true, reset the stack ID from sgpr-spill to the default stack.
void setScratchReservedForDynamicVGPRs(unsigned SizeInBytes)
MCRegister getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const
bool checkIndexInPrologEpilogSGPRSpills(int FI) const
const ReservedRegSet & getWWMReservedRegs() const
Register getImplicitBufferPtrUserSGPR() const
const PrologEpilogSGPRSaveRestoreInfo & getPrologEpilogSGPRSaveRestoreInfo(Register Reg) const
void setIsStackRealigned(bool Realigned=true)
unsigned getGITPtrHigh() const
bool hasSpilledSGPRs() const
void addToPrologEpilogSGPRSpills(Register Reg, PrologEpilogSGPRSaveRestoreInfo SI)
Register getScratchSGPRCopyDstReg(Register Reg) const
void setScratchRSrcReg(Register Reg)
void reserveWWMRegister(Register Reg)
Register getFrameRegister(const MachineFunction &MF) const override
const TargetRegisterClass * getRegClassForBlockOp(const MachineFunction &MF) const
void addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, Register BlockReg) const
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
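As a quick illustration of the SmallVector/SmallVectorImpl split (a sketch; collectEvens is hypothetical): APIs accept SmallVectorImpl<T>& so callers may choose any inline capacity 'N'.

#include "llvm/ADT/SmallVector.h"

// Accepting SmallVectorImpl keeps the inline capacity out of the API.
static void collectEvens(const llvm::SmallVectorImpl<int> &In,
                         llvm::SmallVectorImpl<int> &Out) {
  for (int V : In)
    if (V % 2 == 0)
      Out.push_back(V);
}

int main() {
  llvm::SmallVector<int, 8> In = {1, 2, 3, 4};
  llvm::SmallVector<int, 4> Out; // A different 'N' binds to the same API.
  collectEvens(In, Out);
  return Out.size() == 2 ? 0 : 1;
}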
StackOffset holds a fixed and a scalable offset in bytes.
int64_t getFixed() const
Returns the fixed component of the stack.
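A small sketch of StackOffset: it packs a fixed byte offset together with a scalable component (used by scalable-vector targets), and getFixed() reads the fixed part back.

#include "llvm/Support/TypeSize.h"
#include <cassert>

int main() {
  llvm::StackOffset Off = llvm::StackOffset::get(/*Fixed=*/24, /*Scalable=*/0);
  assert(Off.getFixed() == 24);   // Fixed component in bytes.
  assert(Off.getScalable() == 0); // No scalable component here.
  return 0;
}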
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
virtual bool hasReservedCallFrame(const MachineFunction &MF) const
hasReservedCallFrame - Under normal circumstances, when a frame pointer is not required, we reserve argument space for call sites in the function immediately on entry to the current function.
virtual void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS=nullptr) const
This method determines which of the registers reported by TargetRegisterInfo::getCalleeSavedRegs() should actually get saved.
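Overrides of determineCalleeSaves usually start from the generic clobber analysis and then add or remove registers. A sketch of that pattern; the function and the extra register number are purely hypothetical:

#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetFrameLowering.h"

using namespace llvm;

// Sketch only: run the generic analysis, then force one extra save.
void determineCalleeSavesSketch(const TargetFrameLowering &TFL,
                                MachineFunction &MF, BitVector &SavedRegs,
                                RegScavenger *RS) {
  TFL.determineCalleeSaves(MF, SavedRegs, RS); // Marks clobbered CSRs.
  const unsigned HypotheticalReg = 42;         // Stand-in register number.
  SavedRegs.set(HypotheticalReg);              // Save it unconditionally.
}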
void restoreCalleeSavedRegister(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const CalleeSavedInfo &CS, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const
void spillCalleeSavedRegister(MachineBasicBlock &SaveBlock, MachineBasicBlock::iterator MI, const CalleeSavedInfo &CS, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const
spillCalleeSavedRegister - Default implementation for spilling a single callee saved register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligned on entry to a function.
LLVM_ABI bool DisableFramePointerElim(const MachineFunction &MF) const
DisableFramePointerElim - This returns true if frame pointer elimination optimization should be disabled for the given machine function.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDesc objects that represent all of the machine registers that the target has.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset)
Convert ByteOffset to dwords if the subtarget uses dword SMRD immediate offsets.
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ ScalablePredicateVector
initializer< Ty > init(const Ty &Val)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
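The canonical use is deleting instructions mid-walk; a sketch over a MachineBasicBlock (eraseKillPseudos is hypothetical):

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

// The early-inc iterator advances before the loop body runs, so
// eraseFromParent() cannot invalidate the traversal.
static void eraseKillPseudos(llvm::MachineBasicBlock &MBB) {
  for (llvm::MachineInstr &MI : llvm::make_early_inc_range(MBB))
    if (MI.isKill())
      MI.eraseFromParent();
}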
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least, stopping at the first 1.
auto reverse(ContainerTy &&C)
void sort(IteratorTy Start, IteratorTy End)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64-bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a comparator C.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64-bit value.
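These arithmetic helpers compose naturally; a self-contained sketch exercising alignDown, countl_zero, isUInt, and the Hi_32/Lo_32 pair:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  assert(llvm::alignDown(37u, 16u) == 32u); // Largest multiple of 16 <= 37.
  assert(llvm::countl_zero<uint32_t>(1) == 31);
  assert(llvm::isUInt<20>(0xFFFFF));   // 2^20 - 1 fits in 20 bits...
  assert(!llvm::isUInt<20>(0x100000)); // ...but 2^20 does not.
  uint64_t V = 0x123456789ABCDEF0ULL;
  assert(llvm::Hi_32(V) == 0x12345678u && llvm::Lo_32(V) == 0x9ABCDEF0u);
  return 0;
}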
@ And
Bitwise or logical AND of integers.
unsigned getKillRegState(bool B)
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
uint64_t alignTo(uint64_t Size, Align A)
Returns the smallest multiple of A that is greater than or equal to Size bytes.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
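The range helpers above (sort, is_sorted, count_if, reverse) are thin whole-range wrappers over their std:: counterparts; a brief sketch:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

int main() {
  llvm::SmallVector<int, 4> V = {3, 1, 2};
  llvm::sort(V); // std::sort over the whole range.
  assert(llvm::is_sorted(V));
  assert(llvm::count_if(V, [](int X) { return X % 2 == 0; }) == 1);
  return 0;
}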
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
@ PRIVATE_SEGMENT_WAVE_BYTE_OFFSET
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
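Align and alignTo pair up for the stack-size rounding done throughout this file; a small sketch:

#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  llvm::Align A(16);                   // Must be a non-zero power of two.
  assert(A.value() == 16);             // The raw byte value (use sparingly).
  assert(llvm::alignTo(20, A) == 32);  // Round 20 up to a 16-byte multiple.
  return 0;
}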
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
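getFixedStack is how the spill helpers earlier in this file describe their memory. A sketch of building a store memory operand for frame index FI (makeSpillMMO is hypothetical, and the exact getMachineMemOperand overload set varies slightly across LLVM releases):

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

// Sketch: describe a 4-byte store to stack object FI so scheduling and
// alias analysis know exactly which slot the instruction touches.
static llvm::MachineMemOperand *makeSpillMMO(llvm::MachineFunction &MF,
                                             int FI) {
  llvm::MachinePointerInfo PtrInfo =
      llvm::MachinePointerInfo::getFixedStack(MF, FI);
  return MF.getMachineMemOperand(PtrInfo, llvm::MachineMemOperand::MOStore,
                                 /*Size=*/4,
                                 MF.getFrameInfo().getObjectAlign(FI));
}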