Successfully identified regression in *llvm* in CI configuration tcwg_bmk_llvm_tx1/llvm-master-aarch64-spec2k6-O2_LTO. So far, this commit has regressed CI configurations:
- tcwg_bmk_llvm_tx1/llvm-master-aarch64-spec2k6-O2_LTO
Culprit:
<cut>
commit 643ce61fb3c2c730b7ecead4a489eaeef3f053ea
Author: Akira Hatanaka <ahatanaka(a)apple.com>
Date: Wed Aug 11 12:55:28 2021 -0700
[ObjC][ARC] Don't form a StoreStrong call if it is unsafe to move the
release call
findSafeStoreForStoreStrongContraction checks whether it's safe to move
the release call to the store by inspecting all instructions between the
two, but was ignoring retain instructions. This was causing objects to
be released and deallocated before they were retained.
rdar://81668577
</cut>
Results regressed to (for first_bad == 643ce61fb3c2c730b7ecead4a489eaeef3f053ea)
# reset_artifacts:
-10
# build_abe binutils:
-9
# build_abe stage1 -- --set gcc_override_configure=--disable-libsanitizer:
-8
# build_abe linux:
-7
# build_abe glibc:
-6
# build_abe stage2 -- --set gcc_override_configure=--disable-libsanitizer:
-5
# build_llvm true:
-3
# true:
0
# benchmark -- -O2_LTO artifacts/build-643ce61fb3c2c730b7ecead4a489eaeef3f053ea/results_id:
1
# 433.milc,milc_base.default regressed by 103
from (for last_good == 767496d19cb9a1fbba57ff08095faa161998ee36)
# reset_artifacts:
-10
# build_abe binutils:
-9
# build_abe stage1 -- --set gcc_override_configure=--disable-libsanitizer:
-8
# build_abe linux:
-7
# build_abe glibc:
-6
# build_abe stage2 -- --set gcc_override_configure=--disable-libsanitizer:
-5
# build_llvm true:
-3
# true:
0
# benchmark -- -O2_LTO artifacts/build-767496d19cb9a1fbba57ff08095faa161998ee36/results_id:
1
Artifacts of last_good build: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-…
Results ID of last_good: tx1_64/tcwg_bmk_llvm_tx1/bisect-llvm-master-aarch64-spec2k6-O2_LTO/4172
Artifacts of first_bad build: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-…
Results ID of first_bad: tx1_64/tcwg_bmk_llvm_tx1/bisect-llvm-master-aarch64-spec2k6-O2_LTO/4171
Build top page/logs: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-…
Configuration details:
Reproduce builds:
<cut>
mkdir investigate-llvm-643ce61fb3c2c730b7ecead4a489eaeef3f053ea
cd investigate-llvm-643ce61fb3c2c730b7ecead4a489eaeef3f053ea
git clone https://git.linaro.org/toolchain/jenkins-scripts
mkdir -p artifacts/manifests
curl -o artifacts/manifests/build-baseline.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-… --fail
curl -o artifacts/manifests/build-parameters.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-… --fail
curl -o artifacts/test.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-… --fail
chmod +x artifacts/test.sh
# Reproduce the baseline build (build all pre-requisites)
./jenkins-scripts/tcwg_bmk-build.sh @@ artifacts/manifests/build-baseline.sh
# Save baseline build state (which is then restored in artifacts/test.sh)
mkdir -p ./bisect
rsync -a --del --delete-excluded --exclude /bisect/ --exclude /artifacts/ --exclude /llvm/ ./ ./bisect/baseline/
cd llvm
# Reproduce first_bad build
git checkout --detach 643ce61fb3c2c730b7ecead4a489eaeef3f053ea
../artifacts/test.sh
# Reproduce last_good build
git checkout --detach 767496d19cb9a1fbba57ff08095faa161998ee36
../artifacts/test.sh
cd ..
</cut>
History of pending regressions and results: https://git.linaro.org/toolchain/ci/base-artifacts.git/log/?h=linaro-local/…
Artifacts: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-…
Build log: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-…
Full commit (up to 1000 lines):
<cut>
commit 643ce61fb3c2c730b7ecead4a489eaeef3f053ea
Author: Akira Hatanaka <ahatanaka(a)apple.com>
Date: Wed Aug 11 12:55:28 2021 -0700
[ObjC][ARC] Don't form a StoreStrong call if it is unsafe to move the
release call
findSafeStoreForStoreStrongContraction checks whether it's safe to move
the release call to the store by inspecting all instructions between the
two, but was ignoring retain instructions. This was causing objects to
be released and deallocated before they were retained.
rdar://81668577
---
llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp | 21 ++++++++++++---------
.../test/Transforms/ObjCARC/contract-storestrong.ll | 19 +++++++++++++++++++
2 files changed, 31 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index 62161b5b6b40..577973c80601 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -226,13 +226,6 @@ static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,
// of Inst.
ARCInstKind Class = GetBasicARCInstKind(Inst);
- // If Inst is an unrelated retain, we don't care about it.
- //
- // TODO: This is one area where the optimization could be made more
- // aggressive.
- if (IsRetain(Class))
- continue;
-
// If we have seen the store, but not the release...
if (Store) {
// We need to make sure that it is safe to move the release from its
@@ -248,8 +241,18 @@ static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,
return nullptr;
}
- // Ok, now we know we have not seen a store yet. See if Inst can write to
- // our load location, if it can not, just ignore the instruction.
+ // Ok, now we know we have not seen a store yet.
+
+ // If Inst is a retain, we don't care about it as it doesn't prevent moving
+ // the load to the store.
+ //
+ // TODO: This is one area where the optimization could be made more
+ // aggressive.
+ if (IsRetain(Class))
+ continue;
+
+ // See if Inst can write to our load location, if it can not, just ignore
+ // the instruction.
if (!isModSet(AA->getModRefInfo(Inst, Loc)))
continue;
diff --git a/llvm/test/Transforms/ObjCARC/contract-storestrong.ll b/llvm/test/Transforms/ObjCARC/contract-storestrong.ll
index eff0a6fdf900..9c45e3334d83 100644
--- a/llvm/test/Transforms/ObjCARC/contract-storestrong.ll
+++ b/llvm/test/Transforms/ObjCARC/contract-storestrong.ll
@@ -256,6 +256,25 @@ define i8* @test13(i8* %a0, i8* %a1, i8** %addr, i8* %new) {
ret i8* %retained
}
+; Cannot form a storeStrong call because it's unsafe to move the release call to
+; the store.
+
+; CHECK-LABEL: define void @test14(
+; CHECK: %[[V0:.*]] = load i8*, i8** %a
+; CHECK: %[[V1:.*]] = call i8* @llvm.objc.retain(i8* %p)
+; CHECK: store i8* %[[V1]], i8** %a
+; CHECK: %[[V2:.*]] = call i8* @llvm.objc.retain(i8* %[[V0]])
+; CHECK: call void @llvm.objc.release(i8* %[[V2]])
+
+define void @test14(i8** %a, i8* %p) {
+ %v0 = load i8*, i8** %a, align 8
+ %v1 = call i8* @llvm.objc.retain(i8* %p)
+ store i8* %p, i8** %a, align 8
+ %v2 = call i8* @llvm.objc.retain(i8* %v0)
+ call void @llvm.objc.release(i8* %v0)
+ ret void
+}
+
!0 = !{}
; CHECK: attributes [[NUW]] = { nounwind }
</cut>
Successfully identified regression in *llvm* in CI configuration tcwg_bmk_llvm_tk1/llvm-release-arm-spec2k6-O2_LTO. So far, this commit has regressed CI configurations:
- tcwg_bmk_llvm_tk1/llvm-release-arm-spec2k6-O2_LTO
Culprit:
<cut>
commit 176379e0c8f9dbde2b357fb3b6a6802b83282e71
Author: Alex Zinenko <zinenko(a)google.com>
Date: Fri Feb 12 12:53:27 2021 +0100
[mlir] Use the interface-based translation for LLVM "intrinsic" dialects
Port the translation of five dialects that define LLVM IR intrinsics
(LLVMAVX512, LLVMArmNeon, LLVMArmSVE, NVVM, ROCDL) to the new dialect
interface-based mechanism. This allows us to remove individual translations
that were created for each of these dialects and just use one common
MLIR-to-LLVM-IR translation that potentially supports all dialects instead,
based on what is registered and including any combination of translatable
dialects. This removal was one of the main goals of the refactoring.
To support the addition of GPU-related metadata, the translation interface is
extended with the `amendOperation` function that allows the interface
implementation to post-process any translated operation with dialect attributes
from the dialect for which the interface is implemented regardless of the
operation's dialect. This is currently applied to "kernel" functions, but can
be used to construct other metadata in dialect-specific ways without
necessarily affecting operations.
Depends On D96591, D96504
Reviewed By: nicolasvasilache
Differential Revision: https://reviews.llvm.org/D96592
</cut>
Results regressed to (for first_bad == 176379e0c8f9dbde2b357fb3b6a6802b83282e71)
# reset_artifacts:
-10
# build_abe binutils:
-9
# build_abe stage1 -- --set gcc_override_configure=--with-mode=arm --set gcc_override_configure=--disable-libsanitizer:
-8
# build_abe linux:
-7
# build_abe glibc:
-6
# build_abe stage2 -- --set gcc_override_configure=--with-mode=arm --set gcc_override_configure=--disable-libsanitizer:
-5
# build_llvm true:
-3
# true:
0
# benchmark -- -O2_LTO_marm artifacts/build-176379e0c8f9dbde2b357fb3b6a6802b83282e71/results_id:
1
# 482.sphinx3,sphinx_livepretend_base.default regressed by 109
from (for last_good == 2d728bbff5c688284b8b8306ecfd3000b0ab8bb1)
# reset_artifacts:
-10
# build_abe binutils:
-9
# build_abe stage1 -- --set gcc_override_configure=--with-mode=arm --set gcc_override_configure=--disable-libsanitizer:
-8
# build_abe linux:
-7
# build_abe glibc:
-6
# build_abe stage2 -- --set gcc_override_configure=--with-mode=arm --set gcc_override_configure=--disable-libsanitizer:
-5
# build_llvm true:
-3
# true:
0
# benchmark -- -O2_LTO_marm artifacts/build-2d728bbff5c688284b8b8306ecfd3000b0ab8bb1/results_id:
1
Artifacts of last_good build: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release…
Results ID of last_good: tk1_32/tcwg_bmk_llvm_tk1/bisect-llvm-release-arm-spec2k6-O2_LTO/4128
Artifacts of first_bad build: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release…
Results ID of first_bad: tk1_32/tcwg_bmk_llvm_tk1/bisect-llvm-release-arm-spec2k6-O2_LTO/4125
Build top page/logs: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release…
Configuration details:
Reproduce builds:
<cut>
mkdir investigate-llvm-176379e0c8f9dbde2b357fb3b6a6802b83282e71
cd investigate-llvm-176379e0c8f9dbde2b357fb3b6a6802b83282e71
git clone https://git.linaro.org/toolchain/jenkins-scripts
mkdir -p artifacts/manifests
curl -o artifacts/manifests/build-baseline.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release… --fail
curl -o artifacts/manifests/build-parameters.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release… --fail
curl -o artifacts/test.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release… --fail
chmod +x artifacts/test.sh
# Reproduce the baseline build (build all pre-requisites)
./jenkins-scripts/tcwg_bmk-build.sh @@ artifacts/manifests/build-baseline.sh
# Save baseline build state (which is then restored in artifacts/test.sh)
mkdir -p ./bisect
rsync -a --del --delete-excluded --exclude /bisect/ --exclude /artifacts/ --exclude /llvm/ ./ ./bisect/baseline/
cd llvm
# Reproduce first_bad build
git checkout --detach 176379e0c8f9dbde2b357fb3b6a6802b83282e71
../artifacts/test.sh
# Reproduce last_good build
git checkout --detach 2d728bbff5c688284b8b8306ecfd3000b0ab8bb1
../artifacts/test.sh
cd ..
</cut>
History of pending regressions and results: https://git.linaro.org/toolchain/ci/base-artifacts.git/log/?h=linaro-local/…
Artifacts: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release…
Build log: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release…
Full commit (up to 1000 lines):
<cut>
commit 176379e0c8f9dbde2b357fb3b6a6802b83282e71
Author: Alex Zinenko <zinenko(a)google.com>
Date: Fri Feb 12 12:53:27 2021 +0100
[mlir] Use the interface-based translation for LLVM "intrinsic" dialects
Port the translation of five dialects that define LLVM IR intrinsics
(LLVMAVX512, LLVMArmNeon, LLVMArmSVE, NVVM, ROCDL) to the new dialect
interface-based mechanism. This allows us to remove individual translations
that were created for each of these dialects and just use one common
MLIR-to-LLVM-IR translation that potentially supports all dialects instead,
based on what is registered and including any combination of translatable
dialects. This removal was one of the main goals of the refactoring.
To support the addition of GPU-related metadata, the translation interface is
extended with the `amendOperation` function that allows the interface
implementation to post-process any translated operation with dialect attributes
from the dialect for which the interface is implemented regardless of the
operation's dialect. This is currently applied to "kernel" functions, but can
be used to construct other metadata in dialect-specific ways without
necessarily affecting operations.
Depends On D96591, D96504
Reviewed By: nicolasvasilache
Differential Revision: https://reviews.llvm.org/D96592
---
.../test/Standalone/standalone-translate.mlir | 3 -
mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td | 13 ++-
mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 3 +-
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 2 +-
mlir/include/mlir/InitAllTranslations.h | 10 --
mlir/include/mlir/Target/LLVMIR.h | 4 +-
.../LLVMAVX512/LLVMAVX512ToLLVMIRTranslation.h | 37 ++++++
.../LLVMArmNeon/LLVMArmNeonToLLVMIRTranslation.h | 37 ++++++
.../LLVMArmSVE/LLVMArmSVEToLLVMIRTranslation.h | 37 ++++++
.../Dialect/LLVMIR/LLVMToLLVMIRTranslation.h | 4 +-
.../LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h | 42 +++++++
.../Dialect/OpenMP/OpenMPToLLVMIRTranslation.h | 4 +-
.../Dialect/ROCDL/ROCDLToLLVMIRTranslation.h | 42 +++++++
.../mlir/Target/LLVMIR/LLVMTranslationInterface.h | 28 ++++-
.../include/mlir/Target/LLVMIR/ModuleTranslation.h | 32 ++++--
mlir/include/mlir/Target/NVVMIR.h | 39 -------
mlir/include/mlir/Target/ROCDLIR.h | 40 -------
mlir/lib/Target/CMakeLists.txt | 107 +----------------
mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp | 35 +++++-
mlir/lib/Target/LLVMIR/ConvertToNVVMIR.cpp | 124 --------------------
mlir/lib/Target/LLVMIR/ConvertToROCDLIR.cpp | 127 ---------------------
mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt | 5 +
.../LLVMIR/Dialect/LLVMAVX512/CMakeLists.txt | 16 +++
.../LLVMAVX512/LLVMAVX512ToLLVMIRTranslation.cpp | 33 ++++++
.../LLVMIR/Dialect/LLVMArmNeon/CMakeLists.txt | 16 +++
.../LLVMArmNeon/LLVMArmNeonToLLVMIRTranslation.cpp | 33 ++++++
.../LLVMIR/Dialect/LLVMArmSVE/CMakeLists.txt | 16 +++
.../LLVMArmSVE/LLVMArmSVEToLLVMIRTranslation.cpp | 33 ++++++
.../Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp | 71 ++++--------
mlir/lib/Target/LLVMIR/Dialect/NVVM/CMakeLists.txt | 16 +++
.../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 67 +++++++++++
.../lib/Target/LLVMIR/Dialect/ROCDL/CMakeLists.txt | 16 +++
.../Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp | 69 +++++++++++
mlir/lib/Target/LLVMIR/LLVMAVX512Intr.cpp | 65 -----------
mlir/lib/Target/LLVMIR/LLVMArmNeonIntr.cpp | 65 -----------
mlir/lib/Target/LLVMIR/LLVMArmSVEIntr.cpp | 65 -----------
mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 65 +++++++----
mlir/test/Target/arm-neon.mlir | 2 +-
mlir/test/Target/arm-sve.mlir | 2 +-
mlir/test/Target/avx512.mlir | 2 +-
mlir/test/Target/nvvmir.mlir | 2 +-
mlir/test/Target/rocdl.mlir | 2 +-
mlir/test/lib/Transforms/CMakeLists.txt | 13 ++-
.../lib/Transforms/TestConvertGPUKernelToCubin.cpp | 25 +++-
.../lib/Transforms/TestConvertGPUKernelToHsaco.cpp | 22 +++-
mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp | 6 +-
mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp | 3 +
mlir/tools/mlir-tblgen/LLVMIRConversionGen.cpp | 10 +-
48 files changed, 750 insertions(+), 760 deletions(-)
diff --git a/mlir/examples/standalone/test/Standalone/standalone-translate.mlir b/mlir/examples/standalone/test/Standalone/standalone-translate.mlir
index 2a096c38e128..16d49785ee16 100644
--- a/mlir/examples/standalone/test/Standalone/standalone-translate.mlir
+++ b/mlir/examples/standalone/test/Standalone/standalone-translate.mlir
@@ -1,8 +1,5 @@
// RUN: standalone-translate --help | FileCheck %s
-// CHECK: --avx512-mlir-to-llvmir
// CHECK: --deserialize-spirv
// CHECK: --import-llvm
// CHECK: --mlir-to-llvmir
-// CHECK: --mlir-to-nvvmir
-// CHECK: --mlir-to-rocdlir
// CHECK: --serialize-spirv
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
index ad886e55b4e6..541f7ebfadfa 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
@@ -219,12 +219,13 @@ class ListIntSubst<string pattern, list<int> values> {
// or result in the operation.
def LLVM_IntrPatterns {
string operand =
- [{convertType(opInst.getOperand($0).getType())}];
+ [{moduleTranslation.convertType(opInst.getOperand($0).getType())}];
string result =
- [{convertType(opInst.getResult($0).getType())}];
+ [{moduleTranslation.convertType(opInst.getResult($0).getType())}];
string structResult =
- [{convertType(opInst.getResult(0).getType().cast<LLVM::LLVMStructType>()
- .getBody()[$0])}];
+ [{moduleTranslation.convertType(
+ opInst.getResult(0).getType().cast<LLVM::LLVMStructType>()
+ .getBody()[$0])}];
}
@@ -259,7 +260,7 @@ class LLVM_IntrOpBase<Dialect dialect, string opName, string enumName,
ListIntSubst<LLVM_IntrPatterns.operand,
overloadedOperands>.lst), ", ") # [{
});
- auto operands = lookupValues(opInst.getOperands());
+ auto operands = moduleTranslation.lookupValues(opInst.getOperands());
}] # !if(!gt(numResults, 0), "$res = ", "")
# [{builder.CreateCall(fn, operands);
}];
@@ -325,7 +326,7 @@ class LLVM_VectorReductionAcc<string mnem>
{ }] # !interleave(ListIntSubst<LLVM_IntrPatterns.operand, [1]>.lst,
", ") # [{
});
- auto operands = lookupValues(opInst.getOperands());
+ auto operands = moduleTranslation.lookupValues(opInst.getOperands());
llvm::FastMathFlags origFM = builder.getFastMathFlags();
llvm::FastMathFlags tempFM = origFM;
tempFM.setAllowReassoc($reassoc);
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index 7a2152b9a481..7dca08a43f7a 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -1083,7 +1083,8 @@ def LLVM_UndefOp : LLVM_Op<"mlir.undef", [NoSideEffect]>,
def LLVM_ConstantOp
: LLVM_Op<"mlir.constant", [NoSideEffect]>,
- LLVM_Builder<"$res = getLLVMConstant($_resultType, $value, $_location);">
+ LLVM_Builder<[{$res = getLLVMConstant($_resultType, $value, $_location,
+ moduleTranslation);}]>
{
let summary = "Defines a constant of LLVM type.";
let description = [{
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index d5eec3cebb4a..cfb08ff465a2 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -175,7 +175,7 @@ def ROCDL_MubufStoreOp :
LLVM_Type:$glc,
LLVM_Type:$slc)>{
string llvmBuilder = [{
- auto vdataType = convertType(op.vdata().getType());
+ auto vdataType = moduleTranslation.convertType(op.vdata().getType());
createIntrinsicCall(builder,
llvm::Intrinsic::amdgcn_buffer_store, {$vdata, $rsrc, $vindex,
$offset, $glc, $slc}, {vdataType});
diff --git a/mlir/include/mlir/InitAllTranslations.h b/mlir/include/mlir/InitAllTranslations.h
index 16dd113d14cd..fc319c09a8c8 100644
--- a/mlir/include/mlir/InitAllTranslations.h
+++ b/mlir/include/mlir/InitAllTranslations.h
@@ -20,11 +20,6 @@ void registerFromLLVMIRTranslation();
void registerFromSPIRVTranslation();
void registerToLLVMIRTranslation();
void registerToSPIRVTranslation();
-void registerToNVVMIRTranslation();
-void registerToROCDLIRTranslation();
-void registerArmNeonToLLVMIRTranslation();
-void registerAVX512ToLLVMIRTranslation();
-void registerArmSVEToLLVMIRTranslation();
// This function should be called before creating any MLIRContext if one
// expects all the possible translations to be made available to the context
@@ -35,11 +30,6 @@ inline void registerAllTranslations() {
registerFromSPIRVTranslation();
registerToLLVMIRTranslation();
registerToSPIRVTranslation();
- registerToNVVMIRTranslation();
- registerToROCDLIRTranslation();
- registerArmNeonToLLVMIRTranslation();
- registerAVX512ToLLVMIRTranslation();
- registerArmSVEToLLVMIRTranslation();
return true;
}();
(void)initOnce;
diff --git a/mlir/include/mlir/Target/LLVMIR.h b/mlir/include/mlir/Target/LLVMIR.h
index 2050c63df73f..10bec79f3506 100644
--- a/mlir/include/mlir/Target/LLVMIR.h
+++ b/mlir/include/mlir/Target/LLVMIR.h
@@ -28,14 +28,14 @@ namespace mlir {
class DialectRegistry;
class OwningModuleRef;
class MLIRContext;
-class ModuleOp;
+class Operation;
/// Convert the given MLIR module into LLVM IR. The LLVM context is extracted
/// from the registered LLVM IR dialect. In case of error, report it
/// to the error handler registered with the MLIR context, if any (obtained from
/// the MLIR module), and return `nullptr`.
std::unique_ptr<llvm::Module>
-translateModuleToLLVMIR(ModuleOp m, llvm::LLVMContext &llvmContext,
+translateModuleToLLVMIR(Operation *op, llvm::LLVMContext &llvmContext,
StringRef name = "LLVMDialectModule");
/// Convert the given LLVM module into MLIR's LLVM dialect. The LLVM context is
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/LLVMAVX512/LLVMAVX512ToLLVMIRTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/LLVMAVX512/LLVMAVX512ToLLVMIRTranslation.h
new file mode 100644
index 000000000000..e591a95f0357
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/LLVMAVX512/LLVMAVX512ToLLVMIRTranslation.h
@@ -0,0 +1,37 @@
+//===- LLVMAVX512ToLLVMIRTranslation.h - LLVMAVX512 to LLVM IR --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the dialect interface for translating the LLVMAVX512
+// dialect to LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_LLVMAVX512_LLVMAVX512TOLLVMIRTRANSLATION_H
+#define MLIR_TARGET_LLVMIR_DIALECT_LLVMAVX512_LLVMAVX512TOLLVMIRTRANSLATION_H
+
+#include "mlir/Target/LLVMIR/LLVMTranslationInterface.h"
+
+namespace mlir {
+
+/// Implementation of the dialect interface that converts operations belonging
+/// to the LLVMAVX512 dialect to LLVM IR.
+class LLVMAVX512DialectLLVMIRTranslationInterface
+ : public LLVMTranslationDialectInterface {
+public:
+ using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface;
+
+ /// Translates the given operation to LLVM IR using the provided IR builder
+ /// and saving the state in `moduleTranslation`.
+ LogicalResult
+ convertOperation(Operation *op, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) const final;
+};
+
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVMIR_DIALECT_LLVMAVX512_LLVMAVX512TOLLVMIRTRANSLATION_H
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/LLVMArmNeon/LLVMArmNeonToLLVMIRTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/LLVMArmNeon/LLVMArmNeonToLLVMIRTranslation.h
new file mode 100644
index 000000000000..7d268d155083
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/LLVMArmNeon/LLVMArmNeonToLLVMIRTranslation.h
@@ -0,0 +1,37 @@
+//===- LLVMArmNeonToLLVMIRTranslation.h - LLVMArmNeon to LLVMIR -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the dialect interface for translating the LLVMArmNeon
+// dialect to LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_LLVMARMNEON_LLVMARMNEONTOLLVMIRTRANSLATION_H
+#define MLIR_TARGET_LLVMIR_DIALECT_LLVMARMNEON_LLVMARMNEONTOLLVMIRTRANSLATION_H
+
+#include "mlir/Target/LLVMIR/LLVMTranslationInterface.h"
+
+namespace mlir {
+
+/// Implementation of the dialect interface that converts operations belonging
+/// to the LLVMArmNeon dialect to LLVM IR.
+class LLVMArmNeonDialectLLVMIRTranslationInterface
+ : public LLVMTranslationDialectInterface {
+public:
+ using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface;
+
+ /// Translates the given operation to LLVM IR using the provided IR builder
+ /// and saving the state in `moduleTranslation`.
+ LogicalResult
+ convertOperation(Operation *op, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) const final;
+};
+
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVMIR_DIALECT_LLVMARMNEON_LLVMARMNEONTOLLVMIRTRANSLATION_H
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/LLVMArmSVE/LLVMArmSVEToLLVMIRTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/LLVMArmSVE/LLVMArmSVEToLLVMIRTranslation.h
new file mode 100644
index 000000000000..9d4d05b9b9bd
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/LLVMArmSVE/LLVMArmSVEToLLVMIRTranslation.h
@@ -0,0 +1,37 @@
+//===- LLVMArmSVEToLLVMIRTranslation.h - LLVMArmSVE to LLVM IR --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the dialect interface for translating the LLVMArmSVE
+// dialect to LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_LLVMARMSVE_LLVMARMSVETOLLVMIRTRANSLATION_H
+#define MLIR_TARGET_LLVMIR_DIALECT_LLVMARMSVE_LLVMARMSVETOLLVMIRTRANSLATION_H
+
+#include "mlir/Target/LLVMIR/LLVMTranslationInterface.h"
+
+namespace mlir {
+
+/// Implementation of the dialect interface that converts operations belonging
+/// to the LLVMArmSVE dialect to LLVM IR.
+class LLVMArmSVEDialectLLVMIRTranslationInterface
+ : public LLVMTranslationDialectInterface {
+public:
+ using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface;
+
+ /// Translates the given operation to LLVM IR using the provided IR builder
+ /// and saving the state in `moduleTranslation`.
+ LogicalResult
+ convertOperation(Operation *op, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) const final;
+};
+
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVMIR_DIALECT_LLVMARMSVE_LLVMARMSVETOLLVMIRTRANSLATION_H
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h
index 8b72cedf1ff2..2af76e092917 100644
--- a/mlir/include/mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h
@@ -18,8 +18,8 @@
namespace mlir {
-/// Implementation of the dialect interface that converts operations beloning to
-/// the LLVM dialect to LLVM IR.
+/// Implementation of the dialect interface that converts operations belonging
+/// to the LLVM dialect to LLVM IR.
class LLVMDialectLLVMIRTranslationInterface
: public LLVMTranslationDialectInterface {
public:
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h
new file mode 100644
index 000000000000..3a8a01df84b0
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h
@@ -0,0 +1,42 @@
+//===- NVVMToLLVMIRTranslation.h - NVVM to LLVM IR --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the dialect interface for translating the NVVM
+// dialect to LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_NVVM_NVVMTOLLVMIRTRANSLATION_H
+#define MLIR_TARGET_LLVMIR_DIALECT_NVVM_NVVMTOLLVMIRTRANSLATION_H
+
+#include "mlir/Target/LLVMIR/LLVMTranslationInterface.h"
+
+namespace mlir {
+
+/// Implementation of the dialect interface that converts operations belonging
+/// to the NVVM dialect to LLVM IR.
+class NVVMDialectLLVMIRTranslationInterface
+ : public LLVMTranslationDialectInterface {
+public:
+ using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface;
+
+ /// Translates the given operation to LLVM IR using the provided IR builder
+ /// and saving the state in `moduleTranslation`.
+ LogicalResult
+ convertOperation(Operation *op, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) const final;
+
+ /// Attaches module-level metadata for functions marked as kernels.
+ LogicalResult
+ amendOperation(Operation *op, NamedAttribute attribute,
+ LLVM::ModuleTranslation &moduleTranslation) const final;
+};
+
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVMIR_DIALECT_NVVM_NVVMTOLLVMIRTRANSLATION_H
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h
index 7d9eeea9462e..07721d089689 100644
--- a/mlir/include/mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h
@@ -18,8 +18,8 @@
namespace mlir {
-/// Implementation of the dialect interface that converts operations beloning to
-/// the OpenMP dialect to LLVM IR.
+/// Implementation of the dialect interface that converts operations belonging
+/// to the OpenMP dialect to LLVM IR.
class OpenMPDialectLLVMIRTranslationInterface
: public LLVMTranslationDialectInterface {
public:
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h
new file mode 100644
index 000000000000..e2211a59098f
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h
@@ -0,0 +1,42 @@
+//===- ROCDLToLLVMIRTranslation.h - ROCDL to LLVM IR ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the dialect interface for translating the ROCDL
+// dialect to LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_ROCDL_ROCDLTOLLVMIRTRANSLATION_H
+#define MLIR_TARGET_LLVMIR_DIALECT_ROCDL_ROCDLTOLLVMIRTRANSLATION_H
+
+#include "mlir/Target/LLVMIR/LLVMTranslationInterface.h"
+
+namespace mlir {
+
+/// Implementation of the dialect interface that converts operations belonging
+/// to the ROCDL dialect to LLVM IR.
+class ROCDLDialectLLVMIRTranslationInterface
+ : public LLVMTranslationDialectInterface {
+public:
+ using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface;
+
+ /// Translates the given operation to LLVM IR using the provided IR builder
+ /// and saving the state in `moduleTranslation`.
+ LogicalResult
+ convertOperation(Operation *op, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) const final;
+
+ /// Attaches module-level metadata for functions marked as kernels.
+ LogicalResult
+ amendOperation(Operation *op, NamedAttribute attribute,
+ LLVM::ModuleTranslation &moduleTranslation) const final;
+};
+
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVMIR_DIALECT_ROCDL_ROCDLTOLLVMIRTRANSLATION_H
diff --git a/mlir/include/mlir/Target/LLVMIR/LLVMTranslationInterface.h b/mlir/include/mlir/Target/LLVMIR/LLVMTranslationInterface.h
index 0063beac2977..0c563e6e7d39 100644
--- a/mlir/include/mlir/Target/LLVMIR/LLVMTranslationInterface.h
+++ b/mlir/include/mlir/Target/LLVMIR/LLVMTranslationInterface.h
@@ -13,12 +13,14 @@
#ifndef MLIR_TARGET_LLVMIR_LLVMTRANSLATIONINTERFACE_H
#define MLIR_TARGET_LLVMIR_LLVMTRANSLATIONINTERFACE_H
+#include "mlir/IR/Attributes.h"
#include "mlir/IR/DialectInterface.h"
+#include "mlir/IR/Identifier.h"
#include "mlir/Support/LogicalResult.h"
namespace llvm {
class IRBuilderBase;
-}
+} // namespace llvm
namespace mlir {
namespace LLVM {
@@ -43,6 +45,18 @@ public:
LLVM::ModuleTranslation &moduleTranslation) const {
return failure();
}
+
+ /// Hook for derived dialect interface to act on an operation that has dialect
+ /// attributes from the derived dialect (the operation itself may be from a
+ /// different dialect). This gets called after the operation has been
+ /// translated. The hook is expected to use moduleTranslation to look up the
+ /// translation results and amend the corresponding IR constructs. Does
+ /// nothing and succeeds by default.
+ virtual LogicalResult
+ amendOperation(Operation *op, NamedAttribute attribute,
+ LLVM::ModuleTranslation &moduleTranslation) const {
+ return success();
+ }
};
/// Interface collection for translation to LLVM IR, dispatches to a concrete
@@ -61,6 +75,18 @@ public:
return iface->convertOperation(op, builder, moduleTranslation);
return failure();
}
+
+ /// Acts on the given operation using the interface implemented by the dialect
+ /// of one of the operation's dialect attributes.
+ virtual LogicalResult
+ amendOperation(Operation *op, NamedAttribute attribute,
+ LLVM::ModuleTranslation &moduleTranslation) const {
+ if (const LLVMTranslationDialectInterface *iface =
+ getInterfaceFor(attribute.first.getDialect())) {
+ return iface->amendOperation(op, attribute, moduleTranslation);
+ }
+ return success();
+ }
};
} // namespace mlir
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
index 03b7f5336461..004524f33fa4 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
@@ -142,18 +142,11 @@ public:
/// Looks up remapped a list of remapped values.
SmallVector<llvm::Value *, 8> lookupValues(ValueRange values);
- /// Create an LLVM IR constant of `llvmType` from the MLIR attribute `attr`.
- /// This currently supports integer, floating point, splat and dense element
- /// attributes and combinations thereof. In case of error, report it to `loc`
- /// and return nullptr.
- llvm::Constant *getLLVMConstant(llvm::Type *llvmType, Attribute attr,
- Location loc);
-
/// Returns the MLIR context of the module being translated.
MLIRContext &getContext() { return *mlirModule->getContext(); }
/// Returns the LLVM context in which the IR is being constructed.
- llvm::LLVMContext &getLLVMContext() { return llvmModule->getContext(); }
+ llvm::LLVMContext &getLLVMContext() const { return llvmModule->getContext(); }
/// Finds an LLVM IR global value that corresponds to the given MLIR operation
/// defining a global value.
@@ -184,6 +177,10 @@ public:
LogicalResult convertBlock(Block &bb, bool ignoreArguments,
llvm::IRBuilder<> &builder);
+ /// Gets the named metadata in the LLVM IR module being constructed, creating
+ /// it if it does not exist.
+ llvm::NamedMDNode *getOrInsertNamedModuleMetadata(StringRef name);
+
protected:
/// Translate the given MLIR module expressed in MLIR LLVM IR dialect into an
/// LLVM IR module. The MLIR LLVM IR dialect holds a pointer to an
@@ -208,6 +205,9 @@ private:
LogicalResult convertGlobals();
LogicalResult convertOneFunction(LLVMFuncOp func);
+ /// Translates dialect attributes attached to the given operation.
+ LogicalResult convertDialectAttributes(Operation *op);
+
/// Original and translated module.
Operation *mlirModule;
std::unique_ptr<llvm::Module> llvmModule;
@@ -228,6 +228,8 @@ private:
/// A stateful object used to translate types.
TypeToLLVMIRTranslator typeTranslator;
+ /// A dialect interface collection used for dispatching the translation to
+ /// specific dialects.
LLVMTranslationInterface iface;
/// Mappings between original and translated values, used for lookups.
@@ -249,6 +251,20 @@ void connectPHINodes(Region ®ion, const ModuleTranslation &state);
/// Get a topologically sorted list of blocks of the given region.
llvm::SetVector<Block *> getTopologicallySortedBlocks(Region ®ion);
+
+/// Create an LLVM IR constant of `llvmType` from the MLIR attribute `attr`.
+/// This currently supports integer, floating point, splat and dense element
+/// attributes and combinations thereof. In case of error, report it to `loc`
+/// and return nullptr.
+llvm::Constant *getLLVMConstant(llvm::Type *llvmType, Attribute attr,
+ Location loc,
+ const ModuleTranslation &moduleTranslation);
+
+/// Creates a call to an LLVM IR intrinsic function with the given arguments.
+llvm::Value *createIntrinsicCall(llvm::IRBuilderBase &builder,
+ llvm::Intrinsic::ID intrinsic,
+ ArrayRef<llvm::Value *> args = {},
+ ArrayRef<llvm::Type *> tys = {});
} // namespace detail
} // namespace LLVM
diff --git a/mlir/include/mlir/Target/NVVMIR.h b/mlir/include/mlir/Target/NVVMIR.h
deleted file mode 100644
index 0cd7688e275b..000000000000
--- a/mlir/include/mlir/Target/NVVMIR.h
+++ /dev/null
@@ -1,39 +0,0 @@
-//===- NVVMIR.h - MLIR to LLVM + NVVM IR conversion -------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the entry point for the MLIR to LLVM + NVVM IR conversion.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_TARGET_NVVMIR_H
-#define MLIR_TARGET_NVVMIR_H
-
-#include "llvm/ADT/StringRef.h"
-#include <memory>
-
-// Forward-declare LLVM classes.
-namespace llvm {
-class LLVMContext;
-class Module;
-} // namespace llvm
-
-namespace mlir {
-class Operation;
-
-/// Convert the given LLVM-module-like operation into NVVM IR. This conversion
-/// requires the registration of the LLVM IR dialect and will extract the LLVM
-/// context from the registered LLVM IR dialect. In case of error, report it to
-/// the error handler registered with the MLIR context, if any (obtained from
-/// the MLIR module), and return `nullptr`.
-std::unique_ptr<llvm::Module>
-translateModuleToNVVMIR(Operation *m, llvm::LLVMContext &llvmContext,
- llvm::StringRef name = "LLVMDialectModule");
-
-} // namespace mlir
-
-#endif // MLIR_TARGET_NVVMIR_H
diff --git a/mlir/include/mlir/Target/ROCDLIR.h b/mlir/include/mlir/Target/ROCDLIR.h
deleted file mode 100644
index e2cb812a173d..000000000000
--- a/mlir/include/mlir/Target/ROCDLIR.h
+++ /dev/null
@@ -1,40 +0,0 @@
-//===- ROCDLIR.h - MLIR to LLVM + ROCDL IR conversion -----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the entry point for the MLIR to LLVM + ROCDL IR
-// conversion.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_TARGET_ROCDLIR_H
-#define MLIR_TARGET_ROCDLIR_H
-
-#include "llvm/ADT/StringRef.h"
-#include <memory>
-
-// Forward-declare LLVM classes.
-namespace llvm {
-class LLVMContext;
-class Module;
-} // namespace llvm
-
-namespace mlir {
-class Operation;
-
-/// Convert the given LLVM-module-like operation into ROCDL IR. This conversion
-/// requires the registration of the LLVM IR dialect and will extract the LLVM
-/// context from the registered LLVM IR dialect. In case of error, report it to
-/// the error handler registered with the MLIR context, if any (obtained from
-/// the MLIR module), and return `nullptr`.
-std::unique_ptr<llvm::Module>
-translateModuleToROCDLIR(Operation *m, llvm::LLVMContext &llvmContext,
- llvm::StringRef name = "LLVMDialectModule");
-
-} // namespace mlir
-
-#endif // MLIR_TARGET_ROCDLIR_H
diff --git a/mlir/lib/Target/CMakeLists.txt b/mlir/lib/Target/CMakeLists.txt
index e951ffade6aa..a23222d37ede 100644
--- a/mlir/lib/Target/CMakeLists.txt
+++ b/mlir/lib/Target/CMakeLists.txt
@@ -24,26 +24,6 @@ add_mlir_translation_library(MLIRTargetLLVMIRModuleTranslation
MLIRTranslation
)
-add_mlir_translation_library(MLIRTargetAVX512
- LLVMIR/LLVMAVX512Intr.cpp
-
- ADDITIONAL_HEADER_DIRS
- ${MLIR_MAIN_INCLUDE_DIR}/mlir/Target/LLVMIR
-
- DEPENDS
- MLIRLLVMAVX512ConversionsIncGen
-
- LINK_COMPONENTS
- Core
-
- LINK_LIBS PUBLIC
- MLIRIR
- MLIRLLVMAVX512
- MLIRLLVMIR
- MLIRTargetLLVMIR
- MLIRTargetLLVMIRModuleTranslation
- )
-
add_mlir_translation_library(MLIRTargetLLVMIR
LLVMIR/ConvertFromLLVMIR.cpp
LLVMIR/ConvertToLLVMIR.cpp
@@ -56,89 +36,12 @@ add_mlir_translation_library(MLIRTargetLLVMIR
IRReader
LINK_LIBS PUBLIC
+ MLIRLLVMArmNeonToLLVMIRTranslation
+ MLIRLLVMArmSVEToLLVMIRTranslation
+ MLIRLLVMAVX512ToLLVMIRTranslation
MLIRLLVMToLLVMIRTranslation
+ MLIRNVVMToLLVMIRTranslation
MLIROpenMPToLLVMIRTranslation
- MLIRTargetLLVMIRModuleTranslation
- )
-
-add_mlir_translation_library(MLIRTargetArmNeon
- LLVMIR/LLVMArmNeonIntr.cpp
-
- ADDITIONAL_HEADER_DIRS
- ${MLIR_MAIN_INCLUDE_DIR}/mlir/Target/LLVMIR
-
- DEPENDS
- MLIRLLVMArmNeonConversionsIncGen
-
- LINK_COMPONENTS
- Core
-
- LINK_LIBS PUBLIC
- MLIRIR
- MLIRLLVMArmNeon
- MLIRLLVMIR
- MLIRTargetLLVMIR
- MLIRTargetLLVMIRModuleTranslation
- )
-
-add_mlir_translation_library(MLIRTargetArmSVE
- LLVMIR/LLVMArmSVEIntr.cpp
-
- ADDITIONAL_HEADER_DIRS
- ${MLIR_MAIN_INCLUDE_DIR}/mlir/Target/LLVMIR
-
- DEPENDS
- MLIRLLVMArmSVEConversionsIncGen
-
- LINK_COMPONENTS
- Core
-
- LINK_LIBS PUBLIC
- MLIRIR
- MLIRLLVMArmSVE
- MLIRLLVMIR
- MLIRTargetLLVMIR
- MLIRTargetLLVMIRModuleTranslation
- )
-
-add_mlir_translation_library(MLIRTargetNVVMIR
- LLVMIR/ConvertToNVVMIR.cpp
-
- ADDITIONAL_HEADER_DIRS
- ${MLIR_MAIN_INCLUDE_DIR}/mlir/Target/LLVMIR
-
- DEPENDS
- intrinsics_gen
-
- LINK_COMPONENTS
- Core
-
- LINK_LIBS PUBLIC
- MLIRGPU
- MLIRIR
- MLIRLLVMIR
- MLIRNVVMIR
- MLIRTargetLLVMIR
- MLIRTargetLLVMIRModuleTranslation
- )
-
-add_mlir_translation_library(MLIRTargetROCDLIR
- LLVMIR/ConvertToROCDLIR.cpp
-
- ADDITIONAL_HEADER_DIRS
- ${MLIR_MAIN_INCLUDE_DIR}/mlir/Target/LLVMIR
-
- DEPENDS
- intrinsics_gen
-
- LINK_COMPONENTS
- Core
-
- LINK_LIBS PUBLIC
- MLIRGPU
- MLIRIR
- MLIRLLVMIR
- MLIRROCDLIR
- MLIRTargetLLVMIR
+ MLIRROCDLToLLVMIRTranslation
MLIRTargetLLVMIRModuleTranslation
)
diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
index 6b30748cc79b..42391513bacf 100644
--- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
@@ -12,9 +12,19 @@
#include "mlir/Target/LLVMIR.h"
+#include "mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMArmNeonDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMArmSVEDialect.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMAVX512/LLVMAVX512ToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMArmNeon/LLVMArmNeonToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMArmSVE/LLVMArmSVEToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/ModuleTranslation.h"
#include "mlir/Translation.h"
@@ -26,14 +36,14 @@
using namespace mlir;
std::unique_ptr<llvm::Module>
-mlir::translateModuleToLLVMIR(ModuleOp m, llvm::LLVMContext &llvmContext,
+mlir::translateModuleToLLVMIR(Operation *op, llvm::LLVMContext &llvmContext,
StringRef name) {
auto llvmModule =
- LLVM::ModuleTranslation::translateModule<>(m, llvmContext, name);
+ LLVM::ModuleTranslation::translateModule<>(op, llvmContext, name);
if (!llvmModule)
- emitError(m.getLoc(), "Fail to convert MLIR to LLVM IR");
+ emitError(op->getLoc(), "Fail to convert MLIR to LLVM IR");
else if (verifyModule(*llvmModule))
- emitError(m.getLoc(), "LLVM IR fails to verify");
+ emitError(op->getLoc(), "LLVM IR fails to verify");
return llvmModule;
}
@@ -70,9 +80,24 @@ void registerToLLVMIRTranslation() {
return success();
},
[](DialectRegistry ®istry) {
- registry.insert<omp::OpenMPDialect>();
+ registry.insert<omp::OpenMPDialect, LLVM::LLVMAVX512Dialect,
+ LLVM::LLVMArmSVEDialect, LLVM::LLVMArmNeonDialect,
+ NVVM::NVVMDialect, ROCDL::ROCDLDialect>();
registry.addDialectInterface<omp::OpenMPDialect,
OpenMPDialectLLVMIRTranslationInterface>();
+ registry
+ .addDialectInterface<LLVM::LLVMAVX512Dialect,
+ LLVMAVX512DialectLLVMIRTranslationInterface>();
+ registry.addDialectInterface<
+ LLVM::LLVMArmNeonDialect,
+ LLVMArmNeonDialectLLVMIRTranslationInterface>();
+ registry
+ .addDialectInterface<LLVM::LLVMArmSVEDialect,
+ LLVMArmSVEDialectLLVMIRTranslationInterface>();
+ registry.addDialectInterface<NVVM::NVVMDialect,
+ NVVMDialectLLVMIRTranslationInterface>();
+ registry.addDialectInterface<ROCDL::ROCDLDialect,
+ ROCDLDialectLLVMIRTranslationInterface>();
registerLLVMDialectTranslation(registry);
});
}
diff --git a/mlir/lib/Target/LLVMIR/ConvertToNVVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToNVVMIR.cpp
deleted file mode 100644
index 7aee913a27d7..000000000000
--- a/mlir/lib/Target/LLVMIR/ConvertToNVVMIR.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-//===- ConvertToNVVMIR.cpp - MLIR to LLVM IR conversion -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a translation between the MLIR LLVM + NVVM dialects and
-// LLVM IR with NVVM intrinsics and metadata.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Target/NVVMIR.h"
-
-#include "mlir/Dialect/GPU/GPUDialect.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/Target/LLVMIR.h"
-#include "mlir/Target/LLVMIR/ModuleTranslation.h"
-#include "mlir/Translation.h"
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/IR/IntrinsicsNVPTX.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/ToolOutputFile.h"
-
-using namespace mlir;
-
-static llvm::Value *createIntrinsicCall(llvm::IRBuilder<> &builder,
- llvm::Intrinsic::ID intrinsic,
- ArrayRef<llvm::Value *> args = {}) {
- llvm::Module *module = builder.GetInsertBlock()->getModule();
- llvm::Function *fn = llvm::Intrinsic::getDeclaration(module, intrinsic);
- return builder.CreateCall(fn, args);
-}
-
-static llvm::Intrinsic::ID getShflBflyIntrinsicId(llvm::Type *resultType,
- bool withPredicate) {
- if (withPredicate) {
- resultType = cast<llvm::StructType>(resultType)->getElementType(0);
- return resultType->isFloatTy() ? llvm::Intrinsic::nvvm_shfl_sync_bfly_f32p
- : llvm::Intrinsic::nvvm_shfl_sync_bfly_i32p;
- }
- return resultType->isFloatTy() ? llvm::Intrinsic::nvvm_shfl_sync_bfly_f32
- : llvm::Intrinsic::nvvm_shfl_sync_bfly_i32;
-}
-
-namespace {
-class ModuleTranslation : public LLVM::ModuleTranslation {
-public:
- using LLVM::ModuleTranslation::ModuleTranslation;
-
-protected:
- LogicalResult convertOperation(Operation &opInst,
- llvm::IRBuilder<> &builder) override {
-
-#include "mlir/Dialect/LLVMIR/NVVMConversions.inc"
-
- return LLVM::ModuleTranslation::convertOperation(opInst, builder);
- }
-
- /// Allow access to the constructor.
- friend LLVM::ModuleTranslation;
-};
-} // namespace
-
-std::unique_ptr<llvm::Module>
-mlir::translateModuleToNVVMIR(Operation *m, llvm::LLVMContext &llvmContext,
- StringRef name) {
- // Register the translation to LLVM IR if nobody else did before. This may
- // happen if this translation is called inside a pass pipeline that converts
- // GPU dialects to binary blobs without translating the rest of the code.
- registerLLVMDialectTranslation(*m->getContext());
-
- auto llvmModule = LLVM::ModuleTranslation::translateModule<ModuleTranslation>(
- m, llvmContext, name);
- if (!llvmModule)
- return llvmModule;
-
- // Insert the nvvm.annotations kernel so that the NVVM backend recognizes the
- // function as a kernel.
- for (auto func :
- ModuleTranslation::getModuleBody(m).getOps<LLVM::LLVMFuncOp>()) {
- if (!func->getAttrOfType<UnitAttr>(
- NVVM::NVVMDialect::getKernelFuncAttrName()))
- continue;
-
- auto *llvmFunc = llvmModule->getFunction(func.getName());
-
- llvm::Metadata *llvmMetadata[] = {
- llvm::ValueAsMetadata::get(llvmFunc),
- llvm::MDString::get(llvmModule->getContext(), "kernel"),
- llvm::ValueAsMetadata::get(llvm::ConstantInt::get(
- llvm::Type::getInt32Ty(llvmModule->getContext()), 1))};
- llvm::MDNode *llvmMetadataNode =
- llvm::MDNode::get(llvmModule->getContext(), llvmMetadata);
- llvmModule->getOrInsertNamedMetadata("nvvm.annotations")
- ->addOperand(llvmMetadataNode);
- }
-
- return llvmModule;
-}
-
-namespace mlir {
-void registerToNVVMIRTranslation() {
- TranslateFromMLIRRegistration registration(
- "mlir-to-nvvmir",
- [](ModuleOp module, raw_ostream &output) {
- llvm::LLVMContext llvmContext;
- auto llvmModule = mlir::translateModuleToNVVMIR(module, llvmContext);
- if (!llvmModule)
</cut>
Progress:
* UM-2 [QEMU upstream maintainership]
+ We needed an rc4 (as usual)
+ Tried to work through some of my code review patchlog, notably
some big alignment-related series from RTH
+ Sent patchseries for some small things:
+ implement last few bits of HSTR trap-to-hypervisor functionality
+ actually take an exception if PSTATE.IL gets set
+ don't assert if user asks for both an EL3 guest CPU and KVM
* QEMU-406 [QEMU support for MVE (M-profile Vector Extension; Helium)]
+ worked through rth's code review comments for fp insn patches
these are now ready to send out once QEMU makes its 6.1 release
and the previous slice of reviewed patches can get into the tree
-- PMM
Successfully identified regression in *llvm* in CI configuration tcwg_bmk_llvm_tk1/llvm-release-arm-spec2k6-O3_LTO. So far, this commit has regressed CI configurations:
- tcwg_bmk_llvm_tk1/llvm-release-arm-spec2k6-O3_LTO
Culprit:
<cut>
commit 310b35304cdf5a230c042904655583c5532d3e91
Author: Rong Xu <xur(a)google.com>
Date: Tue Feb 16 10:53:38 2021 -0800
[SampleFDO][NFC] Refactor SampleProfile.cpp
Refactor SampleProfile.cpp to use the core code in CodeGen.
The main changes are:
(1) Move SampleProfileLoaderBaseImpl class to a header file.
(2) Split SampleCoverageTracker to a head file and a cpp file.
(3) Move the common codes (common options and callsiteIsHot())
to the common cpp file.
Differential Revision: https://reviews.llvm.org/D96455
</cut>
Results regressed to (for first_bad == 310b35304cdf5a230c042904655583c5532d3e91)
# reset_artifacts:
-10
# build_abe binutils:
-9
# build_abe stage1 -- --set gcc_override_configure=--with-mode=arm --set gcc_override_configure=--disable-libsanitizer:
-8
# build_abe linux:
-7
# build_abe glibc:
-6
# build_abe stage2 -- --set gcc_override_configure=--with-mode=arm --set gcc_override_configure=--disable-libsanitizer:
-5
# build_llvm true:
-3
# true:
0
# benchmark -- -O3_LTO_marm artifacts/build-310b35304cdf5a230c042904655583c5532d3e91/results_id:
1
# 482.sphinx3,sphinx_livepretend_base.default regressed by 106
from (for last_good == cddc53ef088b68586094c9841a76b41bee3994a4)
# reset_artifacts:
-10
# build_abe binutils:
-9
# build_abe stage1 -- --set gcc_override_configure=--with-mode=arm --set gcc_override_configure=--disable-libsanitizer:
-8
# build_abe linux:
-7
# build_abe glibc:
-6
# build_abe stage2 -- --set gcc_override_configure=--with-mode=arm --set gcc_override_configure=--disable-libsanitizer:
-5
# build_llvm true:
-3
# true:
0
# benchmark -- -O3_LTO_marm artifacts/build-cddc53ef088b68586094c9841a76b41bee3994a4/results_id:
1
Artifacts of last_good build: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release…
Results ID of last_good: tk1_32/tcwg_bmk_llvm_tk1/bisect-llvm-release-arm-spec2k6-O3_LTO/3917
Artifacts of first_bad build: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release…
Results ID of first_bad: tk1_32/tcwg_bmk_llvm_tk1/bisect-llvm-release-arm-spec2k6-O3_LTO/3930
Build top page/logs: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release…
Configuration details:
Reproduce builds:
<cut>
mkdir investigate-llvm-310b35304cdf5a230c042904655583c5532d3e91
cd investigate-llvm-310b35304cdf5a230c042904655583c5532d3e91
git clone https://git.linaro.org/toolchain/jenkins-scripts
mkdir -p artifacts/manifests
curl -o artifacts/manifests/build-baseline.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release… --fail
curl -o artifacts/manifests/build-parameters.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release… --fail
curl -o artifacts/test.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release… --fail
chmod +x artifacts/test.sh
# Reproduce the baseline build (build all pre-requisites)
./jenkins-scripts/tcwg_bmk-build.sh @@ artifacts/manifests/build-baseline.sh
# Save baseline build state (which is then restored in artifacts/test.sh)
mkdir -p ./bisect
rsync -a --del --delete-excluded --exclude /bisect/ --exclude /artifacts/ --exclude /llvm/ ./ ./bisect/baseline/
cd llvm
# Reproduce first_bad build
git checkout --detach 310b35304cdf5a230c042904655583c5532d3e91
../artifacts/test.sh
# Reproduce last_good build
git checkout --detach cddc53ef088b68586094c9841a76b41bee3994a4
../artifacts/test.sh
cd ..
</cut>
History of pending regressions and results: https://git.linaro.org/toolchain/ci/base-artifacts.git/log/?h=linaro-local/…
Artifacts: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release…
Build log: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tk1-llvm-release…
Full commit (up to 1000 lines):
<cut>
commit 310b35304cdf5a230c042904655583c5532d3e91
Author: Rong Xu <xur(a)google.com>
Date: Tue Feb 16 10:53:38 2021 -0800
[SampleFDO][NFC] Refactor SampleProfile.cpp
Refactor SampleProfile.cpp to use the core code in CodeGen.
The main changes are:
(1) Move SampleProfileLoaderBaseImpl class to a header file.
(2) Split SampleCoverageTracker to a head file and a cpp file.
(3) Move the common codes (common options and callsiteIsHot())
to the common cpp file.
Differential Revision: https://reviews.llvm.org/D96455
---
.../llvm/ProfileData/SampleProfileLoaderBaseImpl.h | 862 +++++++++++++++++
.../llvm/ProfileData/SampleProfileLoaderBaseUtil.h | 97 ++
llvm/lib/ProfileData/CMakeLists.txt | 1 +
.../ProfileData/SampleProfileLoaderBaseUtil.cpp | 192 ++++
llvm/lib/Transforms/IPO/SampleProfile.cpp | 1001 +-------------------
5 files changed, 1161 insertions(+), 992 deletions(-)
diff --git a/llvm/include/llvm/ProfileData/SampleProfileLoaderBaseImpl.h b/llvm/include/llvm/ProfileData/SampleProfileLoaderBaseImpl.h
new file mode 100644
index 000000000000..f02bacb6edc3
--- /dev/null
+++ b/llvm/include/llvm/ProfileData/SampleProfileLoaderBaseImpl.h
@@ -0,0 +1,862 @@
+////===- SampleProfileLoadBaseImpl.h - Profile loader base impl --*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file provides the interface for the sampled PGO profile loader base
+/// implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_IPO_SAMPLEPROFILELOADERIMPL_H
+#define LLVM_TRANSFORMS_IPO_SAMPLEPROFILELOADERIMPL_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/ProfileData/SampleProfReader.h"
+#include "llvm/ProfileData/SampleProfileLoaderBaseUtil.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+using namespace llvm;
+using namespace sampleprof;
+using ProfileCount = Function::ProfileCount;
+namespace sampleprofutil {
+bool callsiteIsHot(const SampleCoverageTracker *CT,
+ const FunctionSamples *CallsiteFS, ProfileSummaryInfo *PSI,
+ bool ProfAccForSymsInList);
+} // namespace sampleprofutil
+using namespace sampleprofutil;
+
+#define DEBUG_TYPE "sample-profile-impl"
+
+using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
+using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
+using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
+using EdgeWeightMap = DenseMap<Edge, uint64_t>;
+using BlockEdgeMap =
+ DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;
+
+extern cl::opt<unsigned> SampleProfileMaxPropagateIterations;
+extern cl::opt<unsigned> SampleProfileRecordCoverage;
+extern cl::opt<unsigned> SampleProfileSampleCoverage;
+extern cl::opt<bool> NoWarnSampleUnused;
+
+class SampleProfileLoaderBaseImpl {
+public:
+ SampleProfileLoaderBaseImpl(std::string Name) : Filename(Name) {}
+ void dump() { Reader->dump(); }
+
+protected:
+ friend class SampleCoverageTracker;
+
+ unsigned getFunctionLoc(Function &F);
+ virtual ErrorOr<uint64_t> getInstWeight(const Instruction &Inst);
+ ErrorOr<uint64_t> getInstWeightImpl(const Instruction &Inst);
+ ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
+ mutable DenseMap<const DILocation *, const FunctionSamples *>
+ DILocation2SampleMap;
+ virtual const FunctionSamples *
+ findFunctionSamples(const Instruction &I) const;
+ void printEdgeWeight(raw_ostream &OS, Edge E);
+ void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
+ void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
+ bool computeBlockWeights(Function &F);
+ void findEquivalenceClasses(Function &F);
+ template <bool IsPostDom>
+ void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
+ DominatorTreeBase<BasicBlock, IsPostDom> *DomTree);
+
+ void propagateWeights(Function &F);
+ uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
+ void buildEdges(Function &F);
+ bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
+ void clearFunctionData();
+ void computeDominanceAndLoopInfo(Function &F);
+ bool
+ computeAndPropagateWeights(Function &F,
+ const DenseSet<GlobalValue::GUID> &InlinedGUIDs);
+ void emitCoverageRemarks(Function &F);
+
+ /// Map basic blocks to their computed weights.
+ ///
+ /// The weight of a basic block is defined to be the maximum
+ /// of all the instruction weights in that block.
+ BlockWeightMap BlockWeights;
+
+ /// Map edges to their computed weights.
+ ///
+ /// Edge weights are computed by propagating basic block weights in
+ /// SampleProfile::propagateWeights.
+ EdgeWeightMap EdgeWeights;
+
+ /// Set of visited blocks during propagation.
+ SmallPtrSet<const BasicBlock *, 32> VisitedBlocks;
+
+ /// Set of visited edges during propagation.
+ SmallSet<Edge, 32> VisitedEdges;
+
+ /// Equivalence classes for block weights.
+ ///
+ /// Two blocks BB1 and BB2 are in the same equivalence class if they
+ /// dominate and post-dominate each other, and they are in the same loop
+ /// nest. When this happens, the two blocks are guaranteed to execute
+ /// the same number of times.
+ EquivalenceClassMap EquivalenceClass;
+
+ /// Dominance, post-dominance and loop information.
+ std::unique_ptr<DominatorTree> DT;
+ std::unique_ptr<PostDominatorTree> PDT;
+ std::unique_ptr<LoopInfo> LI;
+
+ /// Predecessors for each basic block in the CFG.
+ BlockEdgeMap Predecessors;
+
+ /// Successors for each basic block in the CFG.
+ BlockEdgeMap Successors;
+
+ /// Profile coverage tracker.
+ SampleCoverageTracker CoverageTracker;
+
+ /// Profile reader object.
+ std::unique_ptr<SampleProfileReader> Reader;
+
+ /// Samples collected for the body of this function.
+ FunctionSamples *Samples = nullptr;
+
+ /// Name of the profile file to load.
+ std::string Filename;
+
+ /// Profile Summary Info computed from sample profile.
+ ProfileSummaryInfo *PSI = nullptr;
+
+ /// Optimization Remark Emitter used to emit diagnostic remarks.
+ OptimizationRemarkEmitter *ORE = nullptr;
+};
+
+/// Clear all the per-function data used to load samples and propagate weights.
+void SampleProfileLoaderBaseImpl::clearFunctionData() {
+ BlockWeights.clear();
+ EdgeWeights.clear();
+ VisitedBlocks.clear();
+ VisitedEdges.clear();
+ EquivalenceClass.clear();
+ DT = nullptr;
+ PDT = nullptr;
+ LI = nullptr;
+ Predecessors.clear();
+ Successors.clear();
+ CoverageTracker.clear();
+}
+
+#ifndef NDEBUG
+/// Print the weight of edge \p E on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param E Edge to print.
+void SampleProfileLoaderBaseImpl::printEdgeWeight(raw_ostream &OS, Edge E) {
+ OS << "weight[" << E.first->getName() << "->" << E.second->getName()
+ << "]: " << EdgeWeights[E] << "\n";
+}
+
+/// Print the equivalence class of block \p BB on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param BB Block to print.
+void SampleProfileLoaderBaseImpl::printBlockEquivalence(raw_ostream &OS,
+ const BasicBlock *BB) {
+ const BasicBlock *Equiv = EquivalenceClass[BB];
+ OS << "equivalence[" << BB->getName()
+ << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
+}
+
+/// Print the weight of block \p BB on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param BB Block to print.
+void SampleProfileLoaderBaseImpl::printBlockWeight(raw_ostream &OS,
+ const BasicBlock *BB) const {
+ const auto &I = BlockWeights.find(BB);
+ uint64_t W = (I == BlockWeights.end() ? 0 : I->second);
+ OS << "weight[" << BB->getName() << "]: " << W << "\n";
+}
+#endif
+
+/// Get the weight for an instruction.
+///
+/// The "weight" of an instruction \p Inst is the number of samples
+/// collected on that instruction at runtime. To retrieve it, we
+/// need to compute the line number of \p Inst relative to the start of its
+/// function. We use HeaderLineno to compute the offset. We then
+/// look up the samples collected for \p Inst using BodySamples.
+///
+/// \param Inst Instruction to query.
+///
+/// \returns the weight of \p Inst.
+ErrorOr<uint64_t>
+SampleProfileLoaderBaseImpl::getInstWeight(const Instruction &Inst) {
+ return getInstWeightImpl(Inst);
+}
+
+ErrorOr<uint64_t>
+SampleProfileLoaderBaseImpl::getInstWeightImpl(const Instruction &Inst) {
+ const FunctionSamples *FS = findFunctionSamples(Inst);
+ if (!FS)
+ return std::error_code();
+
+ const DebugLoc &DLoc = Inst.getDebugLoc();
+ if (!DLoc)
+ return std::error_code();
+
+ const DILocation *DIL = DLoc;
+ uint32_t LineOffset = FunctionSamples::getOffset(DIL);
+ uint32_t Discriminator = DIL->getBaseDiscriminator();
+ ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
+ if (R) {
+ bool FirstMark =
+ CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
+ if (FirstMark) {
+ ORE->emit([&]() {
+ OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst);
+ Remark << "Applied " << ore::NV("NumSamples", *R);
+ Remark << " samples from profile (offset: ";
+ Remark << ore::NV("LineOffset", LineOffset);
+ if (Discriminator) {
+ Remark << ".";
+ Remark << ore::NV("Discriminator", Discriminator);
+ }
+ Remark << ")";
+ return Remark;
+ });
+ }
+ LLVM_DEBUG(dbgs() << " " << DLoc.getLine() << "."
+ << DIL->getBaseDiscriminator() << ":" << Inst
+ << " (line offset: " << LineOffset << "."
+ << DIL->getBaseDiscriminator() << " - weight: " << R.get()
+ << ")\n");
+ }
+ return R;
+}
+
+/// Compute the weight of a basic block.
+///
+/// The weight of basic block \p BB is the maximum weight of all the
+/// instructions in BB.
+///
+/// \param BB The basic block to query.
+///
+/// \returns the weight for \p BB.
+ErrorOr<uint64_t>
+SampleProfileLoaderBaseImpl::getBlockWeight(const BasicBlock *BB) {
+ uint64_t Max = 0;
+ bool HasWeight = false;
+ for (auto &I : BB->getInstList()) {
+ const ErrorOr<uint64_t> &R = getInstWeight(I);
+ if (R) {
+ Max = std::max(Max, R.get());
+ HasWeight = true;
+ }
+ }
+ return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code();
+}
+
+/// Compute and store the weights of every basic block.
+///
+/// This populates the BlockWeights map by computing
+/// the weights of every basic block in the CFG.
+///
+/// \param F The function to query.
+bool SampleProfileLoaderBaseImpl::computeBlockWeights(Function &F) {
+ bool Changed = false;
+ LLVM_DEBUG(dbgs() << "Block weights\n");
+ for (const auto &BB : F) {
+ ErrorOr<uint64_t> Weight = getBlockWeight(&BB);
+ if (Weight) {
+ BlockWeights[&BB] = Weight.get();
+ VisitedBlocks.insert(&BB);
+ Changed = true;
+ }
+ LLVM_DEBUG(printBlockWeight(dbgs(), &BB));
+ }
+
+ return Changed;
+}
+
+/// Get the FunctionSamples for an instruction.
+///
+/// The FunctionSamples of an instruction \p Inst is the inlined instance
+/// in which that instruction is coming from. We traverse the inline stack
+/// of that instruction, and match it with the tree nodes in the profile.
+///
+/// \param Inst Instruction to query.
+///
+/// \returns the FunctionSamples pointer to the inlined instance.
+const FunctionSamples *SampleProfileLoaderBaseImpl::findFunctionSamples(
+ const Instruction &Inst) const {
+ const DILocation *DIL = Inst.getDebugLoc();
+ if (!DIL)
+ return Samples;
+
+ auto it = DILocation2SampleMap.try_emplace(DIL, nullptr);
+ if (it.second) {
+ it.first->second = Samples->findFunctionSamples(DIL, Reader->getRemapper());
+ }
+ return it.first->second;
+}
+
+/// Find equivalence classes for the given block.
+///
+/// This finds all the blocks that are guaranteed to execute the same
+/// number of times as \p BB1. To do this, it traverses all the
+/// descendants of \p BB1 in the dominator or post-dominator tree.
+///
+/// A block BB2 will be in the same equivalence class as \p BB1 if
+/// the following holds:
+///
+/// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2
+/// is a descendant of \p BB1 in the dominator tree, then BB2 should
+/// dominate BB1 in the post-dominator tree.
+///
+/// 2- Both BB2 and \p BB1 must be in the same loop.
+///
+/// For every block BB2 that meets those two requirements, we set BB2's
+/// equivalence class to \p BB1.
+///
+/// \param BB1 Block to check.
+/// \param Descendants Descendants of \p BB1 in either the dom or pdom tree.
+/// \param DomTree Opposite dominator tree. If \p Descendants is filled
+/// with blocks from \p BB1's dominator tree, then
+/// this is the post-dominator tree, and vice versa.
+template <bool IsPostDom>
+void SampleProfileLoaderBaseImpl::findEquivalencesFor(
+ BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
+ DominatorTreeBase<BasicBlock, IsPostDom> *DomTree) {
+ const BasicBlock *EC = EquivalenceClass[BB1];
+ uint64_t Weight = BlockWeights[EC];
+ for (const auto *BB2 : Descendants) {
+ bool IsDomParent = DomTree->dominates(BB2, BB1);
+ bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
+ if (BB1 != BB2 && IsDomParent && IsInSameLoop) {
+ EquivalenceClass[BB2] = EC;
+ // If BB2 is visited, then the entire EC should be marked as visited.
+ if (VisitedBlocks.count(BB2)) {
+ VisitedBlocks.insert(EC);
+ }
+
+ // If BB2 is heavier than BB1, make BB2 have the same weight
+ // as BB1.
+ //
+ // Note that we don't worry about the opposite situation here
+ // (when BB2 is lighter than BB1). We will deal with this
+ // during the propagation phase. Right now, we just want to
+ // make sure that BB1 has the largest weight of all the
+ // members of its equivalence set.
+ Weight = std::max(Weight, BlockWeights[BB2]);
+ }
+ }
+ if (EC == &EC->getParent()->getEntryBlock()) {
+ BlockWeights[EC] = Samples->getHeadSamples() + 1;
+ } else {
+ BlockWeights[EC] = Weight;
+ }
+}
+
+/// Find equivalence classes.
+///
+/// Since samples may be missing from blocks, we can fill in the gaps by setting
+/// the weights of all the blocks in the same equivalence class to the same
+/// weight. To compute the concept of equivalence, we use dominance and loop
+/// information. Two blocks B1 and B2 are in the same equivalence class if B1
+/// dominates B2, B2 post-dominates B1 and both are in the same loop.
+///
+/// \param F The function to query.
+void SampleProfileLoaderBaseImpl::findEquivalenceClasses(Function &F) {
+ SmallVector<BasicBlock *, 8> DominatedBBs;
+ LLVM_DEBUG(dbgs() << "\nBlock equivalence classes\n");
+ // Find equivalence sets based on dominance and post-dominance information.
+ for (auto &BB : F) {
+ BasicBlock *BB1 = &BB;
+
+ // Compute BB1's equivalence class once.
+ if (EquivalenceClass.count(BB1)) {
+ LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
+ continue;
+ }
+
+ // By default, blocks are in their own equivalence class.
+ EquivalenceClass[BB1] = BB1;
+
+ // Traverse all the blocks dominated by BB1. We are looking for
+ // every basic block BB2 such that:
+ //
+ // 1- BB1 dominates BB2.
+ // 2- BB2 post-dominates BB1.
+ // 3- BB1 and BB2 are in the same loop nest.
+ //
+ // If all those conditions hold, it means that BB2 is executed
+ // as many times as BB1, so they are placed in the same equivalence
+ // class by making BB2's equivalence class be BB1.
+ DominatedBBs.clear();
+ DT->getDescendants(BB1, DominatedBBs);
+ findEquivalencesFor(BB1, DominatedBBs, PDT.get());
+
+ LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
+ }
+
+ // Assign weights to equivalence classes.
+ //
+ // All the basic blocks in the same equivalence class will execute
+ // the same number of times. Since we know that the head block in
+ // each equivalence class has the largest weight, assign that weight
+ // to all the blocks in that equivalence class.
+ LLVM_DEBUG(
+ dbgs() << "\nAssign the same weight to all blocks in the same class\n");
+ for (auto &BI : F) {
+ const BasicBlock *BB = &BI;
+ const BasicBlock *EquivBB = EquivalenceClass[BB];
+ if (BB != EquivBB)
+ BlockWeights[BB] = BlockWeights[EquivBB];
+ LLVM_DEBUG(printBlockWeight(dbgs(), BB));
+ }
+}
+
+/// Visit the given edge to decide if it has a valid weight.
+///
+/// If \p E has not been visited before, we copy to \p UnknownEdge
+/// and increment the count of unknown edges.
+///
+/// \param E Edge to visit.
+/// \param NumUnknownEdges Current number of unknown edges.
+/// \param UnknownEdge Set if E has not been visited before.
+///
+/// \returns E's weight, if known. Otherwise, return 0.
+uint64_t SampleProfileLoaderBaseImpl::visitEdge(Edge E,
+ unsigned *NumUnknownEdges,
+ Edge *UnknownEdge) {
+ if (!VisitedEdges.count(E)) {
+ (*NumUnknownEdges)++;
+ *UnknownEdge = E;
+ return 0;
+ }
+
+ return EdgeWeights[E];
+}
+
+/// Propagate weights through incoming/outgoing edges.
+///
+/// If the weight of a basic block is known, and there is only one edge
+/// with an unknown weight, we can calculate the weight of that edge.
+///
+/// Similarly, if all the edges have a known count, we can calculate the
+/// count of the basic block, if needed.
+///
+/// \param F Function to process.
+/// \param UpdateBlockCount Whether we should update basic block counts that
+/// has already been annotated.
+///
+/// \returns True if new weights were assigned to edges or blocks.
+bool SampleProfileLoaderBaseImpl::propagateThroughEdges(Function &F,
+ bool UpdateBlockCount) {
+ bool Changed = false;
+ LLVM_DEBUG(dbgs() << "\nPropagation through edges\n");
+ for (const auto &BI : F) {
+ const BasicBlock *BB = &BI;
+ const BasicBlock *EC = EquivalenceClass[BB];
+
+ // Visit all the predecessor and successor edges to determine
+ // which ones have a weight assigned already. Note that it doesn't
+ // matter that we only keep track of a single unknown edge. The
+ // only case we are interested in handling is when only a single
+ // edge is unknown (see setEdgeOrBlockWeight).
+ for (unsigned i = 0; i < 2; i++) {
+ uint64_t TotalWeight = 0;
+ unsigned NumUnknownEdges = 0, NumTotalEdges = 0;
+ Edge UnknownEdge, SelfReferentialEdge, SingleEdge;
+
+ if (i == 0) {
+ // First, visit all predecessor edges.
+ NumTotalEdges = Predecessors[BB].size();
+ for (auto *Pred : Predecessors[BB]) {
+ Edge E = std::make_pair(Pred, BB);
+ TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
+ if (E.first == E.second)
+ SelfReferentialEdge = E;
+ }
+ if (NumTotalEdges == 1) {
+ SingleEdge = std::make_pair(Predecessors[BB][0], BB);
+ }
+ } else {
+ // On the second round, visit all successor edges.
+ NumTotalEdges = Successors[BB].size();
+ for (auto *Succ : Successors[BB]) {
+ Edge E = std::make_pair(BB, Succ);
+ TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
+ }
+ if (NumTotalEdges == 1) {
+ SingleEdge = std::make_pair(BB, Successors[BB][0]);
+ }
+ }
+
+ // After visiting all the edges, there are three cases that we
+ // can handle immediately:
+ //
+ // - All the edge weights are known (i.e., NumUnknownEdges == 0).
+ // In this case, we simply check that the sum of all the edges
+ // is the same as BB's weight. If not, we change BB's weight
+ // to match. Additionally, if BB had not been visited before,
+ // we mark it visited.
+ //
+ // - Only one edge is unknown and BB has already been visited.
+ // In this case, we can compute the weight of the edge by
+ // subtracting the total block weight from all the known
+ // edge weights. If the edges weight more than BB, then the
+ // edge of the last remaining edge is set to zero.
+ //
+ // - There exists a self-referential edge and the weight of BB is
+ // known. In this case, this edge can be based on BB's weight.
+ // We add up all the other known edges and set the weight on
+ // the self-referential edge as we did in the previous case.
+ //
+ // In any other case, we must continue iterating. Eventually,
+ // all edges will get a weight, or iteration will stop when
+ // it reaches SampleProfileMaxPropagateIterations.
+ if (NumUnknownEdges <= 1) {
+ uint64_t &BBWeight = BlockWeights[EC];
+ if (NumUnknownEdges == 0) {
+ if (!VisitedBlocks.count(EC)) {
+ // If we already know the weight of all edges, the weight of the
+ // basic block can be computed. It should be no larger than the sum
+ // of all edge weights.
+ if (TotalWeight > BBWeight) {
+ BBWeight = TotalWeight;
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "All edge weights for " << BB->getName()
+ << " known. Set weight for block: ";
+ printBlockWeight(dbgs(), BB););
+ }
+ } else if (NumTotalEdges == 1 &&
+ EdgeWeights[SingleEdge] < BlockWeights[EC]) {
+ // If there is only one edge for the visited basic block, use the
+ // block weight to adjust edge weight if edge weight is smaller.
+ EdgeWeights[SingleEdge] = BlockWeights[EC];
+ Changed = true;
+ }
+ } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) {
+ // If there is a single unknown edge and the block has been
+ // visited, then we can compute E's weight.
+ if (BBWeight >= TotalWeight)
+ EdgeWeights[UnknownEdge] = BBWeight - TotalWeight;
+ else
+ EdgeWeights[UnknownEdge] = 0;
+ const BasicBlock *OtherEC;
+ if (i == 0)
+ OtherEC = EquivalenceClass[UnknownEdge.first];
+ else
+ OtherEC = EquivalenceClass[UnknownEdge.second];
+ // Edge weights should never exceed the BB weights it connects.
+ if (VisitedBlocks.count(OtherEC) &&
+ EdgeWeights[UnknownEdge] > BlockWeights[OtherEC])
+ EdgeWeights[UnknownEdge] = BlockWeights[OtherEC];
+ VisitedEdges.insert(UnknownEdge);
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Set weight for edge: ";
+ printEdgeWeight(dbgs(), UnknownEdge));
+ }
+ } else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) {
+ // If a block Weights 0, all its in/out edges should weight 0.
+ if (i == 0) {
+ for (auto *Pred : Predecessors[BB]) {
+ Edge E = std::make_pair(Pred, BB);
+ EdgeWeights[E] = 0;
+ VisitedEdges.insert(E);
+ }
+ } else {
+ for (auto *Succ : Successors[BB]) {
+ Edge E = std::make_pair(BB, Succ);
+ EdgeWeights[E] = 0;
+ VisitedEdges.insert(E);
+ }
+ }
+ } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) {
+ uint64_t &BBWeight = BlockWeights[BB];
+ // We have a self-referential edge and the weight of BB is known.
+ if (BBWeight >= TotalWeight)
+ EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight;
+ else
+ EdgeWeights[SelfReferentialEdge] = 0;
+ VisitedEdges.insert(SelfReferentialEdge);
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Set self-referential edge weight to: ";
+ printEdgeWeight(dbgs(), SelfReferentialEdge));
+ }
+ if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) {
+ BlockWeights[EC] = TotalWeight;
+ VisitedBlocks.insert(EC);
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+/// Build in/out edge lists for each basic block in the CFG.
+///
+/// We are interested in unique edges. If a block B1 has multiple
+/// edges to another block B2, we only add a single B1->B2 edge.
+void SampleProfileLoaderBaseImpl::buildEdges(Function &F) {
+ for (auto &BI : F) {
+ BasicBlock *B1 = &BI;
+
+ // Add predecessors for B1.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+ if (!Predecessors[B1].empty())
+ llvm_unreachable("Found a stale predecessors list in a basic block.");
+ for (BasicBlock *B2 : predecessors(B1))
+ if (Visited.insert(B2).second)
+ Predecessors[B1].push_back(B2);
+
+ // Add successors for B1.
+ Visited.clear();
+ if (!Successors[B1].empty())
+ llvm_unreachable("Found a stale successors list in a basic block.");
+ for (BasicBlock *B2 : successors(B1))
+ if (Visited.insert(B2).second)
+ Successors[B1].push_back(B2);
+ }
+}
+
+/// Propagate weights into edges
+///
+/// The following rules are applied to every block BB in the CFG:
+///
+/// - If BB has a single predecessor/successor, then the weight
+/// of that edge is the weight of the block.
+///
+/// - If all incoming or outgoing edges are known except one, and the
+/// weight of the block is already known, the weight of the unknown
+/// edge will be the weight of the block minus the sum of all the known
+/// edges. If the sum of all the known edges is larger than BB's weight,
+/// we set the unknown edge weight to zero.
+///
+/// - If there is a self-referential edge, and the weight of the block is
+/// known, the weight for that edge is set to the weight of the block
+/// minus the weight of the other incoming edges to that block (if
+/// known).
+void SampleProfileLoaderBaseImpl::propagateWeights(Function &F) {
+ bool Changed = true;
+ unsigned I = 0;
+
+ // If BB weight is larger than its corresponding loop's header BB weight,
+ // use the BB weight to replace the loop header BB weight.
+ for (auto &BI : F) {
+ BasicBlock *BB = &BI;
+ Loop *L = LI->getLoopFor(BB);
+ if (!L) {
+ continue;
+ }
+ BasicBlock *Header = L->getHeader();
+ if (Header && BlockWeights[BB] > BlockWeights[Header]) {
+ BlockWeights[Header] = BlockWeights[BB];
+ }
+ }
+
+ // Before propagation starts, build, for each block, a list of
+ // unique predecessors and successors. This is necessary to handle
+ // identical edges in multiway branches. Since we visit all blocks and all
+ // edges of the CFG, it is cleaner to build these lists once at the start
+ // of the pass.
+ buildEdges(F);
+
+ // Propagate until we converge or we go past the iteration limit.
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, false);
+ }
+
+ // The first propagation propagates BB counts from annotated BBs to unknown
+ // BBs. The 2nd propagation pass resets edges weights, and use all BB weights
+ // to propagate edge weights.
+ VisitedEdges.clear();
+ Changed = true;
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, false);
+ }
+
+ // The 3rd propagation pass allows adjust annotated BB weights that are
+ // obviously wrong.
+ Changed = true;
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, true);
+ }
+}
+
+/// Generate branch weight metadata for all branches in \p F.
+///
+/// Branch weights are computed out of instruction samples using a
+/// propagation heuristic. Propagation proceeds in 3 phases:
+///
+/// 1- Assignment of block weights. All the basic blocks in the function
+/// are initial assigned the same weight as their most frequently
+/// executed instruction.
+///
+/// 2- Creation of equivalence classes. Since samples may be missing from
+/// blocks, we can fill in the gaps by setting the weights of all the
+/// blocks in the same equivalence class to the same weight. To compute
+/// the concept of equivalence, we use dominance and loop information.
+/// Two blocks B1 and B2 are in the same equivalence class if B1
+/// dominates B2, B2 post-dominates B1 and both are in the same loop.
+///
+/// 3- Propagation of block weights into edges. This uses a simple
+/// propagation heuristic. The following rules are applied to every
+/// block BB in the CFG:
+///
+/// - If BB has a single predecessor/successor, then the weight
+/// of that edge is the weight of the block.
+///
+/// - If all the edges are known except one, and the weight of the
+/// block is already known, the weight of the unknown edge will
+/// be the weight of the block minus the sum of all the known
+/// edges. If the sum of all the known edges is larger than BB's weight,
+/// we set the unknown edge weight to zero.
+///
+/// - If there is a self-referential edge, and the weight of the block is
+/// known, the weight for that edge is set to the weight of the block
+/// minus the weight of the other incoming edges to that block (if
+/// known).
+///
+/// Since this propagation is not guaranteed to finalize for every CFG, we
+/// only allow it to proceed for a limited number of iterations (controlled
+/// by -sample-profile-max-propagate-iterations).
+///
+/// FIXME: Try to replace this propagation heuristic with a scheme
+/// that is guaranteed to finalize. A work-list approach similar to
+/// the standard value propagation algorithm used by SSA-CCP might
+/// work here.
+///
+/// \param F The function to query.
+///
+/// \returns true if \p F was modified. Returns false, otherwise.
+bool SampleProfileLoaderBaseImpl::computeAndPropagateWeights(
+ Function &F, const DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
+ bool Changed = (InlinedGUIDs.size() != 0);
+
+ // Compute basic block weights.
+ Changed |= computeBlockWeights(F);
+
+ if (Changed) {
+ // Add an entry count to the function using the samples gathered at the
+ // function entry.
+ // Sets the GUIDs that are inlined in the profiled binary. This is used
+ // for ThinLink to make correct liveness analysis, and also make the IR
+ // match the profiled binary before annotation.
+ F.setEntryCount(
+ ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
+ &InlinedGUIDs);
+
+ // Compute dominance and loop info needed for propagation.
+ computeDominanceAndLoopInfo(F);
+
+ // Find equivalence classes.
+ findEquivalenceClasses(F);
+
+ // Propagate weights to all edges.
+ propagateWeights(F);
+ }
+
+ return Changed;
+}
+
+void SampleProfileLoaderBaseImpl::emitCoverageRemarks(Function &F) {
+ // If coverage checking was requested, compute it now.
+ if (SampleProfileRecordCoverage) {
+ unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI);
+ unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI);
+ unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
+ if (Coverage < SampleProfileRecordCoverage) {
+ F.getContext().diagnose(DiagnosticInfoSampleProfile(
+ F.getSubprogram()->getFilename(), getFunctionLoc(F),
+ Twine(Used) + " of " + Twine(Total) + " available profile records (" +
+ Twine(Coverage) + "%) were applied",
+ DS_Warning));
+ }
+ }
+
+ if (SampleProfileSampleCoverage) {
+ uint64_t Used = CoverageTracker.getTotalUsedSamples();
+ uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI);
+ unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
+ if (Coverage < SampleProfileSampleCoverage) {
+ F.getContext().diagnose(DiagnosticInfoSampleProfile(
+ F.getSubprogram()->getFilename(), getFunctionLoc(F),
+ Twine(Used) + " of " + Twine(Total) + " available profile samples (" +
+ Twine(Coverage) + "%) were applied",
+ DS_Warning));
+ }
+ }
+}
+
+/// Get the line number for the function header.
+///
+/// This looks up function \p F in the current compilation unit and
+/// retrieves the line number where the function is defined. This is
+/// line 0 for all the samples read from the profile file. Every line
+/// number is relative to this line.
+///
+/// \param F Function object to query.
+///
+/// \returns the line number where \p F is defined. If it returns 0,
+/// it means that there is no debug information available for \p F.
+unsigned SampleProfileLoaderBaseImpl::getFunctionLoc(Function &F) {
+ if (DISubprogram *S = F.getSubprogram())
+ return S->getLine();
+
+ if (NoWarnSampleUnused)
+ return 0;
+
+ // If the start of \p F is missing, emit a diagnostic to inform the user
+ // about the missed opportunity.
+ F.getContext().diagnose(DiagnosticInfoSampleProfile(
+ "No debug information found in function " + F.getName() +
+ ": Function profile not used",
+ DS_Warning));
+ return 0;
+}
+
+void SampleProfileLoaderBaseImpl::computeDominanceAndLoopInfo(Function &F) {
+ DT.reset(new DominatorTree);
+ DT->recalculate(F);
+
+ PDT.reset(new PostDominatorTree(F));
+
+ LI.reset(new LoopInfo);
+ LI->analyze(*DT);
+}
+
+#undef DEBUG_TYPE
+
+} // namespace llvm
+#endif // LLVM_TRANSFORMS_IPO_SAMPLEPROFILELOADERIMPL_H
diff --git a/llvm/include/llvm/ProfileData/SampleProfileLoaderBaseUtil.h b/llvm/include/llvm/ProfileData/SampleProfileLoaderBaseUtil.h
new file mode 100644
index 000000000000..37dc8d8187d9
--- /dev/null
+++ b/llvm/include/llvm/ProfileData/SampleProfileLoaderBaseUtil.h
@@ -0,0 +1,97 @@
+////===- SampleProfileLoadBaseUtil.h - Profile loader util func --*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file provides the utility functions for the sampled PGO loader base
+/// implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_IPO_SAMPLEPROFILELOADERUTIL_H
+#define LLVM_TRANSFORMS_IPO_SAMPLEPROFILELOADERUTIL_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/Support/CommandLine.h"
+
+namespace llvm {
+using namespace sampleprof;
+
+extern cl::opt<unsigned> SampleProfileMaxPropagateIterations;
+extern cl::opt<unsigned> SampleProfileRecordCoverage;
+extern cl::opt<unsigned> SampleProfileSampleCoverage;
+extern cl::opt<bool> NoWarnSampleUnused;
+
+namespace sampleprofutil {
+
+class SampleCoverageTracker {
+public:
+ bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
+ uint32_t Discriminator, uint64_t Samples);
+ unsigned computeCoverage(unsigned Used, unsigned Total) const;
+ unsigned countUsedRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
+ unsigned countBodyRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
+ uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
+ uint64_t countBodySamples(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
+
+ void clear() {
+ SampleCoverage.clear();
+ TotalUsedSamples = 0;
+ }
+ void setProfAccForSymsInList(bool V) { ProfAccForSymsInList = V; }
+
+private:
+ using BodySampleCoverageMap = std::map<LineLocation, unsigned>;
+ using FunctionSamplesCoverageMap =
+ DenseMap<const FunctionSamples *, BodySampleCoverageMap>;
+
+ /// Coverage map for sampling records.
+ ///
+ /// This map keeps a record of sampling records that have been matched to
+ /// an IR instruction. This is used to detect some form of staleness in
+ /// profiles (see flag -sample-profile-check-coverage).
+ ///
+ /// Each entry in the map corresponds to a FunctionSamples instance. This is
+ /// another map that counts how many times the sample record at the
+ /// given location has been used.
+ FunctionSamplesCoverageMap SampleCoverage;
+
+ /// Number of samples used from the profile.
+ ///
+ /// When a sampling record is used for the first time, the samples from
+ /// that record are added to this accumulator. Coverage is later computed
+ /// based on the total number of samples available in this function and
+ /// its callsites.
+ ///
+ /// Note that this accumulator tracks samples used from a single function
+ /// and all the inlined callsites. Strictly, we should have a map of counters
+ /// keyed by FunctionSamples pointers, but these stats are cleared after
+ /// every function, so we just need to keep a single counter.
+ uint64_t TotalUsedSamples = 0;
+
+ // For symbol in profile symbol list, whether to regard their profiles
+ // to be accurate. This is passed from the SampleLoader instance.
+ bool ProfAccForSymsInList = false;
+};
+
+/// Return true if the given callsite is hot wrt to hot cutoff threshold.
+bool callsiteIsHot(const FunctionSamples *CallsiteFS, ProfileSummaryInfo *PSI,
+ bool ProfAccForSymsInList);
+
+} // end of namespace sampleprofutil
+} // end of namespace llvm
+
+#endif // LLVM_TRANSFORMS_IPO_SAMPLEPROFILELOADERUTIL_H
diff --git a/llvm/lib/ProfileData/CMakeLists.txt b/llvm/lib/ProfileData/CMakeLists.txt
index 2a377e4d74d3..4125fac918ab 100644
--- a/llvm/lib/ProfileData/CMakeLists.txt
+++ b/llvm/lib/ProfileData/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_component_library(LLVMProfileData
InstrProfWriter.cpp
ProfileSummaryBuilder.cpp
</cut>
Successfully identified regression in *linux* in CI configuration tcwg_kernel/llvm-release-arm-next-allmodconfig. So far, this commit has regressed CI configurations:
- tcwg_kernel/llvm-release-arm-next-allmodconfig
Culprit:
<cut>
commit fad7cd3310db3099f95dd34312c77740fbc455e5
Author: Baokun Li <libaokun1(a)huawei.com>
Date: Wed Aug 4 10:12:12 2021 +0800
nbd: add the check to prevent overflow in __nbd_ioctl()
If user specify a large enough value of NBD blocks option, it may trigger
signed integer overflow which may lead to nbd->config->bytesize becomes a
large or small value, zero in particular.
UBSAN: Undefined behaviour in drivers/block/nbd.c:325:31
signed integer overflow:
1024 * 4611686155866341414 cannot be represented in type 'long long int'
[...]
Call trace:
[...]
handle_overflow+0x188/0x1dc lib/ubsan.c:192
__ubsan_handle_mul_overflow+0x34/0x44 lib/ubsan.c:213
nbd_size_set drivers/block/nbd.c:325 [inline]
__nbd_ioctl drivers/block/nbd.c:1342 [inline]
nbd_ioctl+0x998/0xa10 drivers/block/nbd.c:1395
__blkdev_driver_ioctl block/ioctl.c:311 [inline]
[...]
Although it is not a big deal, still silence the UBSAN by limit
the input value.
Reported-by: Hulk Robot <hulkci(a)huawei.com>
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Link: https://lore.kernel.org/r/20210804021212.990223-1-libaokun1@huawei.com
[axboe: dropped unlikely()]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
</cut>
Results regressed to (for first_bad == fad7cd3310db3099f95dd34312c77740fbc455e5)
# reset_artifacts:
-10
# build_abe binutils:
-9
# build_llvm:
-5
# build_abe qemu:
-2
# linux_n_obj:
21709
# First few build errors in logs:
# 00:07:12 make[1]: *** [modules-only.symvers] Error 1
# 00:07:12 make: *** [modules] Error 2
from (for last_good == da20b58d5bbbb0d23ae9530992a37d0f0d1787a4)
# reset_artifacts:
-10
# build_abe binutils:
-9
# build_llvm:
-5
# build_abe qemu:
-2
# linux_n_obj:
29751
# linux build successful:
all
Artifacts of last_good build: https://ci.linaro.org/job/tcwg_kernel-llvm-bisect-llvm-release-arm-next-all…
Artifacts of first_bad build: https://ci.linaro.org/job/tcwg_kernel-llvm-bisect-llvm-release-arm-next-all…
Build top page/logs: https://ci.linaro.org/job/tcwg_kernel-llvm-bisect-llvm-release-arm-next-all…
Configuration details:
rr[linux_git]="https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git#ecf9343…"
Reproduce builds:
<cut>
mkdir investigate-linux-fad7cd3310db3099f95dd34312c77740fbc455e5
cd investigate-linux-fad7cd3310db3099f95dd34312c77740fbc455e5
git clone https://git.linaro.org/toolchain/jenkins-scripts
mkdir -p artifacts/manifests
curl -o artifacts/manifests/build-baseline.sh https://ci.linaro.org/job/tcwg_kernel-llvm-bisect-llvm-release-arm-next-all… --fail
curl -o artifacts/manifests/build-parameters.sh https://ci.linaro.org/job/tcwg_kernel-llvm-bisect-llvm-release-arm-next-all… --fail
curl -o artifacts/test.sh https://ci.linaro.org/job/tcwg_kernel-llvm-bisect-llvm-release-arm-next-all… --fail
chmod +x artifacts/test.sh
# Reproduce the baseline build (build all pre-requisites)
./jenkins-scripts/tcwg_kernel-build.sh @@ artifacts/manifests/build-baseline.sh
# Save baseline build state (which is then restored in artifacts/test.sh)
mkdir -p ./bisect
rsync -a --del --delete-excluded --exclude /bisect/ --exclude /artifacts/ --exclude /linux/ ./ ./bisect/baseline/
cd linux
# Reproduce first_bad build
git checkout --detach fad7cd3310db3099f95dd34312c77740fbc455e5
../artifacts/test.sh
# Reproduce last_good build
git checkout --detach da20b58d5bbbb0d23ae9530992a37d0f0d1787a4
../artifacts/test.sh
cd ..
</cut>
History of pending regressions and results: https://git.linaro.org/toolchain/ci/base-artifacts.git/log/?h=linaro-local/…
Artifacts: https://ci.linaro.org/job/tcwg_kernel-llvm-bisect-llvm-release-arm-next-all…
Build log: https://ci.linaro.org/job/tcwg_kernel-llvm-bisect-llvm-release-arm-next-all…
Full commit (up to 1000 lines):
<cut>
commit fad7cd3310db3099f95dd34312c77740fbc455e5
Author: Baokun Li <libaokun1(a)huawei.com>
Date: Wed Aug 4 10:12:12 2021 +0800
nbd: add the check to prevent overflow in __nbd_ioctl()
If user specify a large enough value of NBD blocks option, it may trigger
signed integer overflow which may lead to nbd->config->bytesize becomes a
large or small value, zero in particular.
UBSAN: Undefined behaviour in drivers/block/nbd.c:325:31
signed integer overflow:
1024 * 4611686155866341414 cannot be represented in type 'long long int'
[...]
Call trace:
[...]
handle_overflow+0x188/0x1dc lib/ubsan.c:192
__ubsan_handle_mul_overflow+0x34/0x44 lib/ubsan.c:213
nbd_size_set drivers/block/nbd.c:325 [inline]
__nbd_ioctl drivers/block/nbd.c:1342 [inline]
nbd_ioctl+0x998/0xa10 drivers/block/nbd.c:1395
__blkdev_driver_ioctl block/ioctl.c:311 [inline]
[...]
Although it is not a big deal, still silence the UBSAN by limit
the input value.
Reported-by: Hulk Robot <hulkci(a)huawei.com>
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Link: https://lore.kernel.org/r/20210804021212.990223-1-libaokun1@huawei.com
[axboe: dropped unlikely()]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
---
drivers/block/nbd.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index c38317979f74..f82264835794 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1384,6 +1384,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
unsigned int cmd, unsigned long arg)
{
struct nbd_config *config = nbd->config;
+ loff_t bytesize;
switch (cmd) {
case NBD_DISCONNECT:
@@ -1398,8 +1399,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
case NBD_SET_SIZE:
return nbd_set_size(nbd, arg, config->blksize);
case NBD_SET_SIZE_BLOCKS:
- return nbd_set_size(nbd, arg * config->blksize,
- config->blksize);
+ if (check_mul_overflow((loff_t)arg, config->blksize, &bytesize))
+ return -EINVAL;
+ return nbd_set_size(nbd, bytesize, config->blksize);
case NBD_SET_TIMEOUT:
nbd_set_cmd_timeout(nbd, arg);
return 0;
</cut>
Successfully identified regression in *llvm* in CI configuration tcwg_bmk_llvm_tx1/llvm-release-aarch64-spec2k6-O3. So far, this commit has regressed CI configurations:
- tcwg_bmk_llvm_tx1/llvm-release-aarch64-spec2k6-O3
Culprit:
<cut>
commit 6998f8ae2d14e096aff33968f226587b5c1a193a
Author: David Sherwood <david.sherwood(a)arm.com>
Date: Wed Mar 10 08:34:19 2021 +0000
[LoopVectorize] Simplify scalar cost calculation in getInstructionCost
This patch simplifies the calculation of certain costs in
getInstructionCost when isScalarAfterVectorization() returns a true value.
There are a few places where we multiply a cost by a number N, i.e.
unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
return N * TTI.getArithmeticInstrCost(...
After some investigation it seems that there are only these cases that occur
in practice:
1. VF is a scalar, in which case N = 1.
2. VF is a vector. We can only get here if: a) the instruction is a
GEP/bitcast/PHI with scalar uses, or b) this is an update to an induction
variable that remains scalar.
I have changed the code so that N is assumed to always be 1. For GEPs
the cost is always 0, since this is calculated later on as part of the
load/store cost. PHI nodes are costed separately and were never previously
multiplied by VF. For all other cases I have added an assert that none of
the users needs scalarising, which didn't fire in any unit tests.
Only one test required fixing and I believe the original cost for the scalar
add instruction to have been wrong, since only one copy remains after
vectorisation.
I have also added a new test for the case when a pointer PHI feeds directly
into a store that will be scalarised as we were previously never testing it.
Differential Revision: https://reviews.llvm.org/D99718
</cut>
Results regressed to (for first_bad == 6998f8ae2d14e096aff33968f226587b5c1a193a)
# reset_artifacts:
-10
# build_abe binutils:
-9
# build_abe stage1 -- --set gcc_override_configure=--disable-libsanitizer:
-8
# build_abe linux:
-7
# build_abe glibc:
-6
# build_abe stage2 -- --set gcc_override_configure=--disable-libsanitizer:
-5
# build_llvm true:
-3
# true:
0
# benchmark -- -O3 artifacts/build-6998f8ae2d14e096aff33968f226587b5c1a193a/results_id:
1
# 462.libquantum,libquantum_base.default regressed by 114
# 462.libquantum,[.] quantum_toffoli regressed by 123
# 462.libquantum,[.] quantum_cnot regressed by 115
from (for last_good == c835630c25a4f9925517949579f66a43b113fbc9)
# reset_artifacts:
-10
# build_abe binutils:
-9
# build_abe stage1 -- --set gcc_override_configure=--disable-libsanitizer:
-8
# build_abe linux:
-7
# build_abe glibc:
-6
# build_abe stage2 -- --set gcc_override_configure=--disable-libsanitizer:
-5
# build_llvm true:
-3
# true:
0
# benchmark -- -O3 artifacts/build-c835630c25a4f9925517949579f66a43b113fbc9/results_id:
1
Artifacts of last_good build: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release…
Results ID of last_good: tx1_64/tcwg_bmk_llvm_tx1/bisect-llvm-release-aarch64-spec2k6-O3/3744
Artifacts of first_bad build: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release…
Results ID of first_bad: tx1_64/tcwg_bmk_llvm_tx1/bisect-llvm-release-aarch64-spec2k6-O3/3755
Build top page/logs: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release…
Configuration details:
Reproduce builds:
<cut>
mkdir investigate-llvm-6998f8ae2d14e096aff33968f226587b5c1a193a
cd investigate-llvm-6998f8ae2d14e096aff33968f226587b5c1a193a
git clone https://git.linaro.org/toolchain/jenkins-scripts
mkdir -p artifacts/manifests
curl -o artifacts/manifests/build-baseline.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release… --fail
curl -o artifacts/manifests/build-parameters.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release… --fail
curl -o artifacts/test.sh https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release… --fail
chmod +x artifacts/test.sh
# Reproduce the baseline build (build all pre-requisites)
./jenkins-scripts/tcwg_bmk-build.sh @@ artifacts/manifests/build-baseline.sh
# Save baseline build state (which is then restored in artifacts/test.sh)
mkdir -p ./bisect
rsync -a --del --delete-excluded --exclude /bisect/ --exclude /artifacts/ --exclude /llvm/ ./ ./bisect/baseline/
cd llvm
# Reproduce first_bad build
git checkout --detach 6998f8ae2d14e096aff33968f226587b5c1a193a
../artifacts/test.sh
# Reproduce last_good build
git checkout --detach c835630c25a4f9925517949579f66a43b113fbc9
../artifacts/test.sh
cd ..
</cut>
History of pending regressions and results: https://git.linaro.org/toolchain/ci/base-artifacts.git/log/?h=linaro-local/…
Artifacts: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release…
Build log: https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-release…
Full commit (up to 1000 lines):
<cut>
commit 6998f8ae2d14e096aff33968f226587b5c1a193a
Author: David Sherwood <david.sherwood(a)arm.com>
Date: Wed Mar 10 08:34:19 2021 +0000
[LoopVectorize] Simplify scalar cost calculation in getInstructionCost
This patch simplifies the calculation of certain costs in
getInstructionCost when isScalarAfterVectorization() returns a true value.
There are a few places where we multiply a cost by a number N, i.e.
unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
return N * TTI.getArithmeticInstrCost(...
After some investigation it seems that there are only these cases that occur
in practice:
1. VF is a scalar, in which case N = 1.
2. VF is a vector. We can only get here if: a) the instruction is a
GEP/bitcast/PHI with scalar uses, or b) this is an update to an induction
variable that remains scalar.
I have changed the code so that N is assumed to always be 1. For GEPs
the cost is always 0, since this is calculated later on as part of the
load/store cost. PHI nodes are costed separately and were never previously
multiplied by VF. For all other cases I have added an assert that none of
the users needs scalarising, which didn't fire in any unit tests.
Only one test required fixing and I believe the original cost for the scalar
add instruction to have been wrong, since only one copy remains after
vectorisation.
I have also added a new test for the case when a pointer PHI feeds directly
into a store that will be scalarised as we were previously never testing it.
Differential Revision: https://reviews.llvm.org/D99718
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 73 +++++++++++++---------
.../AArch64/no_vector_instructions.ll | 2 +-
.../LoopVectorize/AArch64/predication_costs.ll | 35 +++++++++++
.../Transforms/LoopVectorize/scalarized-bitcast.ll | 40 ++++++++++++
4 files changed, 121 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2b413fc49505..f25af23c86c2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7383,10 +7383,39 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Type *RetTy = I->getType();
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
- VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
auto SE = PSE.getSE();
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ auto hasSingleCopyAfterVectorization = [this](Instruction *I,
+ ElementCount VF) -> bool {
+ if (VF.isScalar())
+ return true;
+
+ auto Scalarized = InstsToScalarize.find(VF);
+ assert(Scalarized != InstsToScalarize.end() &&
+ "VF not yet analyzed for scalarization profitability");
+ return !Scalarized->second.count(I) &&
+ llvm::all_of(I->users(), [&](User *U) {
+ auto *UI = cast<Instruction>(U);
+ return !Scalarized->second.count(UI);
+ });
+ };
+
+ if (isScalarAfterVectorization(I, VF)) {
+ // With the exception of GEPs and PHIs, after scalarization there should
+ // only be one copy of the instruction generated in the loop. This is
+ // because the VF is either 1, or any instructions that need scalarizing
+ // have already been dealt with by the the time we get here. As a result,
+ // it means we don't have to multiply the instruction cost by VF.
+ assert(I->getOpcode() == Instruction::GetElementPtr ||
+ I->getOpcode() == Instruction::PHI ||
+ (I->getOpcode() == Instruction::BitCast &&
+ I->getType()->isPointerTy()) ||
+ hasSingleCopyAfterVectorization(I, VF));
+ VectorTy = RetTy;
+ } else
+ VectorTy = ToVectorTy(RetTy, VF);
+
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
@@ -7514,21 +7543,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Op2VK = TargetTransformInfo::OK_UniformValue;
SmallVector<const Value *, 4> Operands(I->operand_values());
- unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
- return N * TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, CostKind,
- TargetTransformInfo::OK_AnyValue,
- Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
+ return TTI.getArithmeticInstrCost(
+ I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
+ Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
}
case Instruction::FNeg: {
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
- unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
- return N * TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, CostKind,
- TargetTransformInfo::OK_AnyValue,
- TargetTransformInfo::OK_AnyValue,
- TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
- I->getOperand(0), I);
+ return TTI.getArithmeticInstrCost(
+ I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None, I->getOperand(0), I);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
@@ -7583,6 +7607,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
VectorTy = ToVectorTy(getMemInstValueType(I), Width);
return getMemoryInstructionCost(I, VF);
}
+ case Instruction::BitCast:
+ if (I->getType()->isPointerTy())
+ return 0;
+ LLVM_FALLTHROUGH;
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -7593,8 +7621,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
+ case Instruction::FPTrunc: {
// Computes the CastContextHint from a Load/Store instruction.
auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -7672,14 +7699,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
}
}
- unsigned N;
- if (isScalarAfterVectorization(I, VF)) {
- assert(!VF.isScalable() && "VF is assumed to be non scalable");
- N = VF.getKnownMinValue();
- } else
- N = 1;
- return N *
- TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
+ return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}
case Instruction::Call: {
bool NeedToScalarize;
@@ -7694,11 +7714,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
case Instruction::ExtractValue:
return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
default:
- // The cost of executing VF copies of the scalar instruction. This opcode
- // is unknown. Assume that it is the same as 'mul'.
- return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
- Instruction::Mul, VectorTy, CostKind) +
- getScalarizationOverhead(I, VF);
+ // This opcode is unknown. Assume that it is the same as 'mul'.
+ return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
} // end of switch.
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
index 247ea35ff5d0..3061998518ad 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -6,7 +6,7 @@ target triple = "aarch64--linux-gnu"
; CHECK-LABEL: all_scalar
; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
;
define void @all_scalar(i64* %a, i64 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index b0ebb4edf2ad..858b28ddd321 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -86,6 +86,41 @@ for.end:
ret void
}
+; CHECK-LABEL: predicated_store_phi
+;
+; Same as predicate_store except we use a pointer PHI to maintain the address
+;
+; CHECK: Found new scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
+; CHECK: Found new scalar instruction: %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
+; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %addr, align 4
+; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
+; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %addr, align 4
+;
+define void @predicated_store_phi(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %addr = phi i32 * [ %a, %entry ], [ %addr.next, %for.inc ]
+ %tmp1 = load i32, i32* %addr, align 4
+ %tmp2 = add nsw i32 %tmp1, %x
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ store i32 %tmp2, i32* %addr, align 4
+ br label %for.inc
+
+for.inc:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
; CHECK-LABEL: predicated_udiv_scalarized_operand
;
; This test checks that we correctly compute the cost of the predicated udiv
diff --git a/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll b/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll
new file mode 100644
index 000000000000..0c97e6ac475e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scalarized-bitcast.ll
@@ -0,0 +1,40 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -S -o - < %s 2>&1 | FileCheck %s
+
+%struct.foo = type { i32, i64 }
+
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %0 = bitcast i64* %b to i32*
+
+; The bitcast below will be scalarized due to the predication in the loop. Bitcasts
+; between pointer types should be treated as free, despite the scalarization.
+define void @foo(%struct.foo* noalias nocapture %in, i32* noalias nocapture readnone %out, i64 %n) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %if.end
+ %i.012 = phi i64 [ %inc, %if.end ], [ 0, %entry ]
+ %b = getelementptr inbounds %struct.foo, %struct.foo* %in, i64 %i.012, i32 1
+ %0 = bitcast i64* %b to i32*
+ %a = getelementptr inbounds %struct.foo, %struct.foo* %in, i64 %i.012, i32 0
+ %1 = load i32, i32* %a, align 8
+ %tobool.not = icmp eq i32 %1, 0
+ br i1 %tobool.not, label %if.end, label %land.lhs.true
+
+land.lhs.true: ; preds = %for.body
+ %2 = load i32, i32* %0, align 4
+ %cmp2 = icmp sgt i32 %2, 0
+ br i1 %cmp2, label %if.then, label %if.end
+
+if.then: ; preds = %land.lhs.true
+ %sub = add nsw i32 %2, -1
+ store i32 %sub, i32* %0, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %land.lhs.true, %for.body
+ %inc = add nuw nsw i64 %i.012, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %if.end
+ ret void
+}
</cut>