From b2b5e4ff50e348674a99c5463126df22985a91b6 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Pekka=20J=C3=A4=C3=A4skel=C3=A4inen?= Date: Thu, 15 Aug 2024 17:20:46 +0300 Subject: [PATCH] [PATCH] WG-vectorizer: Re-enable LLVM vectorizers The LLVM vectorizers were accidentally disabled when transitioning to the new PM. This commit re-enables them and exposes some new inefficiencies (to be continued...). Yeah, we should also add a perf.regression test or at least a "smoke test" for the WG vectorization. (cherry picked from commit 19dc70bbe927a6f6210ccd45b6a4c2a49fe6da96) Gbp-Pq: Name 0001-WG-vectorizer-Re-enable-LLVM-vectorizers.patch --- lib/CL/pocl_llvm_api.h | 9 +++++++-- lib/CL/pocl_llvm_wg.cc | 45 +++++++++++++++++++++++++++++++----------- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/lib/CL/pocl_llvm_api.h b/lib/CL/pocl_llvm_api.h index 0542974..9b90c92 100644 --- a/lib/CL/pocl_llvm_api.h +++ b/lib/CL/pocl_llvm_api.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -101,8 +102,12 @@ POCL_EXPORT bool getModuleBoolMetadata (const llvm::Module &mod, * SizeL - optimize for size * Vectorize - whether to invoke the vectorizer (only used for legacy PM) */ -POCL_EXPORT void populateModulePM (void *Passes, void *Module, unsigned OptL, - unsigned SizeL, bool Vectorize = true); +POCL_EXPORT void populateModulePM (void *Passes, + void *Module, + unsigned OptL, + unsigned SizeL, + bool Vectorize = true, + llvm::TargetMachine *TM = nullptr); extern std::string CurrentWgMethod; diff --git a/lib/CL/pocl_llvm_wg.cc b/lib/CL/pocl_llvm_wg.cc index 5eff8ff..041c75a 100644 --- a/lib/CL/pocl_llvm_wg.cc +++ b/lib/CL/pocl_llvm_wg.cc @@ -200,7 +200,7 @@ llvm::Error PoCLModulePassManager::build(std::string PoclPipeline, // devices do not want to vectorize intra work-item at this // stage. Vectorize = ((CurrentWgMethod == "loopvec" || CurrentWgMethod == "cbs") && - (Dev->spmd == CL_FALSE)); + (!Dev->spmd)); PTO.SLPVectorization = Vectorize; PTO.LoopVectorization = Vectorize; OptimizeLevel = OLevel; @@ -273,7 +273,6 @@ llvm::Error PoCLModulePassManager::build(std::string PoclPipeline, #endif pocl::registerFunctionAnalyses(PB); - // Register all the basic analyses with the managers. PB.registerModuleAnalyses(MAM); PB.registerCGSCCAnalyses(CGAM); @@ -313,7 +312,7 @@ void PoCLModulePassManager::run(llvm::Module &Bitcode) { PM.run(Bitcode, MAM); #ifdef SEPARATE_OPTIMIZATION_FROM_POCL_PASSES populateModulePM(nullptr, (void *)&Bitcode, OptimizeLevel, SizeLevel, - Vectorize); + Vectorize, Machine.get()); #endif } @@ -532,7 +531,7 @@ static void addStage2PassesToPipeline(cl_device_id Dev, // NOTE: if you add a new PoCL pass here, // don't forget to register it in registerPassBuilderPasses - if (Dev->spmd == CL_FALSE) { + if (!Dev->spmd) { addPass(Passes, "simplifycfg"); addPass(Passes, "loop-simplify"); @@ -1528,7 +1527,7 @@ int pocl_llvm_codegen(cl_device_id Device, cl_program program, void *Modp, } void populateModulePM(void *Passes, void *Module, unsigned OptL, unsigned SizeL, - bool Vectorize) { + bool Vectorize, TargetMachine *TM) { #if LLVM_MAJOR < MIN_LLVM_NEW_PASSMANAGER PassManagerBuilder Builder; Builder.OptLevel = OptL; @@ -1555,18 +1554,42 @@ void populateModulePM(void *Passes, void *Module, unsigned OptL, unsigned SizeL, LegacyPasses->run(*Mod); } #else + + PipelineTuningOptions PTO; + + // Let the loopvec decide when to unroll. + PTO.LoopUnrolling = false; +#if LLVM_MAJOR > 16 + PTO.UnifiedLTO = false; +#endif + PTO.SLPVectorization = Vectorize; + PTO.LoopVectorization = Vectorize; + +#ifdef DEBUG_NEW_PASS_MANAGER + PrintPassOptions PrintPassOpts; + PassInstrumentationCallbacks PIC; + llvm::LLVMContext Context; // for SI + std::unique_ptr SI; + PrintPassOpts.Verbose = true; + PrintPassOpts.SkipAnalyses = false; + PrintPassOpts.Indent = true; + SI.reset(new StandardInstrumentations(Context, + true, // debug logging + false, // verify each + PrintPassOpts)); + SI->registerCallbacks(PIC, &MAM); + + PassBuilder PB(TM, PTO, std::nullopt, &PIC); +#else + PassBuilder PB(TM, PTO); +#endif + // Create the analysis managers. LoopAnalysisManager LAM; FunctionAnalysisManager FAM; CGSCCAnalysisManager CGAM; ModuleAnalysisManager MAM; - // Create the new pass manager builder. - // Take a look at the PassBuilder constructor parameters for more - // customization, e.g. specifying a TargetMachine or various debugging - // options. - PassBuilder PB; - // Register all the basic analyses with the managers. PB.registerModuleAnalyses(MAM); PB.registerCGSCCAnalyses(CGAM); -- 2.30.2