nixpkgs mirror (for testing) github.com/NixOS/nixpkgs
nix
at devShellTools-shell 345 lines 15 kB view raw
1diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py 2index b8cea84558..1bc24bd1dd 100644 3--- a/Tensile/TensileCreateLibrary.py 4+++ b/Tensile/TensileCreateLibrary.py 5@@ -41,6 +41,7 @@ 6 from .SolutionLibrary import MasterSolutionLibrary 7 from .SolutionStructs import Solution 8 from .CustomYamlLoader import load_logic_gfx_arch 9+from .Utilities.Profile import profile 10 11 import argparse 12 import collections 13@@ -1233,7 +1234,7 @@ def validateLibrary(masterLibraries: MasterSolutionLibrary, 14 ################################################################################ 15 # Tensile Create Library 16 ################################################################################ 17-@timing 18+@profile 19 def TensileCreateLibrary(): 20 print1("") 21 print1(HR) 22@@ -1558,7 +1559,6 @@ def param(key, value): 23 24 print1("# Check if generated files exists.") 25 26- @timing 27 def checkFileExistence(files): 28 for filePath in files: 29 if not os.path.exists(filePath): 30diff --git a/Tensile/Utilities/Profile.py b/Tensile/Utilities/Profile.py 31new file mode 100644 32index 0000000000..cc3c7eb44c 33--- /dev/null 34+++ b/Tensile/Utilities/Profile.py 35@@ -0,0 +1,77 @@ 36+################################################################################ 37+# 38+# Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved. 39+# 40+# Permission is hereby granted, free of charge, to any person obtaining a copy 41+# of this software and associated documentation files (the "Software"), to deal 42+# in the Software without restriction, including without limitation the rights 43+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 44+# copies of the Software, and to permit persons to whom the Software is 45+# furnished to do so, subject to the following conditions: 46+# 47+# The above copyright notice and this permission notice shall be included in 48+# all copies or substantial portions of the Software. 49+# 50+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 51+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 52+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 53+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 54+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 55+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 56+# SOFTWARE. 57+# 58+################################################################################ 59+ 60+import cProfile 61+import pstats 62+import os 63+ 64+from pathlib import Path 65+from datetime import datetime, timezone 66+from typing import Callable, Tuple 67+ 68+PROFILE_ENV_VAR = "TENSILE_PROFILE" 69+ 70+def profile(func: Callable) -> Callable: 71+ """Profiling decorator. 72+ 73+ Add ``@profile`` to mark a function for profiling; set the environment variable 74+ TENSILE_PROFILE=ON to enable profiling decorated functions. 75+ """ 76+ if not envVariableIsSet(PROFILE_ENV_VAR): 77+ return func 78+ def wrapper(*args, **kwargs): 79+ path, filename = initProfileArtifacts(func.__name__) 80+ 81+ prof = cProfile.Profile() 82+ output = prof.runcall(func, *args, **kwargs) 83+ result = pstats.Stats(prof) 84+ result.sort_stats(pstats.SortKey.TIME) 85+ result.dump_stats(path/filename) 86+ 87+ return output 88+ return wrapper 89+ 90+def envVariableIsSet(varName: str) -> bool: 91+ """Checks if the provided environment variable is set to "ON", "TRUE", or "1" 92+ Args: 93+ varName: Environment variable name. 94+ Returns: 95+ True if the environment variable is set, otherwise False. 96+ """ 97+ value = os.environ.get(varName, "").upper() 98+ return True if value in ["ON", "TRUE", "1"] else False 99+ 100+def initProfileArtifacts(funcName: str) -> Tuple[Path, str]: 101+ """Initializes filenames and paths for profiling artifacts based on the current datetime 102+ Args: 103+ funcName: The name of the function being profiled, nominally passed via func.__name__ 104+ Returns: 105+ A tuple (path, filename) where the path is the artifact directory and filename is 106+ a .prof file with the profiling results. 107+ """ 108+ dt = datetime.now(timezone.utc) 109+ filename = f"{funcName}-{dt.strftime('%Y-%m-%dT%H-%M-%SZ')}.prof" 110+ path = Path().cwd()/f"profiling-results-{dt.strftime('%Y-%m-%d')}" 111+ path.mkdir(exist_ok=True) 112+ return path, filename 113 114diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py 115index e62b0072df..2c843ba936 100644 116--- a/Tensile/TensileCreateLibrary.py 117+++ b/Tensile/TensileCreateLibrary.py 118@@ -56,7 +56,7 @@ 119 import sys 120 from timeit import default_timer as timer 121 from pathlib import Path 122-from typing import Sequence, List 123+from typing import Sequence, List, Union 124 125 def timing(func): 126 def wrapper(*args, **kwargs): 127@@ -90,87 +90,142 @@ def processKernelSource(kernel, kernelWriterAssembly, ti): 128 129 return (err, src, header, kernelName, filename) 130 131-def getAssemblyCodeObjectFiles(kernels, kernelWriterAssembly, outputPath): 132- destDir = ensurePath(os.path.join(outputPath, 'library')) 133- asmDir = kernelWriterAssembly.getAssemblyDirectory() 134- archs = collections.defaultdict(list) 135+def linkIntoCodeObject( 136+ objFiles: List[str], coPathDest: Union[Path, str], kernelWriterAssembly: KernelWriterAssembly 137+): 138+ """Links object files into a code object file. 139 140- for k in filter(lambda k: k['KernelLanguage'] == 'Assembly', kernels): 141- archs[tuple(k['ISA'])].append(k) 142+ Args: 143+ objectFiles: A list of object files to be linked. 144+ coPathDest: The destination path for the code object file. 145+ kernelWriterAssembly: An instance of KernelWriterAssembly to get link arguments. 146 147- coFiles = [] 148+ Raises: 149+ RuntimeError: If linker invocation fails. 150+ """ 151+ if os.name == "nt": 152+ # On Windows, the objectFiles list command line (including spaces) 153+ # exceeds the limit of 8191 characters, so using response file 154+ 155+ responseFile = os.path.join('/tmp', 'clangArgs.txt') 156+ with open(responseFile, 'wt') as file: 157+ file.write(" ".join(objFiles)) 158+ file.flush() 159+ 160+ args = [globalParameters['AssemblerPath'], '-target', 'amdgcn-amd-amdhsa', '-o', coFileRaw, '@clangArgs.txt'] 161+ subprocess.check_call(args, cwd=asmDir) 162+ else: 163+ numObjFiles = len(objFiles) 164+ maxObjFiles = 10000 165+ 166+ if numObjFiles > maxObjFiles: 167+ batchedObjFiles = [objFiles[i:i+maxObjFiles] for i in range(0, numObjFiles, maxObjFiles)] 168+ batchSize = int(math.ceil(numObjFiles / maxObjFiles)) 169+ 170+ newObjFiles = [str(coPathDest) + "." + str(i) for i in range(0, batchSize)] 171+ newObjFilesOutput = [] 172+ 173+ for batch, filename in zip(batchedObjFiles, newObjFiles): 174+ if len(batch) > 1: 175+ args = [globalParameters["ROCmLdPath"], "-r"] + batch + [ "-o", filename] 176+ print2(f"Linking object files into fewer object files: {' '.join(args)}") 177+ subprocess.check_call(args) 178+ newObjFilesOutput.append(filename) 179+ else: 180+ newObjFilesOutput.append(batchedObjFiles[0]) 181+ 182+ args = kernelWriterAssembly.getLinkCodeObjectArgs(newObjFilesOutput, str(coPathDest)) 183+ print2(f"Linking object files into code object: {' '.join(args)}") 184+ subprocess.check_call(args) 185+ else: 186+ args = kernelWriterAssembly.getLinkCodeObjectArgs(objFiles, str(coPathDest)) 187+ print2(f"Linking object files into code object: {' '.join(args)}") 188+ subprocess.check_call(args) 189+ 190+ 191+def compressCodeObject( 192+ coPathSrc: Union[Path, str], coPathDest: Union[Path, str], gfx: str, bundler: str 193+): 194+ """Compresses a code object file using the provided bundler. 195+ 196+ Args: 197+ coPathSrc: The source path of the code object file to be compressed. 198+ coPathDest: The destination path for the compressed code object file. 199+ gfx: The target GPU architecture. 200+ bundler: The path to the Clang Offload Bundler executable. 201+ 202+ Raises: 203+ RuntimeError: If compressing the code object file fails. 204+ """ 205+ args = [ 206+ bundler, 207+ "--compress", 208+ "--type=o", 209+ "--bundle-align=4096", 210+ f"--targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--{gfx}", 211+ "--input=/dev/null", 212+ f"--input={str(coPathSrc)}", 213+ f"--output={str(coPathDest)}", 214+ ] 215+ 216+ print2(f"Bundling/compressing code objects: {' '.join(args)}") 217+ try: 218+ out = subprocess.check_output(args, stderr=subprocess.STDOUT) 219+ print2(f"Output: {out}") 220+ except subprocess.CalledProcessError as err: 221+ raise RuntimeError( 222+ f"Error compressing code object via bundling: {err.output}\nFailed command: {' '.join(args)}" 223+ ) 224+ 225+def buildAssemblyCodeObjectFiles(kernels, kernelWriterAssembly, outputPath): 226+ 227+ isAsm = lambda k: k["KernelLanguage"] == "Assembly" 228+ 229+ extObj = ".o" 230+ extCo = ".co" 231+ extCoRaw = ".co.raw" 232 233- for arch, archKernels in archs.items(): 234+ destDir = Path(ensurePath(os.path.join(outputPath, 'library'))) 235+ asmDir = Path(kernelWriterAssembly.getAssemblyDirectory()) 236+ 237+ archKernelMap = collections.defaultdict(list) 238+ for k in filter(isAsm, kernels): 239+ archKernelMap[tuple(k['ISA'])].append(k) 240+ 241+ coFiles = [] 242+ for arch, archKernels in archKernelMap.items(): 243 if len(archKernels) == 0: 244 continue 245 246- archName = getGfxName(arch) 247+ gfx = getGfxName(arch) 248 249 if globalParameters["MergeFiles"] or globalParameters["NumMergedFiles"] > 1 or globalParameters["LazyLibraryLoading"]: 250- objectFiles = [kernelWriterAssembly.getKernelFileBase(k) + '.o' for k in archKernels if 'codeObjectFile' not in k] 251+ objectFiles = [str(asmDir / (kernelWriterAssembly.getKernelFileBase(k) + extObj)) for k in archKernels if 'codeObjectFile' not in k] 252 253- #Group kernels from placeholder libraries 254 coFileMap = collections.defaultdict(list) 255+ 256 if len(objectFiles): 257- coFileMap[os.path.join(destDir, "TensileLibrary_"+archName+".co")] = objectFiles 258+ coFileMap[asmDir / ("TensileLibrary_"+ gfx + extCoRaw)] = objectFiles 259 260 for kernel in archKernels: 261 coName = kernel.get("codeObjectFile", None) 262 if coName: 263- coFileMap[os.path.join(destDir, coName+".co")] += [kernelWriterAssembly.getKernelFileBase(kernel) + '.o'] 264+ coFileMap[asmDir / (coName + extCoRaw)].append(str(asmDir / (kernelWriterAssembly.getKernelFileBase(kernel) + extObj))) 265 266- for coFile, objectFiles in coFileMap.items(): 267- if os.name == "nt": 268- # On Windows, the objectFiles list command line (including spaces) 269- # exceeds the limit of 8191 characters, so using response file 270+ for coFileRaw, objFiles in coFileMap.items(): 271 272- responseArgs = objectFiles 273- responseFile = os.path.join(asmDir, 'clangArgs.txt') 274- with open(responseFile, 'wt') as file: 275- file.write( " ".join(responseArgs) ) 276- file.flush() 277- 278- args = [globalParameters['AssemblerPath'], '-target', 'amdgcn-amd-amdhsa', '-o', coFile, '@clangArgs.txt'] 279- subprocess.check_call(args, cwd=asmDir) 280- else: 281- numOfObjectFiles = len(objectFiles) 282- splitFiles = 10000 283- if numOfObjectFiles > splitFiles: 284- slicedObjectFilesList = [objectFiles[x:x+splitFiles] for x in range(0, numOfObjectFiles, splitFiles)] 285- objectFileBasename = os.path.split(coFile)[-1].split('.')[0] 286- numOfOneSliceOfObjectFiles = int(math.ceil(numOfObjectFiles / splitFiles)) 287- newObjectFiles = [ objectFileBasename + "_" + str(i) + ".o" for i in range(0, numOfOneSliceOfObjectFiles)] 288- newObjectFilesOutput = [] 289- for slicedObjectFiles, objectFile in zip(slicedObjectFilesList, newObjectFiles): 290- if len(slicedObjectFiles) > 1: 291- args = [globalParameters["ROCmLdPath"], "-r"] + slicedObjectFiles + [ "-o", objectFile ] 292- if globalParameters["PrintCodeCommands"]: 293- print(asmDir) 294- print(' '.join(args)) 295- subprocess.check_call(args, cwd=asmDir) 296- newObjectFilesOutput.append(objectFile) 297- else: 298- newObjectFilesOutput.append(slicedObjectFiles[0]) 299- args = kernelWriterAssembly.getLinkCodeObjectArgs(newObjectFilesOutput, coFile) 300- if globalParameters["PrintCodeCommands"]: 301- print(asmDir) 302- print(' '.join(args)) 303- subprocess.check_call(args, cwd=asmDir) 304- else: 305- args = kernelWriterAssembly.getLinkCodeObjectArgs(objectFiles, coFile) 306- if globalParameters["PrintCodeCommands"]: 307- print(asmDir) 308- print(' '.join(args)) 309- subprocess.check_call(args, cwd=asmDir) 310+ linkIntoCodeObject(objFiles, coFileRaw, kernelWriterAssembly) 311+ coFile = destDir / coFileRaw.name.replace(extCoRaw, extCo) 312+ compressCodeObject(coFileRaw, coFile, gfx, globalParameters["ClangOffloadBundlerPath"]) 313 314 coFiles.append(coFile) 315 else: 316 # no mergefiles 317 def newCoFileName(kName): 318 if globalParameters["PackageLibrary"]: 319- return os.path.join(destDir, archName, kName + '.co') 320+ return os.path.join(destDir, gfx, kName + '.co') 321 else: 322- return os.path.join(destDir, kName + '_' + archName + '.co') 323+ return os.path.join(destDir, kName + '_' + gfx + '.co') 324 325 def orgCoFileName(kName): 326 return os.path.join(asmDir, kName + '.co') 327@@ -179,6 +234,8 @@ def orgCoFileName(kName): 328 map(lambda k: kernelWriterAssembly.getKernelFileBase(k), archKernels)), "Copying code objects"): 329 shutil.copyfile(src, dst) 330 coFiles.append(dst) 331+ printWarning("Code object files are not compressed in `--no-merge-files` build mode.") 332+ 333 return coFiles 334 335 def which(p): 336@@ -645,7 +702,7 @@ def success(kernel): 337 338 if not globalParameters["GenerateSourcesAndExit"]: 339 codeObjectFiles += buildSourceCodeObjectFiles(CxxCompiler, kernelFiles, outputPath) 340- codeObjectFiles += getAssemblyCodeObjectFiles(kernelsToBuild, kernelWriterAssembly, outputPath) 341+ codeObjectFiles += buildAssemblyCodeObjectFiles(kernelsToBuild, kernelWriterAssembly, outputPath) 342 343 Common.popWorkingPath() # build_tmp 344 Common.popWorkingPath() # workingDir 345