nixpkgs mirror (for testing)
github.com/NixOS/nixpkgs
nix
1diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py
2index b8cea84558..1bc24bd1dd 100644
3--- a/Tensile/TensileCreateLibrary.py
4+++ b/Tensile/TensileCreateLibrary.py
5@@ -41,6 +41,7 @@
6 from .SolutionLibrary import MasterSolutionLibrary
7 from .SolutionStructs import Solution
8 from .CustomYamlLoader import load_logic_gfx_arch
9+from .Utilities.Profile import profile
10
11 import argparse
12 import collections
13@@ -1233,7 +1234,7 @@ def validateLibrary(masterLibraries: MasterSolutionLibrary,
14 ################################################################################
15 # Tensile Create Library
16 ################################################################################
17-@timing
18+@profile
19 def TensileCreateLibrary():
20 print1("")
21 print1(HR)
22@@ -1558,7 +1559,6 @@ def param(key, value):
23
24 print1("# Check if generated files exists.")
25
26- @timing
27 def checkFileExistence(files):
28 for filePath in files:
29 if not os.path.exists(filePath):
30diff --git a/Tensile/Utilities/Profile.py b/Tensile/Utilities/Profile.py
31new file mode 100644
32index 0000000000..cc3c7eb44c
33--- /dev/null
34+++ b/Tensile/Utilities/Profile.py
35@@ -0,0 +1,77 @@
36+################################################################################
37+#
38+# Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved.
39+#
40+# Permission is hereby granted, free of charge, to any person obtaining a copy
41+# of this software and associated documentation files (the "Software"), to deal
42+# in the Software without restriction, including without limitation the rights
43+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
44+# copies of the Software, and to permit persons to whom the Software is
45+# furnished to do so, subject to the following conditions:
46+#
47+# The above copyright notice and this permission notice shall be included in
48+# all copies or substantial portions of the Software.
49+#
50+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
51+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
52+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
53+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
54+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
55+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
56+# SOFTWARE.
57+#
58+################################################################################
59+
60+import cProfile
61+import pstats
62+import os
63+
64+from pathlib import Path
65+from datetime import datetime, timezone
66+from typing import Callable, Tuple
67+
68+PROFILE_ENV_VAR = "TENSILE_PROFILE"
69+
def profile(func: Callable) -> Callable:
    """Profiling decorator.

    Add ``@profile`` to mark a function for profiling; set the environment variable
    TENSILE_PROFILE=ON to enable profiling decorated functions.

    The environment variable is inspected once, when the decorator is applied. If
    profiling is not enabled at that point the original function is returned
    untouched, so decorated functions carry no wrapper overhead.

    Args:
        func: The function to be profiled.

    Returns:
        ``func`` itself when profiling is disabled; otherwise a wrapper that runs
        ``func`` under cProfile, dumps the collected statistics to the ``.prof``
        file chosen by ``initProfileArtifacts`` and returns ``func``'s result.
    """
    # Function-scope import keeps this block self-contained.
    import functools

    if not envVariableIsSet(PROFILE_ENV_VAR):
        return func

    # Preserve __name__/__doc__/__wrapped__ so the profiled function still
    # identifies as the original in logs, tracebacks and introspection.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        path, filename = initProfileArtifacts(func.__name__)

        prof = cProfile.Profile()
        output = prof.runcall(func, *args, **kwargs)

        result = pstats.Stats(prof)
        result.sort_stats(pstats.SortKey.TIME)
        result.dump_stats(path / filename)

        return output

    return wrapper
89+
def envVariableIsSet(varName: str) -> bool:
    """Checks if the provided environment variable is set to "ON", "TRUE", or "1".

    The comparison is case-insensitive; an unset variable is treated as an empty
    string and therefore reported as not set.

    Args:
        varName: Environment variable name.

    Returns:
        True if the environment variable is set to one of the accepted values,
        otherwise False.
    """
    value = os.environ.get(varName, "").upper()
    # The membership test already yields a bool; no conditional expression needed.
    return value in ("ON", "TRUE", "1")
99+
def initProfileArtifacts(funcName: str) -> Tuple[Path, str]:
    """Initializes filenames and paths for profiling artifacts based on the current datetime.

    The artifact directory is ``profiling-results-<YYYY-MM-DD>`` under the current
    working directory and is created if it does not exist yet. Both the directory
    and the filename are derived from the same UTC timestamp.

    Args:
        funcName: The name of the function being profiled, nominally passed via func.__name__

    Returns:
        A tuple (path, filename) where the path is the artifact directory and filename is
        a .prof file with the profiling results.
    """
    dt = datetime.now(timezone.utc)
    filename = f"{funcName}-{dt.strftime('%Y-%m-%dT%H-%M-%SZ')}.prof"
    # Path.cwd() is a classmethod; call it on the class rather than on a
    # throwaway instance.
    path = Path.cwd() / f"profiling-results-{dt.strftime('%Y-%m-%d')}"
    path.mkdir(exist_ok=True)
    return path, filename
113
114diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py
115index e62b0072df..2c843ba936 100644
116--- a/Tensile/TensileCreateLibrary.py
117+++ b/Tensile/TensileCreateLibrary.py
118@@ -56,7 +56,7 @@
119 import sys
120 from timeit import default_timer as timer
121 from pathlib import Path
122-from typing import Sequence, List
123+from typing import Sequence, List, Union
124
125 def timing(func):
126 def wrapper(*args, **kwargs):
127@@ -90,87 +90,142 @@ def processKernelSource(kernel, kernelWriterAssembly, ti):
128
129 return (err, src, header, kernelName, filename)
130
131-def getAssemblyCodeObjectFiles(kernels, kernelWriterAssembly, outputPath):
132- destDir = ensurePath(os.path.join(outputPath, 'library'))
133- asmDir = kernelWriterAssembly.getAssemblyDirectory()
134- archs = collections.defaultdict(list)
def linkIntoCodeObject(
    objFiles: List[str], coPathDest: Union[Path, str], kernelWriterAssembly: KernelWriterAssembly
):
    """Links object files into a code object file.

    On Windows the object file list is passed to the assembler through a response
    file. Elsewhere, lists longer than 10000 entries are first relocatably linked
    (``-r``) in batches into intermediate objects named ``<coPathDest>.<i>``, and
    those intermediates are then linked into the final code object.

    Args:
        objFiles: A list of object files to be linked.
        coPathDest: The destination path for the code object file.
        kernelWriterAssembly: An instance of KernelWriterAssembly to get link arguments.

    Raises:
        subprocess.CalledProcessError: If a linker invocation exits with a non-zero status.
    """
    coPath = str(coPathDest)

    if os.name == "nt":
        # On Windows, the objectFiles list command line (including spaces)
        # exceeds the limit of 8191 characters, so using response file.
        # The response file lives next to the output and is referenced by its
        # full path so the invocation does not depend on the working directory.
        responseFile = Path(f"{coPath}.clangArgs.txt")
        with open(responseFile, 'wt') as file:
            file.write(" ".join(objFiles))

        args = [globalParameters['AssemblerPath'], '-target', 'amdgcn-amd-amdhsa', '-o', coPath, f'@{responseFile}']
        subprocess.check_call(args)
        return

    maxObjFiles = 10000
    linkInputs = objFiles

    if len(objFiles) > maxObjFiles:
        linkInputs = []
        for i, start in enumerate(range(0, len(objFiles), maxObjFiles)):
            batch = objFiles[start:start + maxObjFiles]
            if len(batch) > 1:
                partialObj = f"{coPath}.{i}"
                args = [globalParameters["ROCmLdPath"], "-r"] + batch + [ "-o", partialObj]
                print2(f"Linking object files into fewer object files: {' '.join(args)}")
                subprocess.check_call(args)
                linkInputs.append(partialObj)
            else:
                # A trailing batch with a single file needs no pre-link; pass the
                # file itself (not the enclosing batch list) to the final link.
                linkInputs.append(batch[0])

    args = kernelWriterAssembly.getLinkCodeObjectArgs(linkInputs, coPath)
    print2(f"Linking object files into code object: {' '.join(args)}")
    subprocess.check_call(args)
189+
190+
def compressCodeObject(
    coPathSrc: Union[Path, str], coPathDest: Union[Path, str], gfx: str, bundler: str
):
    """Compresses a code object file using the provided bundler.

    The bundler is invoked with ``--compress`` on a two-target bundle: a host
    entry fed from the platform's null device and the device entry fed from
    ``coPathSrc``.

    Args:
        coPathSrc: The source path of the code object file to be compressed.
        coPathDest: The destination path for the compressed code object file.
        gfx: The target GPU architecture.
        bundler: The path to the Clang Offload Bundler executable.

    Raises:
        RuntimeError: If compressing the code object file fails.
    """
    args = [
        bundler,
        "--compress",
        "--type=o",
        "--bundle-align=4096",
        f"--targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--{gfx}",
        # os.devnull is "/dev/null" on POSIX, so the command is unchanged there.
        f"--input={os.devnull}",
        f"--input={str(coPathSrc)}",
        f"--output={str(coPathDest)}",
    ]

    print2(f"Bundling/compressing code objects: {' '.join(args)}")
    try:
        out = subprocess.check_output(args, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as err:
        # Decode so the message shows readable text rather than a bytes repr,
        # and chain the original error so the exit status stays inspectable.
        detail = (err.output or b"").decode(errors="replace")
        raise RuntimeError(
            f"Error compressing code object via bundling: {detail}\nFailed command: {' '.join(args)}"
        ) from err
    print2(f"Output: {out.decode(errors='replace')}")
224+
225+def buildAssemblyCodeObjectFiles(kernels, kernelWriterAssembly, outputPath):
226+
227+ isAsm = lambda k: k["KernelLanguage"] == "Assembly"
228+
229+ extObj = ".o"
230+ extCo = ".co"
231+ extCoRaw = ".co.raw"
232
233- for arch, archKernels in archs.items():
234+ destDir = Path(ensurePath(os.path.join(outputPath, 'library')))
235+ asmDir = Path(kernelWriterAssembly.getAssemblyDirectory())
236+
237+ archKernelMap = collections.defaultdict(list)
238+ for k in filter(isAsm, kernels):
239+ archKernelMap[tuple(k['ISA'])].append(k)
240+
241+ coFiles = []
242+ for arch, archKernels in archKernelMap.items():
243 if len(archKernels) == 0:
244 continue
245
246- archName = getGfxName(arch)
247+ gfx = getGfxName(arch)
248
249 if globalParameters["MergeFiles"] or globalParameters["NumMergedFiles"] > 1 or globalParameters["LazyLibraryLoading"]:
250- objectFiles = [kernelWriterAssembly.getKernelFileBase(k) + '.o' for k in archKernels if 'codeObjectFile' not in k]
251+ objectFiles = [str(asmDir / (kernelWriterAssembly.getKernelFileBase(k) + extObj)) for k in archKernels if 'codeObjectFile' not in k]
252
253- #Group kernels from placeholder libraries
254 coFileMap = collections.defaultdict(list)
255+
256 if len(objectFiles):
257- coFileMap[os.path.join(destDir, "TensileLibrary_"+archName+".co")] = objectFiles
258+ coFileMap[asmDir / ("TensileLibrary_"+ gfx + extCoRaw)] = objectFiles
259
260 for kernel in archKernels:
261 coName = kernel.get("codeObjectFile", None)
262 if coName:
263- coFileMap[os.path.join(destDir, coName+".co")] += [kernelWriterAssembly.getKernelFileBase(kernel) + '.o']
264+ coFileMap[asmDir / (coName + extCoRaw)].append(str(asmDir / (kernelWriterAssembly.getKernelFileBase(kernel) + extObj)))
265
266- for coFile, objectFiles in coFileMap.items():
267- if os.name == "nt":
268- # On Windows, the objectFiles list command line (including spaces)
269- # exceeds the limit of 8191 characters, so using response file
270+ for coFileRaw, objFiles in coFileMap.items():
271
272- responseArgs = objectFiles
273- responseFile = os.path.join(asmDir, 'clangArgs.txt')
274- with open(responseFile, 'wt') as file:
275- file.write( " ".join(responseArgs) )
276- file.flush()
277-
278- args = [globalParameters['AssemblerPath'], '-target', 'amdgcn-amd-amdhsa', '-o', coFile, '@clangArgs.txt']
279- subprocess.check_call(args, cwd=asmDir)
280- else:
281- numOfObjectFiles = len(objectFiles)
282- splitFiles = 10000
283- if numOfObjectFiles > splitFiles:
284- slicedObjectFilesList = [objectFiles[x:x+splitFiles] for x in range(0, numOfObjectFiles, splitFiles)]
285- objectFileBasename = os.path.split(coFile)[-1].split('.')[0]
286- numOfOneSliceOfObjectFiles = int(math.ceil(numOfObjectFiles / splitFiles))
287- newObjectFiles = [ objectFileBasename + "_" + str(i) + ".o" for i in range(0, numOfOneSliceOfObjectFiles)]
288- newObjectFilesOutput = []
289- for slicedObjectFiles, objectFile in zip(slicedObjectFilesList, newObjectFiles):
290- if len(slicedObjectFiles) > 1:
291- args = [globalParameters["ROCmLdPath"], "-r"] + slicedObjectFiles + [ "-o", objectFile ]
292- if globalParameters["PrintCodeCommands"]:
293- print(asmDir)
294- print(' '.join(args))
295- subprocess.check_call(args, cwd=asmDir)
296- newObjectFilesOutput.append(objectFile)
297- else:
298- newObjectFilesOutput.append(slicedObjectFiles[0])
299- args = kernelWriterAssembly.getLinkCodeObjectArgs(newObjectFilesOutput, coFile)
300- if globalParameters["PrintCodeCommands"]:
301- print(asmDir)
302- print(' '.join(args))
303- subprocess.check_call(args, cwd=asmDir)
304- else:
305- args = kernelWriterAssembly.getLinkCodeObjectArgs(objectFiles, coFile)
306- if globalParameters["PrintCodeCommands"]:
307- print(asmDir)
308- print(' '.join(args))
309- subprocess.check_call(args, cwd=asmDir)
310+ linkIntoCodeObject(objFiles, coFileRaw, kernelWriterAssembly)
311+ coFile = destDir / coFileRaw.name.replace(extCoRaw, extCo)
312+ compressCodeObject(coFileRaw, coFile, gfx, globalParameters["ClangOffloadBundlerPath"])
313
314 coFiles.append(coFile)
315 else:
316 # no mergefiles
317 def newCoFileName(kName):
318 if globalParameters["PackageLibrary"]:
319- return os.path.join(destDir, archName, kName + '.co')
320+ return os.path.join(destDir, gfx, kName + '.co')
321 else:
322- return os.path.join(destDir, kName + '_' + archName + '.co')
323+ return os.path.join(destDir, kName + '_' + gfx + '.co')
324
325 def orgCoFileName(kName):
326 return os.path.join(asmDir, kName + '.co')
327@@ -179,6 +234,8 @@ def orgCoFileName(kName):
328 map(lambda k: kernelWriterAssembly.getKernelFileBase(k), archKernels)), "Copying code objects"):
329 shutil.copyfile(src, dst)
330 coFiles.append(dst)
331+ printWarning("Code object files are not compressed in `--no-merge-files` build mode.")
332+
333 return coFiles
334
335 def which(p):
336@@ -645,7 +702,7 @@ def success(kernel):
337
338 if not globalParameters["GenerateSourcesAndExit"]:
339 codeObjectFiles += buildSourceCodeObjectFiles(CxxCompiler, kernelFiles, outputPath)
340- codeObjectFiles += getAssemblyCodeObjectFiles(kernelsToBuild, kernelWriterAssembly, outputPath)
341+ codeObjectFiles += buildAssemblyCodeObjectFiles(kernelsToBuild, kernelWriterAssembly, outputPath)
342
343 Common.popWorkingPath() # build_tmp
344 Common.popWorkingPath() # workingDir
345