Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4
5"""
6Regular expression ancillary classes.
7
8Those help caching regular expressions and do matching for kernel-doc.
9"""
10
11import re
12
# Local cache for compiled regular expressions, keyed by (pattern, flags)
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.
    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or reuses it from the cache.

        The cache key includes the compile flags: the same pattern
        compiled with different flags (e.g. re.IGNORECASE) produces a
        different regex object, so caching by pattern alone would
        wrongly reuse a regex compiled with other flags.
        """
        key = (string, flags)

        self.regex = re_cache.get(key)
        if self.regex is None:
            self.regex = re.compile(string, flags=flags)
            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        :param string: the regular expression pattern to compile
        :param cache:  if True, store the compiled regex in the module
                       cache for reuse by later instances
        :param flags:  flags passed through to re.compile()
        """

        self.cache = cache

        # Result of the most recent match()/search(), used by group()
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The result is cached if either operand requested caching, and
        carries the union of both operands' compile flags.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match

        Requires a prior successful match() or search(); raises
        AttributeError if the last attempt did not match.
        """

        return self.last_match.group(num)
105
106
class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern:

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    # the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #    FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #    FOO(arg1, arg2, arg3)

    # Maps each supported open delimiter to its closing counterpart
    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    # Matches any single open or close delimiter
    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        Yields (start, offset, end) tuples, where start is where the
        regex match begins, offset is the position right after the open
        delimiter, and end is the position right after the matching
        close delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        # NOTE(review): the stack is intentionally shared across outer
        # matches — leftover entries from an unpaired match make later
        # matches be silently ignored, per the class contract above.
        stack = []

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # The regex is expected to end at an open delimiter; skip
            # matches that don't
            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            end = self.DELIMITER_PAIRS[d]
            stack.append(end)

            for match in self.RE_DELIM.finditer(line[offset:]):
                pos = match.start() + offset

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    end = self.DELIMITER_PAIRS[d]

                    stack.append(end)
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
        """

        for t in self._search(regex, line):

            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        """
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if r'\1' is used, it works just like re: it places there the
        matched paired data with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.
        """
        out = ""

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            out += line[cur_pos:start]

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # replaces \1 at the sub string, if \1 is used there
            new_sub = sub.replace(r'\1', value)

            out += new_sub

            # Drop end ';' if any. The bounds check avoids an IndexError
            # when the paired block ends exactly at the end of the string.
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Stop once 'count' replacements were made (count == 0 means
            # replace all). Note the comparison must be n >= count: the
            # inverted form would stop after the first replacement for
            # any count >= 1.
            if count and n >= count:
                break

        # Append the remaining string
        out += line[cur_pos:]

        return out