this repo has no description

Add more robust support for line number, column number and byte number tracking to the lexer (#227)

The first batch of commits in this pull request involve minor refactors
that were found to be potentially worthwhile in the course of
development.

The latter half of this PR adds code in service of eventually closing
#86. The main additions are the classes `SourceLocation` and
`SourceExtent`. `class SourceLocation` is a dataclass that stores a line
number, column number, and byte index into the scrapscript program
source.

`class SourceExtent` has `start` and `end` members of type
`SourceLocation` to represent a swathe of contiguous source code from
the user's scrapscript program. The line and column numbers are
1-indexed as before, whereas the byte numbers are 0-indexed. All end
indices are inclusive of the final characters of their corresponding
tokens, i.e. they do *not* demarcate the positions of "one past the end"
of the tokens.

The classes `Token` and `Lexer` are modified to work in terms of
`SourceExtent` so that the lexing phase can maintain richer source
information. This new code structure will hopefully be more robust and
allow for easier refactoring in the future if needed.

I expect to do a follow-up PR that can build atop this one to maintain
richer source information tracking for the parsing stage.

authored by

Abel Sen and committed by
GitHub
3f8cd030 7080f5a0

+356 -154
+19 -95
poetry.lock
··· 1 - # This file is automatically @generated by Poetry and should not be changed by hand. 1 + # This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand. 2 2 3 3 [[package]] 4 4 name = "astroid" 5 5 version = "3.2.2" 6 6 description = "An abstract syntax tree for Python with inference support." 7 - category = "dev" 8 7 optional = false 9 8 python-versions = ">=3.8.0" 9 + groups = ["dev"] 10 10 files = [ 11 11 {file = "astroid-3.2.2-py3-none-any.whl", hash = "sha256:e8a0083b4bb28fcffb6207a3bfc9e5d0a68be951dd7e336d5dcf639c682388c0"}, 12 12 {file = "astroid-3.2.2.tar.gz", hash = "sha256:8ead48e31b92b2e217b6c9733a21afafe479d52d6e164dd25fb1a770c7c3cf94"}, ··· 19 19 name = "colorama" 20 20 version = "0.4.6" 21 21 description = "Cross-platform colored terminal text." 22 - category = "dev" 23 22 optional = false 24 23 python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 24 + groups = ["dev"] 25 + markers = "sys_platform == \"win32\"" 25 26 files = [ 26 27 {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, 27 28 {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ··· 31 32 name = "dill" 32 33 version = "0.3.8" 33 34 description = "serialize all of Python" 34 - category = "dev" 35 35 optional = false 36 36 python-versions = ">=3.8" 37 + groups = ["dev"] 37 38 files = [ 38 39 {file = "dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7"}, 39 40 {file = "dill-0.3.8.tar.gz", hash = "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca"}, ··· 44 45 profile = ["gprof2dot (>=2022.7.29)"] 45 46 46 47 [[package]] 47 - name = "exceptiongroup" 48 - version = "1.2.1" 49 - description = "Backport of PEP 654 (exception groups)" 50 - category = "dev" 51 - optional = false 52 - python-versions = ">=3.7" 53 - 
files = [ 54 - {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"}, 55 - {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"}, 56 - ] 57 - 58 - [package.extras] 59 - test = ["pytest (>=6)"] 60 - 61 - [[package]] 62 - name = "iniconfig" 63 - version = "2.0.0" 64 - description = "brain-dead simple config-ini parsing" 65 - category = "dev" 66 - optional = false 67 - python-versions = ">=3.7" 68 - files = [ 69 - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, 70 - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, 71 - ] 72 - 73 - [[package]] 74 48 name = "isort" 75 49 version = "5.13.2" 76 50 description = "A Python utility / library to sort Python imports." 77 - category = "dev" 78 51 optional = false 79 52 python-versions = ">=3.8.0" 53 + groups = ["dev"] 80 54 files = [ 81 55 {file = "isort-5.13.2-py3-none-any.whl", hash = "sha256:8ca5e72a8d85860d5a3fa69b8745237f2939afe12dbf656afbcb47fe72d947a6"}, 82 56 {file = "isort-5.13.2.tar.gz", hash = "sha256:48fdfcb9face5d58a4f6dde2e72a1fb8dcaf8ab26f95ab49fab84c2ddefb0109"}, ··· 89 63 name = "mccabe" 90 64 version = "0.7.0" 91 65 description = "McCabe checker, plugin for flake8" 92 - category = "dev" 93 66 optional = false 94 67 python-versions = ">=3.6" 68 + groups = ["dev"] 95 69 files = [ 96 70 {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, 97 71 {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ··· 101 75 name = "mypy" 102 76 version = "1.10.1" 103 77 description = "Optional static typing for Python" 104 - category = "dev" 105 78 optional = false 106 79 python-versions = ">=3.8" 80 + groups = 
["dev"] 107 81 files = [ 108 82 {file = "mypy-1.10.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e36f229acfe250dc660790840916eb49726c928e8ce10fbdf90715090fe4ae02"}, 109 83 {file = "mypy-1.10.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:51a46974340baaa4145363b9e051812a2446cf583dfaeba124af966fa44593f7"}, ··· 149 123 name = "mypy-extensions" 150 124 version = "1.0.0" 151 125 description = "Type system extensions for programs checked with the mypy type checker." 152 - category = "dev" 153 126 optional = false 154 127 python-versions = ">=3.5" 128 + groups = ["dev"] 155 129 files = [ 156 130 {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, 157 131 {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, 158 132 ] 159 133 160 134 [[package]] 161 - name = "packaging" 162 - version = "24.1" 163 - description = "Core utilities for Python packages" 164 - category = "dev" 165 - optional = false 166 - python-versions = ">=3.8" 167 - files = [ 168 - {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, 169 - {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, 170 - ] 171 - 172 - [[package]] 173 135 name = "platformdirs" 174 136 version = "4.2.2" 175 137 description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
176 - category = "dev" 177 138 optional = false 178 139 python-versions = ">=3.8" 140 + groups = ["dev"] 179 141 files = [ 180 142 {file = "platformdirs-4.2.2-py3-none-any.whl", hash = "sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee"}, 181 143 {file = "platformdirs-4.2.2.tar.gz", hash = "sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3"}, ··· 187 149 type = ["mypy (>=1.8)"] 188 150 189 151 [[package]] 190 - name = "pluggy" 191 - version = "1.5.0" 192 - description = "plugin and hook calling mechanisms for python" 193 - category = "dev" 194 - optional = false 195 - python-versions = ">=3.8" 196 - files = [ 197 - {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, 198 - {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, 199 - ] 200 - 201 - [package.extras] 202 - dev = ["pre-commit", "tox"] 203 - testing = ["pytest", "pytest-benchmark"] 204 - 205 - [[package]] 206 152 name = "pylint" 207 153 version = "3.2.4" 208 154 description = "python code static checker" 209 - category = "dev" 210 155 optional = false 211 156 python-versions = ">=3.8.0" 157 + groups = ["dev"] 212 158 files = [ 213 159 {file = "pylint-3.2.4-py3-none-any.whl", hash = "sha256:43b8ffdf1578e4e4439fa1f6ace402281f5dd61999192280fa12fe411bef2999"}, 214 160 {file = "pylint-3.2.4.tar.gz", hash = "sha256:5753d27e49a658b12a48c2883452751a2ecfc7f38594e0980beb03a6e77e6f86"}, ··· 219 165 colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} 220 166 dill = [ 221 167 {version = ">=0.2", markers = "python_version < \"3.11\""}, 222 - {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, 223 168 {version = ">=0.3.7", markers = "python_version >= \"3.12\""}, 169 + {version = ">=0.3.6", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, 224 170 ] 225 171 isort = ">=4.2.5,<5.13.0 || >5.13.0,<6" 
226 172 mccabe = ">=0.6,<0.8" ··· 234 180 testutils = ["gitpython (>3)"] 235 181 236 182 [[package]] 237 - name = "pytest" 238 - version = "8.2.2" 239 - description = "pytest: simple powerful testing with Python" 240 - category = "dev" 241 - optional = false 242 - python-versions = ">=3.8" 243 - files = [ 244 - {file = "pytest-8.2.2-py3-none-any.whl", hash = "sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343"}, 245 - {file = "pytest-8.2.2.tar.gz", hash = "sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977"}, 246 - ] 247 - 248 - [package.dependencies] 249 - colorama = {version = "*", markers = "sys_platform == \"win32\""} 250 - exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} 251 - iniconfig = "*" 252 - packaging = "*" 253 - pluggy = ">=1.5,<2.0" 254 - tomli = {version = ">=1", markers = "python_version < \"3.11\""} 255 - 256 - [package.extras] 257 - dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] 258 - 259 - [[package]] 260 183 name = "ruff" 261 184 version = "0.5.0" 262 185 description = "An extremely fast Python linter and code formatter, written in Rust." 
263 - category = "dev" 264 186 optional = false 265 187 python-versions = ">=3.7" 188 + groups = ["dev"] 266 189 files = [ 267 190 {file = "ruff-0.5.0-py3-none-linux_armv6l.whl", hash = "sha256:ee770ea8ab38918f34e7560a597cc0a8c9a193aaa01bfbd879ef43cb06bd9c4c"}, 268 191 {file = "ruff-0.5.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:38f3b8327b3cb43474559d435f5fa65dacf723351c159ed0dc567f7ab735d1b6"}, ··· 288 211 name = "tomli" 289 212 version = "2.0.1" 290 213 description = "A lil' TOML parser" 291 - category = "dev" 292 214 optional = false 293 215 python-versions = ">=3.7" 216 + groups = ["dev"] 217 + markers = "python_version < \"3.11\"" 294 218 files = [ 295 219 {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, 296 220 {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ··· 300 224 name = "tomlkit" 301 225 version = "0.12.5" 302 226 description = "Style preserving TOML library" 303 - category = "dev" 304 227 optional = false 305 228 python-versions = ">=3.7" 229 + groups = ["dev"] 306 230 files = [ 307 231 {file = "tomlkit-0.12.5-py3-none-any.whl", hash = "sha256:af914f5a9c59ed9d0762c7b64d3b5d5df007448eb9cd2edc8a46b1eafead172f"}, 308 232 {file = "tomlkit-0.12.5.tar.gz", hash = "sha256:eef34fba39834d4d6b73c9ba7f3e4d1c417a4e56f89a7e96e090dd0d24b8fb3c"}, ··· 312 236 name = "typing-extensions" 313 237 version = "4.12.2" 314 238 description = "Backported and Experimental Type Hints for Python 3.8+" 315 - category = "dev" 316 239 optional = false 317 240 python-versions = ">=3.8" 241 + groups = ["dev"] 318 242 files = [ 319 243 {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, 320 244 {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, 321 245 ] 322 246 323 247 [metadata] 
324 - lock-version = "2.0" 248 + lock-version = "2.1" 325 249 python-versions = ">=3.8" 326 - content-hash = "14b1642a39cb2d76915d55eb1a56f285beebe35f6ddcc97ac70e05b5fefd00db" 250 + content-hash = "3f6fef66d0d0471a2e30ff1b039615bdc1dbbec5e2aa73589c4461d9e124e546"
-1
pyproject.toml
··· 26 26 [tool.poetry.group.dev.dependencies] 27 27 mypy = "^1.10.0" 28 28 pylint = "^3.2.0" 29 - pytest = "^8.2.0" 30 29 ruff = "^0.5.0" 31 30 32 31 [tool.poetry.scripts]
+337 -58
scrapscript.py
··· 34 34 return c.isalnum() or c in ("$", "'", "_") 35 35 36 36 37 + @dataclass(eq=True, unsafe_hash=True) 38 + class SourceLocation: 39 + lineno: int = dataclasses.field(default=-1) 40 + colno: int = dataclasses.field(default=-1) 41 + byteno: int = dataclasses.field(default=-1) 42 + 43 + 44 + @dataclass(eq=True, unsafe_hash=True) 45 + class SourceExtent: 46 + start: SourceLocation = dataclasses.field(default_factory=SourceLocation) 47 + end: SourceLocation = dataclasses.field(default_factory=SourceLocation) 48 + 49 + 37 50 @dataclass(eq=True) 38 51 class Token: 39 - lineno: int = dataclasses.field(default=-1, init=False, compare=False) 52 + source_extent: SourceExtent = dataclasses.field(default_factory=SourceExtent, init=False, compare=False) 40 53 41 54 42 55 @dataclass(eq=True) ··· 107 120 108 121 109 122 @dataclass(eq=True) 110 - class VariantToken(Token): 111 - value: str 123 + class Hash(Token): 124 + # # 125 + pass 112 126 113 127 114 128 @dataclass(eq=True) ··· 116 130 pass 117 131 118 132 133 + def num_bytes_as_utf8(s: str) -> int: 134 + return len(s.encode(encoding="UTF-8")) 135 + 136 + 119 137 class Lexer: 120 138 def __init__(self, text: str): 121 139 self.text: str = text 122 140 self.idx: int = 0 123 - self.lineno: int = 1 124 - self.colno: int = 1 141 + self._lineno: int = 1 142 + self._colno: int = 1 125 143 self.line: str = "" 144 + self._byteno: int = 0 145 + self.current_token_source_extent: SourceExtent = SourceExtent( 146 + start=SourceLocation( 147 + lineno=self._lineno, 148 + colno=self._colno, 149 + byteno=self._byteno, 150 + ), 151 + end=SourceLocation( 152 + lineno=self._lineno, 153 + colno=self._colno, 154 + byteno=self._byteno, 155 + ), 156 + ) 157 + self.token_start_idx: int = self.idx 158 + self.token_end_idx: int = self.token_start_idx 159 + 160 + @property 161 + def lineno(self) -> int: 162 + return self._lineno 163 + 164 + @property 165 + def colno(self) -> int: 166 + return self._colno 167 + 168 + @property 169 + def byteno(self) 
-> int: 170 + return self._byteno 171 + 172 + def mark_token_start(self) -> None: 173 + self.current_token_source_extent.start.lineno = self._lineno 174 + self.current_token_source_extent.start.colno = self._colno 175 + self.current_token_source_extent.start.byteno = self._byteno 176 + self.token_start_idx = self.idx 177 + 178 + def mark_token_end(self) -> None: 179 + self.current_token_source_extent.end.lineno = self._lineno 180 + self.current_token_source_extent.end.colno = self._colno 181 + self.current_token_source_extent.end.byteno = self._byteno 182 + self.token_end_idx = self.idx 126 183 127 184 def has_input(self) -> bool: 128 185 return self.idx < len(self.text) 129 186 130 187 def read_char(self) -> str: 188 + self.mark_token_end() 131 189 c = self.peek_char() 132 190 if c == "\n": 133 - self.lineno += 1 134 - self.colno = 1 191 + self._lineno += 1 192 + self._colno = 1 135 193 self.line = "" 136 194 else: 137 195 self.line += c 138 - self.colno += 1 196 + self._colno += 1 139 197 self.idx += 1 198 + self._byteno += num_bytes_as_utf8(c) 140 199 return c 141 200 142 201 def peek_char(self) -> str: ··· 146 205 147 206 def make_token(self, cls: type, *args: Any) -> Token: 148 207 result: Token = cls(*args) 149 - result.lineno = self.lineno 208 + 209 + # Set start of token's source extent 210 + result.source_extent.start.lineno = self.current_token_source_extent.start.lineno 211 + result.source_extent.start.colno = self.current_token_source_extent.start.colno 212 + result.source_extent.start.byteno = self.current_token_source_extent.start.byteno 213 + 214 + # Set end of token's source extent 215 + result.source_extent.end.colno = self.current_token_source_extent.end.colno 216 + result.source_extent.end.lineno = self.current_token_source_extent.end.lineno 217 + result.source_extent.end.byteno = self.current_token_source_extent.end.byteno 218 + 150 219 return result 151 220 152 - def read_one(self) -> Token: 221 + def read_token(self) -> Token: 222 + # Consume 
all whitespace 153 223 while self.has_input(): 224 + # Keep updating the token start location until we exhaust all whitespace 225 + self.mark_token_start() 154 226 c = self.read_char() 155 227 if not c.isspace(): 156 228 break ··· 161 233 if c == "-": 162 234 if self.has_input() and self.peek_char() == "-": 163 235 self.read_comment() 164 - return self.read_one() 236 + # Need to start reading a new token 237 + return self.read_token() 165 238 return self.read_op(c) 166 239 if c == "#": 167 - value = self.read_one() 168 - if isinstance(value, EOF): 169 - raise UnexpectedEOFError("while reading symbol") 170 - if not isinstance(value, Name): 171 - raise ParseError(f"expected name after #, got {value!r}") 172 - return self.make_token(VariantToken, value.value) 240 + return self.make_token(Hash) 173 241 if c == "~": 174 242 if self.has_input() and self.peek_char() == "~": 175 243 self.read_char() ··· 191 259 return self.read_op(c) 192 260 if is_identifier_char(c): 193 261 return self.read_var(c) 194 - raise ParseError(f"unexpected token {c!r}", ("<input>", self.lineno, self.colno, self.line)) 262 + raise InvalidTokenError( 263 + SourceExtent( 264 + start=SourceLocation( 265 + lineno=self.current_token_source_extent.start.lineno, 266 + colno=self.current_token_source_extent.start.colno, 267 + byteno=self.current_token_source_extent.start.byteno, 268 + ), 269 + end=SourceLocation( 270 + lineno=self.current_token_source_extent.end.lineno, 271 + colno=self.current_token_source_extent.end.colno, 272 + byteno=self.current_token_source_extent.end.byteno, 273 + ), 274 + ) 275 + ) 195 276 196 277 def read_string(self) -> Token: 197 278 buf = "" ··· 252 333 def read_bytes(self) -> Token: 253 334 buf = "" 254 335 while self.has_input(): 255 - if (c := self.read_char()).isspace(): 336 + if self.peek_char().isspace(): 256 337 break 257 - buf += c 338 + buf += self.read_char() 258 339 base, _, value = buf.rpartition("'") 259 340 return self.make_token(BytesLit, value, int(base) if 
base else 64) 260 341 ··· 262 343 def tokenize(x: str) -> typing.List[Token]: 263 344 lexer = Lexer(x) 264 345 tokens = [] 265 - while (token := lexer.read_one()) and not isinstance(token, EOF): 346 + while (token := lexer.read_token()) and not isinstance(token, EOF): 266 347 tokens.append(token) 267 348 return tokens 268 349 ··· 341 422 assert " " not in OPER_CHARS 342 423 343 424 344 - class ParseError(SyntaxError): 425 + class SyntacticError(Exception): 345 426 pass 427 + 428 + 429 + class ParseError(SyntacticError): 430 + pass 431 + 432 + 433 + @dataclass(eq=True, frozen=True, unsafe_hash=True) 434 + class UnexpectedTokenError(ParseError): 435 + unexpected_token: Token 436 + 437 + 438 + @dataclass(eq=True, frozen=True, unsafe_hash=True) 439 + class InvalidTokenError(ParseError): 440 + unexpected_token: SourceExtent = dataclasses.field(default_factory=SourceExtent, compare=False) 346 441 347 442 348 443 # TODO(max): Replace with EOFError? ··· 383 478 elif isinstance(token, Name): 384 479 # TODO: Handle kebab case vars 385 480 return Var(token.value) 386 - elif isinstance(token, VariantToken): 387 - # It needs to be higher than the precedence of the -> operator so that 388 - # we can match variants in MatchFunction 389 - # It needs to be higher than the precedence of the && operator so that 390 - # we can use #true() and #false() in boolean expressions 391 - # It needs to be higher than the precedence of juxtaposition so that 392 - # f #true() #false() is parsed as f(TRUE)(FALSE) 393 - return Variant(token.value, parse_binary(tokens, PS[""].pr + 1)) 481 + elif isinstance(token, Hash): 482 + if tokens and isinstance(variant := tokens[0], Name): 483 + tokens.pop(0) 484 + # It needs to be higher than the precedence of the -> operator so that 485 + # we can match variants in MatchFunction 486 + # It needs to be higher than the precedence of the && operator so that 487 + # we can use #true() and #false() in boolean expressions 488 + # It needs to be higher than the 
precedence of juxtaposition so that 489 + # f #true() #false() is parsed as f(TRUE)(FALSE) 490 + return Variant(variant.value, parse_binary(tokens, PS[""].pr + 1)) 491 + elif tokens: 492 + raise UnexpectedTokenError(variant) 493 + else: 494 + raise UnexpectedEOFError("unexpected end of input") 394 495 elif isinstance(token, BytesLit): 395 496 base = token.base 396 497 if base == 85: ··· 475 576 return Float(-r.value) 476 577 return Binop(BinopKind.SUB, Int(0), r) 477 578 else: 478 - raise ParseError(f"unexpected token {token!r}") 579 + raise UnexpectedTokenError(token) 479 580 480 581 481 582 def parse_binary(tokens: typing.List[Token], p: float) -> "Object": ··· 1741 1842 self.assertEqual(l.lineno, 2) 1742 1843 self.assertEqual(l.colno, 1) 1743 1844 1845 + def test_read_char_increments_byteno(self) -> None: 1846 + l = Lexer("abc") 1847 + l.read_char() 1848 + self.assertEqual(l.byteno, 1) 1849 + l.read_char() 1850 + self.assertEqual(l.byteno, 2) 1851 + l.read_char() 1852 + self.assertEqual(l.byteno, 3) 1853 + 1744 1854 def test_read_char_appends_to_line(self) -> None: 1745 1855 l = Lexer("ab\nc") 1746 1856 l.read_char() ··· 1749 1859 l.read_char() 1750 1860 self.assertEqual(l.line, "") 1751 1861 1752 - def test_read_one_sets_lineno(self) -> None: 1862 + def test_read_token_sets_start_and_end_linenos(self) -> None: 1753 1863 l = Lexer("a b \n c d") 1754 - a = l.read_one() 1755 - b = l.read_one() 1756 - c = l.read_one() 1757 - d = l.read_one() 1758 - self.assertEqual(a.lineno, 1) 1759 - self.assertEqual(b.lineno, 1) 1760 - self.assertEqual(c.lineno, 2) 1761 - self.assertEqual(d.lineno, 2) 1864 + a = l.read_token() 1865 + b = l.read_token() 1866 + c = l.read_token() 1867 + d = l.read_token() 1868 + 1869 + self.assertEqual(a.source_extent.start.lineno, 1) 1870 + self.assertEqual(a.source_extent.end.lineno, 1) 1871 + 1872 + self.assertEqual(b.source_extent.start.lineno, 1) 1873 + self.assertEqual(b.source_extent.end.lineno, 1) 1874 + 1875 + 
self.assertEqual(c.source_extent.start.lineno, 2) 1876 + self.assertEqual(c.source_extent.end.lineno, 2) 1877 + 1878 + self.assertEqual(d.source_extent.start.lineno, 2) 1879 + self.assertEqual(d.source_extent.end.lineno, 2) 1880 + 1881 + def test_read_token_sets_source_extents_for_variables(self) -> None: 1882 + l = Lexer("aa bbbb \n ccccc ddddddd") 1883 + 1884 + a = l.read_token() 1885 + b = l.read_token() 1886 + c = l.read_token() 1887 + d = l.read_token() 1888 + 1889 + self.assertEqual(a.source_extent.start.lineno, 1) 1890 + self.assertEqual(a.source_extent.end.lineno, 1) 1891 + self.assertEqual(a.source_extent.start.colno, 1) 1892 + self.assertEqual(a.source_extent.end.colno, 2) 1893 + self.assertEqual(a.source_extent.start.byteno, 0) 1894 + self.assertEqual(a.source_extent.end.byteno, 1) 1895 + 1896 + self.assertEqual(b.source_extent.start.lineno, 1) 1897 + self.assertEqual(b.source_extent.end.lineno, 1) 1898 + self.assertEqual(b.source_extent.start.colno, 4) 1899 + self.assertEqual(b.source_extent.end.colno, 7) 1900 + self.assertEqual(b.source_extent.start.byteno, 3) 1901 + self.assertEqual(b.source_extent.end.byteno, 6) 1902 + 1903 + self.assertEqual(c.source_extent.start.lineno, 2) 1904 + self.assertEqual(c.source_extent.end.lineno, 2) 1905 + self.assertEqual(c.source_extent.start.colno, 2) 1906 + self.assertEqual(c.source_extent.end.colno, 6) 1907 + self.assertEqual(c.source_extent.start.byteno, 10) 1908 + self.assertEqual(c.source_extent.end.byteno, 14) 1909 + 1910 + self.assertEqual(d.source_extent.start.lineno, 2) 1911 + self.assertEqual(d.source_extent.end.lineno, 2) 1912 + self.assertEqual(d.source_extent.start.colno, 8) 1913 + self.assertEqual(d.source_extent.end.colno, 14) 1914 + self.assertEqual(d.source_extent.start.byteno, 16) 1915 + self.assertEqual(d.source_extent.end.byteno, 22) 1916 + 1917 + def test_read_token_correctly_sets_source_extents_for_variants(self) -> None: 1918 + l = Lexer("# \n\r\n\t abc") 1919 + 1920 + a = l.read_token() 1921 + 
b = l.read_token() 1922 + 1923 + self.assertEqual(a.source_extent.start.lineno, 1) 1924 + self.assertEqual(a.source_extent.end.lineno, 1) 1925 + self.assertEqual(a.source_extent.start.colno, 1) 1926 + # TODO(max): Should tabs count as one column? 1927 + self.assertEqual(a.source_extent.end.colno, 1) 1928 + 1929 + self.assertEqual(b.source_extent.start.lineno, 3) 1930 + self.assertEqual(b.source_extent.end.lineno, 3) 1931 + self.assertEqual(b.source_extent.start.colno, 3) 1932 + self.assertEqual(b.source_extent.end.colno, 5) 1933 + 1934 + def test_read_token_correctly_sets_source_extents_for_strings(self) -> None: 1935 + l = Lexer('"今日は、Maxさん。"') 1936 + a = l.read_token() 1937 + 1938 + self.assertEqual(a.source_extent.start.lineno, 1) 1939 + self.assertEqual(a.source_extent.end.lineno, 1) 1940 + 1941 + self.assertEqual(a.source_extent.start.colno, 1) 1942 + self.assertEqual(a.source_extent.end.colno, 12) 1943 + 1944 + self.assertEqual(a.source_extent.start.byteno, 0) 1945 + self.assertEqual(a.source_extent.end.byteno, 25) 1946 + 1947 + def test_read_token_correctly_sets_source_extents_for_byte_literals(self) -> None: 1948 + l = Lexer("~~QUJD ~~85'K|(_ ~~64'QUJD\n ~~32'IFBEG=== ~~16'414243") 1949 + a = l.read_token() 1950 + b = l.read_token() 1951 + c = l.read_token() 1952 + d = l.read_token() 1953 + e = l.read_token() 1954 + 1955 + self.assertEqual(a.source_extent.start.lineno, 1) 1956 + self.assertEqual(a.source_extent.end.lineno, 1) 1957 + self.assertEqual(a.source_extent.start.colno, 1) 1958 + self.assertEqual(a.source_extent.end.colno, 6) 1959 + self.assertEqual(a.source_extent.start.byteno, 0) 1960 + self.assertEqual(a.source_extent.end.byteno, 5) 1961 + 1962 + self.assertEqual(b.source_extent.start.lineno, 1) 1963 + self.assertEqual(b.source_extent.end.lineno, 1) 1964 + self.assertEqual(b.source_extent.start.colno, 8) 1965 + self.assertEqual(b.source_extent.end.colno, 16) 1966 + self.assertEqual(b.source_extent.start.byteno, 7) 1967 + 
self.assertEqual(b.source_extent.end.byteno, 15) 1968 + 1969 + self.assertEqual(c.source_extent.start.lineno, 1) 1970 + self.assertEqual(c.source_extent.end.lineno, 1) 1971 + self.assertEqual(c.source_extent.start.colno, 18) 1972 + self.assertEqual(c.source_extent.end.colno, 26) 1973 + self.assertEqual(c.source_extent.start.byteno, 17) 1974 + self.assertEqual(c.source_extent.end.byteno, 25) 1975 + 1976 + self.assertEqual(d.source_extent.start.lineno, 2) 1977 + self.assertEqual(d.source_extent.end.lineno, 2) 1978 + self.assertEqual(d.source_extent.start.colno, 2) 1979 + self.assertEqual(d.source_extent.end.colno, 14) 1980 + self.assertEqual(d.source_extent.start.byteno, 28) 1981 + self.assertEqual(d.source_extent.end.byteno, 40) 1982 + 1983 + self.assertEqual(e.source_extent.start.lineno, 2) 1984 + self.assertEqual(e.source_extent.end.lineno, 2) 1985 + self.assertEqual(e.source_extent.start.colno, 16) 1986 + self.assertEqual(e.source_extent.end.colno, 26) 1987 + self.assertEqual(e.source_extent.start.byteno, 42) 1988 + self.assertEqual(e.source_extent.end.byteno, 52) 1989 + 1990 + def test_read_token_correctly_sets_source_extents_for_numbers(self) -> None: 1991 + l = Lexer("123 123.456") 1992 + a = l.read_token() 1993 + b = l.read_token() 1994 + 1995 + self.assertEqual(a.source_extent.start.lineno, 1) 1996 + self.assertEqual(a.source_extent.end.lineno, 1) 1997 + self.assertEqual(a.source_extent.start.colno, 1) 1998 + self.assertEqual(a.source_extent.end.colno, 3) 1999 + self.assertEqual(a.source_extent.start.byteno, 0) 2000 + self.assertEqual(a.source_extent.end.byteno, 2) 2001 + 2002 + self.assertEqual(b.source_extent.start.lineno, 1) 2003 + self.assertEqual(b.source_extent.end.lineno, 1) 2004 + self.assertEqual(b.source_extent.start.colno, 5) 2005 + self.assertEqual(b.source_extent.end.colno, 11) 2006 + self.assertEqual(b.source_extent.start.byteno, 4) 2007 + self.assertEqual(b.source_extent.end.byteno, 10) 2008 + 2009 + def 
test_read_token_correctly_sets_source_extents_for_operators(self) -> None: 2010 + l = Lexer("> >>") 2011 + a = l.read_token() 2012 + b = l.read_token() 2013 + 2014 + self.assertEqual(a.source_extent.start.lineno, 1) 2015 + self.assertEqual(a.source_extent.end.lineno, 1) 2016 + self.assertEqual(a.source_extent.start.colno, 1) 2017 + self.assertEqual(a.source_extent.end.colno, 1) 2018 + self.assertEqual(a.source_extent.start.byteno, 0) 2019 + self.assertEqual(a.source_extent.end.byteno, 0) 2020 + 2021 + self.assertEqual(b.source_extent.start.lineno, 1) 2022 + self.assertEqual(b.source_extent.end.lineno, 1) 2023 + self.assertEqual(b.source_extent.start.colno, 3) 2024 + self.assertEqual(b.source_extent.end.colno, 4) 2025 + self.assertEqual(b.source_extent.start.byteno, 2) 2026 + self.assertEqual(b.source_extent.end.byteno, 3) 1762 2027 1763 2028 def test_tokenize_list_with_only_spread(self) -> None: 1764 2029 self.assertEqual(tokenize("[ ... ]"), [LeftBracket(), Operator("..."), RightBracket()]) ··· 1838 2103 ], 1839 2104 ) 1840 2105 1841 - def test_tokenize_variant_with_space(self) -> None: 1842 - self.assertEqual(tokenize("# abc"), [VariantToken("abc")]) 2106 + def test_tokenize_variant_with_whitespace(self) -> None: 2107 + self.assertEqual(tokenize("# \n\r\n\t abc"), [Hash(), Name("abc")]) 1843 2108 1844 2109 def test_tokenize_variant_with_no_space(self) -> None: 1845 - self.assertEqual(tokenize("#abc"), [VariantToken("abc")]) 1846 - 1847 - def test_tokenize_variant_non_name_raises_parse_error(self) -> None: 1848 - with self.assertRaisesRegex(ParseError, "expected name"): 1849 - tokenize("#1") 1850 - 1851 - def test_tokenize_variant_eof_raises_unexpected_eof_error(self) -> None: 1852 - with self.assertRaisesRegex(UnexpectedEOFError, "while reading symbol"): 1853 - tokenize("#") 2110 + self.assertEqual(tokenize("#abc"), [Hash(), Name("abc")]) 1854 2111 1855 2112 1856 2113 class ParserTests(unittest.TestCase): ··· 2007 2264 ) 2008 2265 2009 2266 def 
test_parse_list_with_only_comma_raises_parse_error(self) -> None: 2010 - with self.assertRaisesRegex(ParseError, re.escape("unexpected token Operator(lineno=-1, value=',')")): 2267 + with self.assertRaises(UnexpectedTokenError) as parse_error: 2011 2268 parse([LeftBracket(), Operator(","), RightBracket()]) 2012 2269 2270 + self.assertEqual(parse_error.exception.unexpected_token, Operator(",")) 2271 + 2013 2272 def test_parse_list_with_two_commas_raises_parse_error(self) -> None: 2014 - with self.assertRaisesRegex(ParseError, re.escape("unexpected token Operator(lineno=-1, value=',')")): 2273 + with self.assertRaises(UnexpectedTokenError) as parse_error: 2015 2274 parse([LeftBracket(), Operator(","), Operator(","), RightBracket()]) 2016 2275 2276 + self.assertEqual(parse_error.exception.unexpected_token, Operator(",")) 2277 + 2017 2278 def test_parse_list_with_trailing_comma_raises_parse_error(self) -> None: 2018 - with self.assertRaisesRegex(ParseError, re.escape("unexpected token RightBracket(lineno=-1)")): 2279 + with self.assertRaises(UnexpectedTokenError) as parse_error: 2019 2280 parse([LeftBracket(), IntLit(1), Operator(","), RightBracket()]) 2281 + 2282 + self.assertEqual(parse_error.exception.unexpected_token, RightBracket()) 2020 2283 2021 2284 def test_parse_assign(self) -> None: 2022 2285 self.assertEqual( ··· 2308 2571 ) 2309 2572 2310 2573 def test_parse_record_with_only_comma_raises_parse_error(self) -> None: 2311 - with self.assertRaisesRegex(ParseError, re.escape("unexpected token Operator(lineno=-1, value=',')")): 2574 + with self.assertRaises(UnexpectedTokenError) as parse_error: 2312 2575 parse([LeftBrace(), Operator(","), RightBrace()]) 2313 2576 2577 + self.assertEqual(parse_error.exception.unexpected_token, Operator(",")) 2578 + 2314 2579 def test_parse_record_with_two_commas_raises_parse_error(self) -> None: 2315 - with self.assertRaisesRegex(ParseError, re.escape("unexpected token Operator(lineno=-1, value=',')")): 2580 + with 
self.assertRaises(UnexpectedTokenError) as parse_error: 2316 2581 parse([LeftBrace(), Operator(","), Operator(","), RightBrace()]) 2317 2582 2583 + self.assertEqual(parse_error.exception.unexpected_token, Operator(",")) 2584 + 2318 2585 def test_parse_record_with_trailing_comma_raises_parse_error(self) -> None: 2319 - with self.assertRaisesRegex(ParseError, re.escape("unexpected token RightBrace(lineno=-1)")): 2586 + with self.assertRaises(UnexpectedTokenError) as parse_error: 2320 2587 parse([LeftBrace(), Name("x"), Operator("="), IntLit(1), Operator(","), RightBrace()]) 2588 + 2589 + self.assertEqual(parse_error.exception.unexpected_token, RightBrace()) 2321 2590 2322 2591 def test_parse_variant_returns_variant(self) -> None: 2323 - self.assertEqual(parse([VariantToken("abc"), IntLit(1)]), Variant("abc", Int(1))) 2592 + self.assertEqual(parse([Hash(), Name("abc"), IntLit(1)]), Variant("abc", Int(1))) 2593 + 2594 + def test_parse_variant_non_name_raises_parse_error(self) -> None: 2595 + with self.assertRaises(UnexpectedTokenError) as parse_error: 2596 + parse([Hash(), IntLit(1)]) 2597 + 2598 + self.assertEqual(parse_error.exception.unexpected_token, IntLit(1)) 2599 + 2600 + def test_parse_variant_eof_raises_unexpected_eof_error(self) -> None: 2601 + with self.assertRaises(UnexpectedEOFError): 2602 + parse([Hash()]) 2324 2603 2325 2604 def test_match_with_variant(self) -> None: 2326 2605 ast = parse(tokenize("| #true () -> 123"))