Coverage for trlc/lexer.py: 100%
323 statements
« prev ^ index » next coverage.py v7.10.7, created at 2026-06-18 05:43 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2026-06-18 05:43 +0000
1#!/usr/bin/env python3
2#
3# TRLC - Treat Requirements Like Code
4# Copyright (C) 2022-2023, 2026 Bayerische Motoren Werke Aktiengesellschaft (BMW AG)
5#
6# This file is part of the TRLC Python Reference Implementation.
7#
8# TRLC is free software: you can redistribute it and/or modify it
9# under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12#
13# TRLC is distributed in the hope that it will be useful, but WITHOUT
14# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
16# License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with TRLC. If not, see <https://www.gnu.org/licenses/>.
21from fractions import Fraction
22from abc import ABCMeta, abstractmethod
24from trlc.errors import Location, Message_Handler
def triple_quoted_string_value(raw_value):
    """Return the value denoted by a triple-quoted string literal.

    raw_value is the complete literal including its delimiters (three
    single or three double quotes on each side).

    The text between the delimiters is stripped of surrounding
    whitespace and split into lines. The first line is used as-is. For
    the remaining lines the leading blanks/tabs shared by all non-empty
    lines (starting from the second non-empty line) are removed, and
    trailing whitespace is dropped. An empty body yields "".
    """
    # lobster-trace: LRM.Complex_String_Value
    assert isinstance(raw_value, str)
    assert len(raw_value) >= 6
    assert raw_value.startswith("'''") or raw_value.startswith('"""')
    assert raw_value[:3] == raw_value[-3:]

    body_lines = raw_value[3:-3].strip().splitlines()
    if not body_lines:
        return ""

    first_line = body_lines[0]
    populated = [text for text in body_lines if text.strip()]
    if len(populated) < 2:
        # Nothing follows the first line, so there is no indentation
        # to remove.
        return first_line

    # The indentation candidate is the run of blanks and tabs at the
    # start of the second non-empty line. That line contains at least
    # one character that is neither, so the run always ends.
    reference = populated[1]
    indent = reference[:len(reference) - len(reference.lstrip(" \t"))]
    keep = len(indent)

    # Shrink the candidate to the prefix shared with every later
    # non-empty line.
    for text in body_lines[2:]:
        if not text.strip():
            continue
        matched = 0
        while (matched < keep and
               matched < len(text) and
               text[matched] == indent[matched]):
            matched += 1
        keep = matched

    tail = [text[keep:].rstrip() for text in body_lines[1:]]
    return "\n".join([first_line] + tail)
class Source_Reference(Location):
    """A Location that also records which span of lexer content it covers.

    start_pos and end_pos are inclusive indices into lexer.content;
    start_line and start_col are handed to the Location constructor
    together with the lexer's file name.
    """

    def __init__(self, lexer, start_line, start_col, start_pos, end_pos):
        assert isinstance(lexer, TRLC_Lexer)
        assert isinstance(start_line, int)
        assert isinstance(start_col, int)
        assert isinstance(start_pos, int)
        assert isinstance(end_pos, int)
        assert 0 <= start_pos <= end_pos < lexer.length
        super().__init__(lexer.file_name,
                         start_line,
                         start_col)
        self.lexer = lexer
        self.start_pos = start_pos
        self.end_pos = end_pos

    def text(self):
        """Return the slice of the lexer content covered by this span."""
        return self.lexer.content[self.start_pos:self.end_pos + 1]

    def context_lines(self):
        """Return [source line, marker line] for this span.

        The first element is the line containing start_pos with its
        leading whitespace removed; the second is a string of spaces
        followed by "^" characters underlining the span. The underline
        never extends beyond the end of that first line.
        """
        # Collect the characters from start_pos back to the beginning
        # of its line.
        line = ""
        n = self.start_pos
        while n >= 0:
            if self.lexer.content[n] == "\n":
                break
            line = self.lexer.content[n] + line
            n -= 1
        offset = self.start_pos - n - 1

        # Then collect the rest of the line after start_pos.
        n = self.start_pos + 1
        while n < self.lexer.length:
            if self.lexer.content[n] == "\n":
                break
            line = line + self.lexer.content[n]
            n += 1
        maxtrail = n - self.start_pos
        tlen = self.end_pos + 1 - self.start_pos

        # Removing the indentation shifts the marker left by the same
        # amount.
        stripped_line = line.lstrip()
        stripped_offset = offset - (len(line) - len(stripped_line))

        return [stripped_line,
                " " * stripped_offset + "^" * min(tlen, maxtrail)]

    def get_end_location(self):
        """Return a Location for the last character of this span.

        The line is the start line plus the number of newlines inside
        the span. The column is the distance from the closest newline
        at or before end_pos (at least 1), or end_pos + 1 if there is
        no such newline.
        """
        # NOTE(review): relies on line_no and file_name being set by
        # the Location constructor - confirm against trlc.errors.
        content = self.lexer.content
        end_line = self.line_no + content.count("\n",
                                                self.start_pos,
                                                self.end_pos + 1)

        # Search all the way back to index 0. The previous
        # range(end_pos, 1, -1) scan skipped indices 1 and 0, so a
        # newline there was missed and the column came out too large.
        last_newline = content.rfind("\n", 0, self.end_pos + 1)
        if last_newline < 0:
            end_col = self.end_pos + 1
        else:
            end_col = max(self.end_pos - last_newline, 1)

        return Location(self.file_name, end_line, end_col)
class Token_Base:
    """Minimal token record: where it was found, its kind, its value."""

    def __init__(self, location, kind, value):
        assert isinstance(location, Location)
        assert isinstance(kind, str)
        # value is stored unchecked here
        self.value = value
        self.kind = kind
        self.location = location
class Token(Token_Base):
    """A TRLC token.

    The kind must be one of the keys of KIND. The permitted type of
    value depends on the kind:

    * COMMENT, IDENTIFIER, KEYWORD, OPERATOR, STRING: str
    * INTEGER: int
    * DECIMAL: Fraction
    * everything else: None

    ast_link is stored as given; it is not interpreted by this class.
    """

    # Maps each token kind to a human readable description.
    KIND = {
        "COMMENT"    : "comment",
        "IDENTIFIER" : "identifier",
        "KEYWORD"    : "keyword",
        "BRA"        : "opening parenthesis '('",
        "KET"        : "closing parenthesis ')'",
        "S_BRA"      : "opening bracket '['",
        "S_KET"      : "closing bracket ']'",
        "C_BRA"      : "opening brace '{'",
        "C_KET"      : "closing brace '}'",
        "COMMA"      : "comma ','",
        # Was misspelled "separtor"
        "AT"         : "separator '@'",
        "SEMICOLON"  : "separator ';'",
        "COLON"      : "separator ':'",
        "DOT"        : ".",
        "RANGE"      : "..",
        "ASSIGN"     : "=",
        "OPERATOR"   : "operator",
        "ARROW"      : "->",
        "INTEGER"    : "integer literal",
        "DECIMAL"    : "decimal literal",
        "STRING"     : "string literal",
    }

    def __init__(self, location, kind, value=None, ast_link=None):
        assert kind in Token.KIND
        if kind in ("COMMENT", "IDENTIFIER",
                    "KEYWORD", "OPERATOR", "STRING"):
            assert isinstance(value, str)
        elif kind == "INTEGER":
            assert isinstance(value, int)
        elif kind == "DECIMAL":
            assert isinstance(value, Fraction)
        else:
            # Pure punctuation carries no value
            assert value is None
        super().__init__(location, kind, value)
        self.ast_link = ast_link

    def __repr__(self):
        if self.value is None:
            return "%s_Token" % self.kind
        return "%s_Token(%s)" % (self.kind, self.value)
class Lexer_Base(metaclass=ABCMeta):
    """Common character-level machinery for lexers.

    The lexer keeps a three character window over the content: cc is
    the current character, nc the next one and nnc the one after that.
    Each is None when it lies outside the content. lexpos is the index
    of cc; line_no and col_no are updated as the window moves.
    """

    def __init__(self, mh, content):
        assert isinstance(mh, Message_Handler)
        assert isinstance(content, str)
        self.mh = mh
        self.content = content
        self.length = len(self.content)
        self.tokens = []

        # Start three positions before the content. The two advances
        # below fill nc and nnc while cc is still None and lexpos is
        # -1, so the very next advance lands on the first character.
        self.lexpos = -3
        self.line_no = 0
        self.col_no = 0
        self.cc = self.nc = self.nnc = None

        for _ in range(2):
            self.advance()

    @staticmethod
    def is_alpha(char):
        """True if char is an ASCII letter."""
        # lobster-trace: LRM.Identifier
        return char.isascii() and char.isalpha()

    @staticmethod
    def is_numeric(char):
        """True if char is an ASCII digit."""
        # lobster-trace: LRM.Integers
        # lobster-trace: LRM.Decimals
        return char.isascii() and char.isdigit()

    @staticmethod
    def is_alnum(char):
        """True if char is an ASCII letter or digit."""
        # lobster-trace: LRM.Identifier
        return char.isascii() and char.isalnum()

    @abstractmethod
    def file_location(self):
        pass

    @abstractmethod
    def token(self):
        pass

    def skip_whitespace(self):
        """Move cc onto the next character that is not preceded by
        whitespace in the lookahead, always advancing at least once."""
        # lobster-trace: LRM.Whitespace
        while True:
            upcoming = self.nc
            if not upcoming or not upcoming.isspace():
                break
            self.advance()
        self.advance()

    def advance(self):
        """Shift the character window one position to the right."""
        leaving_newline = self.cc == "\n"
        self.lexpos += 1

        # A new line starts after a newline character, and when the
        # first character of the content becomes current.
        if leaving_newline or self.lexpos == 0:
            self.line_no += 1
            self.col_no = 0

        # The column only moves if there is a character to move onto.
        if self.nc is not None:
            self.col_no += 1

        lookahead = self.lexpos + 2
        if lookahead < self.length:
            incoming = self.content[lookahead]
        else:
            incoming = None
        self.cc, self.nc, self.nnc = self.nc, self.nnc, incoming
class TRLC_Lexer(Lexer_Base):
    """Lexer that turns TRLC source text into Token objects.

    The content is either given directly (file_content) or read from
    file_name as UTF-8. Each call to token() produces the next token,
    or None once the content is exhausted.

    NOTE(review): the code below assumes Message_Handler.lex_error
    does not return (i.e. raises); several branches would otherwise
    loop forever or leave `kind` unset - confirm against trlc.errors.
    """

    KEYWORDS = frozenset([
        "abs",
        "abstract",
        "and",
        "checks",
        "else",
        "elsif",
        "enum",
        "error",
        "exists",
        "extends",
        "false",
        "fatal",
        "final",
        "forall",
        "freeze",
        "if",
        "implies",
        "import",
        "in",
        "not",
        "null",
        "optional",
        "or",
        "package",
        "section",
        "separator",
        "then",
        "true",
        "tuple",
        "type",
        "warning",
        "xor"
    ])

    # Single characters that form a token on their own, mapped to the
    # token kind they produce.
    PUNCTUATION = {
        "(" : "BRA",
        ")" : "KET",
        "{" : "C_BRA",
        "}" : "C_KET",
        "[" : "S_BRA",
        "]" : "S_KET",
        "," : "COMMA",
        "@" : "AT",
        ":" : "COLON",
        ";" : "SEMICOLON",
        "/" : "OPERATOR",
        "%" : "OPERATOR",
        "+" : "OPERATOR",
        "-" : "OPERATOR",
    }

    def __init__(self, mh, file_name, file_content=None):
        assert isinstance(file_name, str)
        assert isinstance(file_content, str) or file_content is None
        self.file_name = file_name
        if file_content is None:
            # lobster-trace: LRM.File_Encoding
            # lobster-trace: LRM.File_Encoding_Fixed
            with open(file_name, "r", encoding="UTF-8") as fd:
                try:
                    super().__init__(mh, fd.read())
                except UnicodeDecodeError as err:
                    mh.lex_error(Location(file_name), str(err))
        else:
            super().__init__(mh, file_content)

    def current_location(self):
        """Return a one character Source_Reference at the cursor."""
        # lobster-exclude: Utility function
        return Source_Reference(lexer      = self,
                                start_line = self.line_no,
                                start_col  = self.col_no,
                                start_pos  = self.lexpos,
                                end_pos    = self.lexpos)

    def file_location(self):
        """Return a Location for line 1, column 1 of this file."""
        # lobster-exclude: Utility function
        return Location(self.file_name, 1, 1)

    def _span_to_cursor(self, start_line, start_col, start_pos):
        """Return a Source_Reference from the given start up to the
        cursor, clamped to the last character of the content."""
        # lobster-exclude: Utility function
        # The cursor can sit one past the end once the input has been
        # consumed; Source_Reference requires end_pos < length.
        return Source_Reference(lexer      = self,
                                start_line = start_line,
                                start_col  = start_col,
                                start_pos  = start_pos,
                                end_pos    = min(self.lexpos,
                                                 self.length - 1))

    def _finish_triple_quoted(self, quote, start_line, start_col, start_pos):
        """Consume a triple-quoted string body up to and including the
        three closing quote characters; the opening delimiter must
        already have been consumed."""
        # lobster-trace: LRM.Strings
        quotes_seen = 0
        while quotes_seen < 3:
            self.advance()
            if self.cc == quote:
                quotes_seen += 1
            else:
                quotes_seen = 0
            # Only complain while the closing delimiter is still
            # incomplete: a string that is properly closed by the very
            # last characters of the input is fine.
            if self.nc is None and quotes_seen < 3:
                self.mh.lex_error(
                    self._span_to_cursor(start_line, start_col, start_pos),
                    "unterminated triple-quoted string")

    def token(self):
        """Return the next Token, or None at the end of the content."""
        # Skip whitespace and move to the next char
        self.skip_whitespace()

        # Return if we're done
        if self.cc is None:
            return None

        start_pos  = self.lexpos
        start_line = self.line_no
        start_col  = self.col_no

        if self.cc == "/" and self.nc == "/":
            # lobster-trace: LRM.Comments
            kind = "COMMENT"
            while self.cc and self.nc != "\n":
                self.advance()

        elif self.cc == "/" and self.nc == "*":
            # lobster-trace: LRM.Comments
            kind = "COMMENT"
            while self.nc and not (self.cc == "*" and self.nc == "/"):
                self.advance()
            self.advance()

        elif self.is_alpha(self.cc):
            # lobster-trace: LRM.Identifier
            kind = "IDENTIFIER"
            while self.nc and (self.is_alnum(self.nc) or
                               self.nc == "_"):
                self.advance()

        elif self.cc in TRLC_Lexer.PUNCTUATION:
            # lobster-trace: LRM.Single_Delimiters
            kind = TRLC_Lexer.PUNCTUATION[self.cc]

        elif self.cc == "=":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            if self.nc == ">":
                kind = "ARROW"
                self.advance()
            elif self.nc == "=":
                kind = "OPERATOR"
                self.advance()
            else:
                kind = "ASSIGN"

        elif self.cc == ".":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            if self.nc == ".":
                kind = "RANGE"
                self.advance()
            else:
                kind = "DOT"

        elif self.cc in ("<", ">"):
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            kind = "OPERATOR"
            if self.nc == "=":
                self.advance()

        elif self.cc == "!":
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            kind = "OPERATOR"
            if self.nc == "=":
                self.advance()
            else:
                self.mh.lex_error(self.current_location(),
                                  "malformed != operator")

        elif self.cc == "*":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            kind = "OPERATOR"
            if self.nc == "*":
                self.advance()

        elif self.cc == '"':
            # lobster-trace: LRM.Strings
            kind = "STRING"
            if self.nc == '"' and self.nnc == '"':
                # Consume the rest of the opening delimiter
                self.advance()
                self.advance()
                self._finish_triple_quoted('"',
                                           start_line,
                                           start_col,
                                           start_pos)
            else:
                while self.nc != '"':
                    if self.nc is None:
                        self.mh.lex_error(
                            self._span_to_cursor(start_line,
                                                 start_col,
                                                 start_pos),
                            "unterminated string")
                    elif self.nc == "\n":
                        self.mh.lex_error(
                            self._span_to_cursor(start_line,
                                                 start_col,
                                                 start_pos),
                            "double quoted strings cannot include newlines")

                    self.advance()
                    # Step over an escaped quote so that it does not
                    # terminate the string
                    if self.cc == "\\" and self.nc == '"':
                        self.advance()
                # Consume the closing quote
                self.advance()

        elif self.cc == "'":
            # lobster-trace: LRM.Strings
            kind = "STRING"
            # Single quotes only ever appear as a triple-quote
            # delimiter
            for _ in range(2):
                self.advance()
                if self.cc != "'":
                    # The span is clamped, so a lone quote at the very
                    # end of the input is reported instead of tripping
                    # the Source_Reference bounds assertion
                    self.mh.lex_error(
                        self._span_to_cursor(start_line,
                                             start_col,
                                             start_pos),
                        "malformed triple-quoted string")
            self._finish_triple_quoted("'",
                                       start_line,
                                       start_col,
                                       start_pos)

        elif self.is_numeric(self.cc):
            # lobster-trace: LRM.Integers
            # lobster-trace: LRM.Decimals
            kind = "INTEGER"

            if self.cc == "0" and self.nc == "b":
                digits_allowed   = "01"
                digits_forbidden = "23456789abcdefABCDEF"
                int_base         = 2
                require_digit    = True
                decimal_allowed  = False
                self.advance()
            elif self.cc == "0" and self.nc == "x":
                digits_allowed   = "0123456789abcdefABCDEF"
                digits_forbidden = ""
                int_base         = 16
                require_digit    = True
                decimal_allowed  = False
                self.advance()
            else:
                digits_allowed   = "0123456789"
                digits_forbidden = "abcdefABCDEF"
                int_base         = 10
                require_digit    = False
                decimal_allowed  = True

            # require_digit is set after a base prefix, an underscore
            # or a decimal point: each must be followed by a digit.
            while self.nc:
                if self.nc in digits_allowed:
                    self.advance()
                    require_digit = False

                elif self.nc in digits_forbidden:
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = self.lexpos + 1,
                                         end_pos    = self.lexpos + 1),
                        "%s is not a valid base %u digit" % (self.nc,
                                                             int_base))

                elif require_digit:
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = self.lexpos + 1,
                                         end_pos    = self.lexpos + 1),
                        "base %u digit is required here" % int_base)

                elif self.nc == "_":
                    self.advance()
                    require_digit = True

                elif self.nc == "." and self.nnc == ".":
                    # This is a range token, so that one can't be part
                    # of our number anymore
                    break

                elif self.nc == ".":
                    self.advance()
                    if not decimal_allowed:
                        if int_base == 10:
                            msg = "decimal point is not allowed here"
                        else:
                            msg = ("base %u integer may not contain a"
                                   " decimal point" % int_base)
                        self.mh.lex_error(
                            Source_Reference(lexer      = self,
                                             start_line = start_line,
                                             start_col  = start_col,
                                             start_pos  = self.lexpos,
                                             end_pos    = self.lexpos),
                            msg)
                    decimal_allowed = False
                    require_digit   = True
                    kind            = "DECIMAL"

                else:  # pragma: no cover
                    # This is actually a false alarm, this line is
                    # covered (it's the only normal way to exit this
                    # loop).
                    break

            if require_digit:
                self.mh.lex_error(
                    self._span_to_cursor(start_line, start_col, start_pos),
                    "unfinished base %u integer" % int_base)

        else:
            self.mh.lex_error(self.current_location(),
                              "unexpected character '%s'" % self.cc)

        sref = self._span_to_cursor(start_line, start_col, start_pos)

        if kind == "IDENTIFIER":
            value = sref.text()
            if value in TRLC_Lexer.KEYWORDS:
                # lobster-trace: LRM.TRLC_Keywords
                kind = "KEYWORD"

        elif kind == "OPERATOR":
            value = sref.text()

        elif kind == "STRING":
            value = sref.text()
            if value.startswith('"""'):
                value = triple_quoted_string_value(value)
            elif value.startswith('"'):
                # lobster-trace: LRM.Simple_String_Value
                value = value[1:-1]
                value = value.replace('\\"', '"')
            else:
                value = triple_quoted_string_value(value)

        elif kind == "INTEGER":
            # lobster-trace: LRM.Integer_Values
            base_text = sref.text().replace("_", "")
            if int_base == 10:
                value = int(base_text)
            elif int_base == 2:
                # Drop the 0b prefix
                value = int(base_text[2:], base=2)
            else:
                # Drop the 0x prefix
                value = int(base_text[2:], base=16)

        elif kind == "DECIMAL":
            # lobster-trace: LRM.Decimal_Values
            value = Fraction(sref.text().replace("_", ""))

        elif kind == "COMMENT":
            value = sref.text()
            if value.startswith("//"):
                value = value[2:].strip()
            else:
                # Drop the opening marker, and the closing one if the
                # comment has one
                value = value[2:]
                if value.endswith("*/"):
                    value = value[:-2]
                value = value.strip()

        else:
            value = None

        return Token(sref, kind, value)
class Token_Stream(TRLC_Lexer):
    """TRLC lexer that additionally records every token it produces
    in self.tokens."""

    def token(self):
        produced = super().token()
        if produced is None:
            # End of input: nothing to record
            return None
        self.tokens.append(produced)
        return produced