Coverage for trlc/lexer.py: 100%
324 statements
« prev ^ index » next — coverage.py v7.7.1, created at 2025-03-27 00:52 +0000
1#!/usr/bin/env python3
2#
3# TRLC - Treat Requirements Like Code
4# Copyright (C) 2022-2023 Bayerische Motoren Werke Aktiengesellschaft (BMW AG)
5#
6# This file is part of the TRLC Python Reference Implementation.
7#
8# TRLC is free software: you can redistribute it and/or modify it
9# under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12#
13# TRLC is distributed in the hope that it will be useful, but WITHOUT
14# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
16# License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with TRLC. If not, see <https://www.gnu.org/licenses/>.
21import sys
22from fractions import Fraction
23from abc import ABCMeta, abstractmethod
25from trlc.errors import Location, Message_Handler
def triple_quoted_string_value(raw_value):
    # lobster-trace: LRM.Complex_String_Value
    """Return the semantic value of a triple-quoted string literal.

    Strips the quote characters and surrounding blank space, then
    removes the common space/tab indentation (derived from the second
    non-empty line, shrunk until all later non-empty lines share it)
    from every continuation line. Trailing whitespace on continuation
    lines is removed.
    """
    assert isinstance(raw_value, str)
    assert len(raw_value) >= 6
    assert raw_value.startswith("'''") or raw_value.startswith('"""')
    assert raw_value[:3] == raw_value[-3:]

    lines = raw_value[3:-3].strip().splitlines()
    if not lines:
        return ""

    non_empty = [ln for ln in lines if ln.strip()]

    # Candidate indentation: the leading run of spaces/tabs on the
    # second non-empty line (if any).  Since that line is non-empty,
    # the scan always stops at its first non-blank character.
    prefix = ""
    if len(non_empty) >= 2:
        for ch in non_empty[1]:
            if ch not in " \t":
                break
            prefix += ch

    # Shrink the shared prefix until every later non-empty line
    # actually starts with it.
    keep = len(prefix)
    for ln in lines[2:]:
        if not ln.strip():
            continue
        for idx in range(keep):
            if not (idx < len(ln) and ln[idx] == prefix[idx]):
                keep = idx
                break

    # First line is kept verbatim; continuation lines are de-indented
    # and right-stripped.
    dedented = [lines[0]] + [ln[keep:].rstrip() for ln in lines[1:]]
    return "\n".join(dedented)
class Source_Reference(Location):
    """A Location that additionally records the exact character span.

    Keeps the inclusive [start_pos, end_pos] indices into the owning
    lexer's content string, so diagnostics can quote the referenced
    text and underline it with a caret line.
    """
    def __init__(self, lexer, start_line, start_col, start_pos, end_pos):
        assert isinstance(lexer, TRLC_Lexer)
        assert isinstance(start_line, int)
        assert isinstance(start_col, int)
        assert isinstance(start_pos, int)
        assert isinstance(end_pos, int)
        assert 0 <= start_pos <= end_pos < lexer.length
        super().__init__(lexer.file_name,
                         start_line,
                         start_col)
        self.lexer = lexer
        self.start_pos = start_pos
        self.end_pos = end_pos

    def text(self):
        """Return the referenced source text (end_pos is inclusive)."""
        return self.lexer.content[self.start_pos:self.end_pos + 1]

    def context_lines(self):
        """Return [source line, caret line] for error reporting."""
        # Assemble the full line containing start_pos by scanning
        # backwards to the previous newline (or start of file) ...
        line = ""
        n = self.start_pos
        while n >= 0:
            if self.lexer.content[n] == "\n":
                break
            line = self.lexer.content[n] + line
            n -= 1
        # ... remembering the column of start_pos within that line ...
        offset = self.start_pos - n - 1
        # ... then forwards to the next newline (or end of file).
        n = self.start_pos + 1
        while n < self.lexer.length:
            if self.lexer.content[n] == "\n":
                break
            line = line + self.lexer.content[n]
            n += 1
        # Characters available on this line from start_pos onwards;
        # caps the caret run for spans that continue past the line.
        maxtrail = n - self.start_pos
        tlen = self.end_pos + 1 - self.start_pos

        # De-indent the quoted line and shift the caret accordingly.
        stripped_line = line.lstrip()
        stripped_offset = offset - (len(line) - len(stripped_line))

        return [stripped_line,
                " " * stripped_offset + "^" * min(tlen, maxtrail)]

    def get_end_location(self):
        """Return a plain Location for the end of the span."""
        # The end line is the start line plus the newlines inside the
        # referenced span (end_pos inclusive).
        lines_in_between = self.lexer.content[
            self.start_pos : self.end_pos + 1
        ].count("\n")
        end_line = self.line_no + lines_in_between

        end_col = self.end_pos + 1
        # NOTE(review): range(end_pos, 1, -1) never inspects indices 1
        # and 0, so a newline at the very start of the file is missed
        # and end_col falls back to end_pos + 1 — looks like a possible
        # off-by-one; confirm intended behaviour.
        for n in range(self.end_pos, 1, -1):
            if self.lexer.content[n] == "\n":
                end_col = max(self.end_pos - n, 1)
                break

        return Location(self.file_name, end_line, end_col)
class Token_Base:
    """Root class for tokens: couples a source location with a kind
    tag and an optional semantic value."""

    def __init__(self, location, kind, value):
        assert isinstance(location, Location)
        assert isinstance(kind, str)
        # value is intentionally unconstrained here; subclasses (see
        # Token) enforce the kind-specific value types.
        self.location, self.kind, self.value = location, kind, value
class Token(Token_Base):
    """A single lexed TRLC token.

    The value type depends on kind: str for textual kinds (COMMENT,
    IDENTIFIER, KEYWORD, OPERATOR, STRING), int for INTEGER, Fraction
    for DECIMAL, and None for everything else.
    """

    # Human-readable description of each token kind, used when
    # composing diagnostics.
    KIND = {
        "COMMENT"    : "comment",
        "IDENTIFIER" : "identifier",
        "KEYWORD"    : "keyword",
        "BRA"        : "opening parenthesis '('",
        "KET"        : "closing parenthesis ')'",
        "S_BRA"      : "opening bracket '['",
        "S_KET"      : "closing bracket ']'",
        "C_BRA"      : "opening brace '{'",
        "C_KET"      : "closing brace '}'",
        "COMMA"      : "comma ','",
        # Fixed typo: was "separtor '@'"
        "AT"         : "separator '@'",
        "SEMICOLON"  : "separator ';'",
        "COLON"      : "separator ':'",
        "DOT"        : ".",
        "RANGE"      : "..",
        "ASSIGN"     : "=",
        "OPERATOR"   : "operator",
        "ARROW"      : "->",
        "INTEGER"    : "integer literal",
        "DECIMAL"    : "decimal literal",
        "STRING"     : "string literal",
    }

    def __init__(self, location, kind, value=None, ast_link=None):
        assert kind in Token.KIND
        if kind in ("COMMENT", "IDENTIFIER",
                    "KEYWORD", "OPERATOR", "STRING"):
            assert isinstance(value, str)
        elif kind == "INTEGER":
            assert isinstance(value, int)
        elif kind == "DECIMAL":
            assert isinstance(value, Fraction)
        else:
            assert value is None
        super().__init__(location, kind, value)
        # Optional back-link to the AST node constructed from this
        # token (filled in later by the parser).
        self.ast_link = ast_link

    def __repr__(self):
        if self.value is None:
            return "%s_Token" % self.kind
        else:
            return "%s_Token(%s)" % (self.kind, self.value)
class Lexer_Base(metaclass=ABCMeta):
    """Abstract lexer base.

    Holds the input text, a three-character lookahead window (cc, nc,
    nnc) and the current line/column counters.  Subclasses supply
    file_location() and token().
    """

    def __init__(self, mh, content):
        assert isinstance(mh, Message_Handler)
        assert isinstance(content, str)
        self.mh = mh
        self.content = content
        self.length = len(self.content)
        self.tokens = []

        # lexpos starts at -3 so that the two advance() calls below
        # leave the window primed just before the first character.
        self.lexpos = -3
        self.line_no = 0
        self.col_no = 0
        self.cc = None
        self.nc = None
        self.nnc = None

        self.advance()
        self.advance()

    @staticmethod
    def is_alpha(char):
        # lobster-trace: LRM.Identifier
        """True iff char is an ASCII letter."""
        return char.isascii() and char.isalpha()

    @staticmethod
    def is_numeric(char):
        # lobster-trace: LRM.Integers
        # lobster-trace: LRM.Decimals
        """True iff char is an ASCII decimal digit."""
        return char.isascii() and char.isdigit()

    @staticmethod
    def is_alnum(char):
        # lobster-trace: LRM.Identifier
        """True iff char is an ASCII letter or digit."""
        return char.isascii() and char.isalnum()

    @abstractmethod
    def file_location(self):
        """Return a Location describing the file as a whole."""

    @abstractmethod
    def token(self):
        """Return the next token, or None once input is exhausted."""

    def skip_whitespace(self):
        # lobster-trace: LRM.Whitespace
        """Step over any whitespace, then onto the next character."""
        while self.nc is not None and self.nc.isspace():
            self.advance()
        self.advance()

    def advance(self):
        """Shift the lookahead window right by one character."""
        self.lexpos += 1
        # A fresh line begins after a newline (or at the very start).
        if self.cc == "\n" or self.lexpos == 0:
            self.line_no += 1
            self.col_no = 0
        if self.nc is not None:
            self.col_no += 1
        self.cc = self.nc
        self.nc = self.nnc
        probe = self.lexpos + 2
        self.nnc = self.content[probe] if probe < self.length else None
class TRLC_Lexer(Lexer_Base):
    """Lexer for the TRLC language.

    Turns a file (or an in-memory string) into a stream of Token
    objects via repeated calls to token(); lexical errors are reported
    through the message handler's lex_error (presumably aborting the
    lex by raising — confirm in trlc.errors).
    """

    # Reserved words; an IDENTIFIER matching one of these is
    # re-classified as a KEYWORD token (see token() below).
    KEYWORDS = frozenset([
        "abs",
        "abstract",
        "and",
        "checks",
        "else",
        "elsif",
        "enum",
        "error",
        "exists",
        "extends",
        "false",
        "fatal",
        "final",
        "forall",
        "freeze",
        "if",
        "implies",
        "import",
        "in",
        "not",
        "null",
        "optional",
        "or",
        "package",
        "section",
        "separator",
        "then",
        "true",
        "tuple",
        "type",
        "warning",
        "xor"
    ])

    # Single characters that map directly to a token kind without
    # further lookahead.
    PUNCTUATION = {
        "(" : "BRA",
        ")" : "KET",
        "{" : "C_BRA",
        "}" : "C_KET",
        "[" : "S_BRA",
        "]" : "S_KET",
        "," : "COMMA",
        "@" : "AT",
        ":" : "COLON",
        ";" : "SEMICOLON",
        "/" : "OPERATOR",
        "%" : "OPERATOR",
        "+" : "OPERATOR",
        "-" : "OPERATOR",
    }

    def __init__(self, mh, file_name, file_content=None):
        """Create a lexer for file_name.

        If file_content is None, the file is read from disk as UTF-8;
        otherwise file_content is lexed directly and file_name is only
        used in diagnostics.
        """
        assert isinstance(file_name, str)
        assert isinstance(file_content, str) or file_content is None
        self.file_name = file_name
        if file_content is None:
            # lobster-trace: LRM.File_Encoding
            # lobster-trace: LRM.File_Encoding_Fixed
            with open(file_name, "r", encoding="UTF-8") as fd:
                try:
                    super().__init__(mh, fd.read())
                except UnicodeDecodeError as err:
                    mh.lex_error(Location(file_name), str(err))
        else:
            super().__init__(mh, file_content)

    def current_location(self):
        # lobster-exclude: Utility function
        """Source reference for the single character at the cursor."""
        return Source_Reference(lexer      = self,
                                start_line = self.line_no,
                                start_col  = self.col_no,
                                start_pos  = self.lexpos,
                                end_pos    = self.lexpos)

    def file_location(self):
        # lobster-exclude: Utility function
        """Location describing the start of the file."""
        return Location(self.file_name, 1, 1)

    def token(self):
        """Return the next Token, or None at end of input.

        Comments are returned as COMMENT tokens; identifiers matching
        KEYWORDS come back as KEYWORD tokens.  Errors go through
        self.mh.lex_error.
        """
        # Skip whitespace and move to the next char
        self.skip_whitespace()

        # Return if we're done
        if self.cc is None:
            return None

        # Remember where this token starts; used to build its
        # Source_Reference at the end.
        start_pos = self.lexpos
        start_line = self.line_no
        start_col = self.col_no

        if self.cc == "/" and self.nc == "/":
            # lobster-trace: LRM.Comments
            # Line comment: consume up to (not including) the newline.
            kind = "COMMENT"
            while self.cc and self.nc != "\n":
                self.advance()

        elif self.cc == "/" and self.nc == "*":
            # lobster-trace: LRM.Comments
            # Block comment: consume through the closing */.
            kind = "COMMENT"
            while self.nc and not (self.cc == "*" and self.nc == "/"):
                self.advance()
            self.advance()

        elif self.is_alpha(self.cc):
            # lobster-trace: LRM.Identifier
            kind = "IDENTIFIER"
            while self.nc and (self.is_alnum(self.nc) or
                               self.nc == "_"):
                self.advance()

        elif self.cc in TRLC_Lexer.PUNCTUATION:
            # lobster-trace: LRM.Single_Delimiters
            kind = TRLC_Lexer.PUNCTUATION[self.cc]

        elif self.cc == "=":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            # Longest match: => then == then plain =.
            if self.nc == ">":
                kind = "ARROW"
                self.advance()
            elif self.nc == "=":
                kind = "OPERATOR"
                self.advance()
            else:
                kind = "ASSIGN"

        elif self.cc == ".":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            if self.nc == ".":
                kind = "RANGE"
                self.advance()
            else:
                kind = "DOT"

        elif self.cc in ("<", ">"):
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            # < > <= >= are all OPERATOR tokens.
            kind = "OPERATOR"
            if self.nc == "=":
                self.advance()

        elif self.cc == "!":
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            # A lone ! is not a valid token; only != is.
            kind = "OPERATOR"
            if self.nc == "=":
                self.advance()
            else:
                self.mh.lex_error(self.current_location(),
                                  "malformed != operator")

        elif self.cc == "*":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            kind = "OPERATOR"
            if self.nc == "*":
                self.advance()

        elif self.cc == '"':
            # lobster-trace: LRM.Strings
            kind = "STRING"
            if self.nc == '"' and self.nnc == '"':
                # Triple-quoted string: scan until three consecutive
                # quote characters have been seen.
                self.advance()
                self.advance()
                quotes_seen = 0
                while quotes_seen < 3:
                    self.advance()
                    if self.cc == '"':
                        quotes_seen += 1
                    else:
                        quotes_seen = 0
                    if self.nc is None:
                        self.mh.lex_error(
                            Source_Reference(lexer      = self,
                                             start_line = start_line,
                                             start_col  = start_col,
                                             start_pos  = start_pos,
                                             end_pos    = self.lexpos),
                            "unterminated triple-quoted string")
            else:
                # Simple double-quoted string; \" escapes a quote and
                # newlines are not permitted.
                while self.nc != '"':
                    if self.nc is None:
                        self.mh.lex_error(
                            Source_Reference(lexer      = self,
                                             start_line = start_line,
                                             start_col  = start_col,
                                             start_pos  = start_pos,
                                             end_pos    = self.lexpos),
                            "unterminated string")
                    elif self.nc == "\n":
                        self.mh.lex_error(
                            Source_Reference(lexer      = self,
                                             start_line = start_line,
                                             start_col  = start_col,
                                             start_pos  = start_pos,
                                             end_pos    = self.lexpos),
                            "double quoted strings cannot include newlines")

                    self.advance()
                    if self.cc == "\\" and self.nc == '"':
                        self.advance()
                self.advance()

        elif self.cc == "'":
            # lobster-trace: LRM.Strings
            # Only the triple-quoted form exists with single quotes.
            kind = "STRING"
            for _ in range(2):
                self.advance()
                if self.cc != "'":
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = start_pos,
                                         end_pos    = self.lexpos),
                        "malformed triple-quoted string")
            quotes_seen = 0
            while quotes_seen < 3:
                self.advance()
                if self.cc == "'":
                    quotes_seen += 1
                else:
                    quotes_seen = 0
                if self.nc is None:
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = start_pos,
                                         end_pos    = self.lexpos),
                        "unterminated triple-quoted string")

        elif self.is_numeric(self.cc):
            # lobster-trace: LRM.Integers
            # lobster-trace: LRM.Decimals
            kind = "INTEGER"

            # Select the digit alphabet based on a 0b / 0x prefix.
            if self.cc == "0" and self.nc == "b":
                digits_allowed = "01"
                digits_forbidden = "23456789abcdefABCDEF"
                int_base = 2
                require_digit = True
                decimal_allowed = False
                self.advance()
            elif self.cc == "0" and self.nc == "x":
                digits_allowed = "0123456789abcdefABCDEF"
                digits_forbidden = ""
                int_base = 16
                require_digit = True
                decimal_allowed = False
                self.advance()
            else:
                digits_allowed = "0123456789"
                digits_forbidden = "abcdefABCDEF"
                int_base = 10
                require_digit = False
                decimal_allowed = True

            while self.nc:
                if self.nc in digits_allowed:
                    self.advance()
                    require_digit = False

                elif self.nc in digits_forbidden:
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = self.lexpos + 1,
                                         end_pos    = self.lexpos + 1),
                        "%s is not a valid base %u digit" % (self.nc,
                                                             int_base))

                elif require_digit:
                    # After _ or . or a base prefix, a digit must
                    # follow before any other separator.
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = self.lexpos + 1,
                                         end_pos    = self.lexpos + 1),
                        "base %u digit is required here" % int_base)

                elif self.nc == "_":
                    self.advance()
                    require_digit = True

                elif self.nc == "." and self.nnc == ".":
                    # This is a range token, so that one can't be part
                    # of our number anymore
                    break

                elif self.nc == ".":
                    self.advance()
                    if not decimal_allowed:
                        if int_base == 10:
                            msg = "decimal point is not allowed here"
                        else:
                            msg = ("base %u integer may not contain a"
                                   " decimal point" % int_base)
                        self.mh.lex_error(
                            Source_Reference(lexer      = self,
                                             start_line = start_line,
                                             start_col  = start_col,
                                             start_pos  = self.lexpos,
                                             end_pos    = self.lexpos),
                            msg)
                    decimal_allowed = False
                    require_digit = True
                    kind = "DECIMAL"

                else:  # pragma: no cover
                    # This is actually a false
                    # alarm, this line is covered (it's the only
                    # normal way to exit this loop.
                    break

            if require_digit:
                self.mh.lex_error(
                    Source_Reference(lexer      = self,
                                     start_line = start_line,
                                     start_col  = start_col,
                                     start_pos  = start_pos,
                                     end_pos    = self.lexpos),
                    "unfinished base %u integer" % int_base)

        else:
            self.mh.lex_error(self.current_location(),
                              "unexpected character '%s'" % self.cc)

        # The span of the whole token (clamped to the file end).
        sref = Source_Reference(lexer      = self,
                                start_line = start_line,
                                start_col  = start_col,
                                start_pos  = start_pos,
                                end_pos    = min(self.lexpos,
                                                 self.length - 1))

        # Compute the token's semantic value from the raw text.
        if kind == "IDENTIFIER":
            value = sref.text()
            if value in TRLC_Lexer.KEYWORDS:
                # lobster-trace: LRM.TRLC_Keywords
                kind = "KEYWORD"

        elif kind == "OPERATOR":
            value = sref.text()

        elif kind == "STRING":
            value = sref.text()
            if value.startswith('"""'):
                value = triple_quoted_string_value(value)
            elif value.startswith('"'):
                # lobster-trace: LRM.Simple_String_Value
                value = value[1:-1]
                value = value.replace('\\"', '"')
            else:
                value = triple_quoted_string_value(value)

        elif kind == "INTEGER":
            # lobster-trace: LRM.Integer_Values
            # Underscore separators are stripped; for based literals
            # the 0b / 0x prefix is sliced off before conversion.
            base_text = sref.text().replace("_", "")
            if int_base == 10:
                value = int(base_text)
            elif int_base == 2:
                value = int(base_text[2:], base=2)
            else:
                value = int(base_text[2:], base=16)

        elif kind == "DECIMAL":
            # lobster-trace: LRM.Decimal_Values
            value = Fraction(sref.text().replace("_", ""))

        elif kind == "COMMENT":
            # Strip the comment markers; block comments also lose a
            # trailing */ if present.
            value = sref.text()
            if value.startswith("//"):
                value = value[2:].strip()
            else:
                value = value[2:]
                if value.endswith("*/"):
                    value = value[:-2]
                value = value.strip()

        else:
            value = None

        return Token(sref, kind, value)
class Token_Stream(TRLC_Lexer):
    """A TRLC_Lexer that additionally records every token it produces
    in self.tokens."""

    def token(self):
        tok = super().token()
        if tok is None:
            return None
        self.tokens.append(tok)
        return tok
def sanity_test():
    # lobster-exclude: Developer test function
    """Lex the file named on the command line, reporting each token as
    a warning (developer smoke test)."""
    mh = Message_Handler()
    lexer = TRLC_Lexer(mh, sys.argv[1])
    # lexer.token() returns None at end of input, which terminates
    # the two-argument iter().
    for token in iter(lexer.token, None):
        mh.warning(token.location,
                   str(token))
# Allow running this module directly as a quick lexer smoke test.
if __name__ == "__main__":
    sanity_test()