Coverage for trlc/lexer.py: 100%

323 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2026-06-18 05:43 +0000

1#!/usr/bin/env python3 

2# 

3# TRLC - Treat Requirements Like Code 

4# Copyright (C) 2022-2023, 2026 Bayerische Motoren Werke Aktiengesellschaft (BMW AG) 

5# 

6# This file is part of the TRLC Python Reference Implementation. 

7# 

8# TRLC is free software: you can redistribute it and/or modify it 

9# under the terms of the GNU General Public License as published by 

10# the Free Software Foundation, either version 3 of the License, or 

11# (at your option) any later version. 

12# 

13# TRLC is distributed in the hope that it will be useful, but WITHOUT 

14# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 

15# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 

16# License for more details. 

17# 

18# You should have received a copy of the GNU General Public License 

19# along with TRLC. If not, see <https://www.gnu.org/licenses/>. 

20 

21from fractions import Fraction 

22from abc import ABCMeta, abstractmethod 

23 

24from trlc.errors import Location, Message_Handler 

25 

26 

def triple_quoted_string_value(raw_value):
    """Compute the value of a triple-quoted string literal.

    raw_value is the complete literal, including the three opening
    and three closing quote characters (either ''' or \"\"\").

    The text between the delimiters is stripped of surrounding
    whitespace and split into lines. The first line is used as-is.
    For all following lines the leading run of spaces/tabs that is
    shared by every non-blank line is removed, and trailing
    whitespace is dropped. The lines are then joined with newlines.
    """
    # lobster-trace: LRM.Complex_String_Value
    assert isinstance(raw_value, str)
    assert len(raw_value) >= 6
    assert raw_value.startswith(("'''", '"""'))
    assert raw_value[:3] == raw_value[-3:]

    body_lines = raw_value[3:-3].strip().splitlines()
    if not body_lines:
        return ""

    first_line = body_lines[0]
    meaningful = [text for text in body_lines if text.strip()]
    if len(meaningful) < 2:
        # Only one line carries text, so there is nothing to de-indent
        return first_line

    # The indentation of the second non-blank line is the initial
    # candidate for the shared indentation.
    reference = meaningful[1]
    indent = reference[:len(reference) - len(reference.lstrip(" \t"))]
    shared = len(indent)

    # Shrink the candidate to the longest prefix that every later
    # non-blank line agrees with.
    for text in body_lines[2:]:
        if not text.strip():
            continue
        matched = 0
        while (matched < shared and
               matched < len(text) and
               text[matched] == indent[matched]):
            matched += 1
        shared = matched

    tail = [text[shared:].rstrip() for text in body_lines[1:]]
    return "\n".join([first_line] + tail)

67 

68 

class Source_Reference(Location):
    """A Location that also records the span of lexer input it covers.

    start_pos and end_pos are inclusive indices into lexer.content;
    start_line and start_col are passed on to Location unchanged.
    """

    def __init__(self, lexer, start_line, start_col, start_pos, end_pos):
        assert isinstance(lexer, TRLC_Lexer)
        assert isinstance(start_line, int)
        assert isinstance(start_col, int)
        assert isinstance(start_pos, int)
        assert isinstance(end_pos, int)
        assert 0 <= start_pos <= end_pos < lexer.length
        super().__init__(lexer.file_name,
                         start_line,
                         start_col)
        self.lexer = lexer
        self.start_pos = start_pos
        self.end_pos = end_pos

    def text(self):
        """Return the slice of the lexer content covered by this span."""
        return self.lexer.content[self.start_pos:self.end_pos + 1]

    def context_lines(self):
        """Return two strings: the source line and a caret marker line.

        The first string is the line containing start_pos with its
        leading whitespace removed. The second string consists of
        spaces followed by "^" characters positioned under the
        referenced text; the carets never extend past the end of that
        first line.
        """
        content = self.lexer.content

        # Index of the newline preceding the line (or -1). If start_pos
        # itself is a newline, that newline is the one found here.
        line_break = content.rfind("\n", 0, self.start_pos + 1)
        offset = self.start_pos - line_break - 1

        # Index just past the last character of the line
        line_end = content.find("\n", self.start_pos + 1, self.lexer.length)
        if line_end < 0:
            line_end = self.lexer.length

        line = content[line_break + 1:line_end]
        maxtrail = line_end - self.start_pos
        tlen = self.end_pos + 1 - self.start_pos

        stripped_line = line.lstrip()
        stripped_offset = offset - (len(line) - len(stripped_line))

        return [stripped_line,
                " " * stripped_offset + "^" * min(tlen, maxtrail)]

    def get_end_location(self):
        """Return a Location for the last character of this span.

        The line is the start line plus the number of newlines inside
        the span. The column is counted from the closest newline at or
        before end_pos (at least 1), or from the start of the content
        if there is no such newline.
        """
        content = self.lexer.content
        end_line = self.line_no + content.count("\n",
                                                self.start_pos,
                                                self.end_pos + 1)

        # Search all the way back to index 0; a newline in the first
        # two characters of the content must not be overlooked.
        last_newline = content.rfind("\n", 0, self.end_pos + 1)
        if last_newline < 0:
            end_col = self.end_pos + 1
        else:
            end_col = max(self.end_pos - last_newline, 1)

        return Location(self.file_name, end_line, end_col)

124 

125 

class Token_Base:
    """Minimal token record: where it is, what kind it is, its value."""

    def __init__(self, location, kind, value):
        # The value is deliberately unchecked here; subclasses decide
        # which value types are acceptable for which kind.
        assert isinstance(location, Location)
        assert isinstance(kind, str)
        self.location, self.kind, self.value = location, kind, value

133 

134 

class Token(Token_Base):
    """A TRLC token.

    KIND maps every valid token kind to a human-readable description
    of that kind.
    """

    KIND = {
        "COMMENT"    : "comment",
        "IDENTIFIER" : "identifier",
        "KEYWORD"    : "keyword",
        "BRA"        : "opening parenthesis '('",
        "KET"        : "closing parenthesis ')'",
        "S_BRA"      : "opening bracket '['",
        "S_KET"      : "closing bracket ']'",
        "C_BRA"      : "opening brace '{'",
        "C_KET"      : "closing brace '}'",
        "COMMA"      : "comma ','",
        "AT"         : "separator '@'",
        "SEMICOLON"  : "separator ';'",
        "COLON"      : "separator ':'",
        "DOT"        : ".",
        "RANGE"      : "..",
        "ASSIGN"     : "=",
        "OPERATOR"   : "operator",
        "ARROW"      : "->",
        "INTEGER"    : "integer literal",
        "DECIMAL"    : "decimal literal",
        "STRING"     : "string literal",
    }

    def __init__(self, location, kind, value=None, ast_link=None):
        """Create a token.

        kind must be a key of Token.KIND. The value must be a str for
        comments, identifiers, keywords, operators and strings; an int
        for integers; a Fraction for decimals; and None for every
        other kind. ast_link is stored as given.
        """
        assert kind in Token.KIND
        if kind in ("COMMENT", "IDENTIFIER",
                    "KEYWORD", "OPERATOR", "STRING"):
            assert isinstance(value, str)
        elif kind == "INTEGER":
            assert isinstance(value, int)
        elif kind == "DECIMAL":
            assert isinstance(value, Fraction)
        else:
            assert value is None
        super().__init__(location, kind, value)
        self.ast_link = ast_link

    def __repr__(self):
        if self.value is None:
            return "%s_Token" % self.kind
        return "%s_Token(%s)" % (self.kind, self.value)

179 

180 

class Lexer_Base(metaclass=ABCMeta):
    """Shared character-level machinery for lexers.

    The lexer keeps a three character window over the content: cc is
    the current character, nc the next one and nnc the one after
    that. Each of them is None when it lies outside the content.
    lexpos is the index of cc, and line_no / col_no track its
    position.
    """

    def __init__(self, mh, content):
        assert isinstance(mh, Message_Handler)
        assert isinstance(content, str)
        self.mh = mh
        self.content = content
        self.length = len(content)
        self.tokens = []

        # Start three positions before the content; the two advances
        # below leave cc empty and nc / nnc on the first two
        # characters (if present), with lexpos at -1.
        self.lexpos = -3
        self.line_no = 0
        self.col_no = 0
        self.cc = self.nc = self.nnc = None

        for _ in range(2):
            self.advance()

    @staticmethod
    def is_alpha(char):
        """True if char is an ASCII letter."""
        # lobster-trace: LRM.Identifier
        return char.isalpha() if char.isascii() else False

    @staticmethod
    def is_numeric(char):
        """True if char is an ASCII digit."""
        # lobster-trace: LRM.Integers
        # lobster-trace: LRM.Decimals
        return char.isdigit() if char.isascii() else False

    @staticmethod
    def is_alnum(char):
        """True if char is an ASCII letter or an ASCII digit."""
        # lobster-trace: LRM.Identifier
        return char.isalnum() if char.isascii() else False

    @abstractmethod
    def file_location(self):
        """To be provided by concrete lexers."""

    @abstractmethod
    def token(self):
        """To be provided by concrete lexers."""

    def skip_whitespace(self):
        """Move cc onto the next non-whitespace character.

        Always advances at least once, so cc ends up on the first
        non-whitespace character after the old cc (or None at the end
        of the content).
        """
        # lobster-trace: LRM.Whitespace
        while True:
            upcoming = self.nc
            if not upcoming or not upcoming.isspace():
                break
            self.advance()
        self.advance()

    def advance(self):
        """Shift the character window forward by one position."""
        self.lexpos += 1

        # A new line begins on the very first character and after
        # every newline character.
        starts_new_line = self.lexpos == 0 or self.cc == "\n"
        if starts_new_line:
            self.line_no += 1
            self.col_no = 0

        # The column only moves while there is a character to step on
        if self.nc is not None:
            self.col_no += 1

        self.cc, self.nc = self.nc, self.nnc

        lookahead = self.lexpos + 2
        if lookahead < self.length:
            self.nnc = self.content[lookahead]
        else:
            self.nnc = None

242 

243 

class TRLC_Lexer(Lexer_Base):
    """Lexer for TRLC source text.

    KEYWORDS is the set of reserved words; an identifier spelled like
    one of them is turned into a KEYWORD token. PUNCTUATION maps the
    single-character delimiters and operators to their token kind.

    Errors are reported through mh.lex_error. The code assumes that
    lex_error does not return normally (presumably it raises) - see
    trlc.errors to confirm.
    """

    KEYWORDS = frozenset([
        "abs",
        "abstract",
        "and",
        "checks",
        "else",
        "elsif",
        "enum",
        "error",
        "exists",
        "extends",
        "false",
        "fatal",
        "final",
        "forall",
        "freeze",
        "if",
        "implies",
        "import",
        "in",
        "not",
        "null",
        "optional",
        "or",
        "package",
        "section",
        "separator",
        "then",
        "true",
        "tuple",
        "type",
        "warning",
        "xor"
    ])

    PUNCTUATION = {
        "(" : "BRA",
        ")" : "KET",
        "{" : "C_BRA",
        "}" : "C_KET",
        "[" : "S_BRA",
        "]" : "S_KET",
        "," : "COMMA",
        "@" : "AT",
        ":" : "COLON",
        ";" : "SEMICOLON",
        "/" : "OPERATOR",
        "%" : "OPERATOR",
        "+" : "OPERATOR",
        "-" : "OPERATOR",
    }

    def __init__(self, mh, file_name, file_content=None):
        """Create a lexer for file_name.

        If file_content is None the file is read from disk as UTF-8
        and a decoding failure is reported as a lex error; otherwise
        file_content is lexed and the file is not opened.
        """
        assert isinstance(file_name, str)
        assert isinstance(file_content, str) or file_content is None
        self.file_name = file_name
        if file_content is None:
            # lobster-trace: LRM.File_Encoding
            # lobster-trace: LRM.File_Encoding_Fixed
            with open(file_name, "r", encoding="UTF-8") as fd:
                try:
                    super().__init__(mh, fd.read())
                except UnicodeDecodeError as err:
                    mh.lex_error(Location(file_name), str(err))
        else:
            super().__init__(mh, file_content)

    def current_location(self):
        """Return a one-character Source_Reference for cc."""
        # lobster-exclude: Utility function
        return Source_Reference(lexer      = self,
                                start_line = self.line_no,
                                start_col  = self.col_no,
                                start_pos  = self.lexpos,
                                end_pos    = self.lexpos)

    def file_location(self):
        """Return the Location of line 1, column 1 of this file."""
        # lobster-exclude: Utility function
        return Location(self.file_name, 1, 1)

    def _reference(self, start_line, start_col, start_pos, end_pos):
        """Build a Source_Reference starting at the given position.

        end_pos is clamped to the last character of the content. The
        cursor can legitimately sit one past the end (e.g. after a
        comment or a broken string at the end of the input), and
        Source_Reference does not accept such a position.
        """
        # lobster-exclude: Utility function
        return Source_Reference(lexer      = self,
                                start_line = start_line,
                                start_col  = start_col,
                                start_pos  = start_pos,
                                end_pos    = min(end_pos, self.length - 1))

    def _scan_triple_quoted(self, quote, start_line, start_col, start_pos):
        """Consume the body and closing quotes of a triple-quoted string.

        On entry cc is the third opening quote character; on exit cc
        is the third closing quote character. quote is the delimiter
        character (' or ").
        """
        # lobster-trace: LRM.Strings
        quotes_seen = 0
        while quotes_seen < 3:
            self.advance()
            if self.cc == quote:
                quotes_seen += 1
            else:
                quotes_seen = 0
            # Running out of input is only an error while the closing
            # delimiter is still incomplete; a string that ends exactly
            # at the end of the input is fine.
            if quotes_seen < 3 and self.nc is None:
                self.mh.lex_error(
                    self._reference(start_line, start_col,
                                    start_pos, self.lexpos),
                    "unterminated triple-quoted string")

    def _scan_number(self, start_line, start_col, start_pos):
        """Consume an integer or decimal literal.

        On entry cc is the first digit; on exit cc is the last
        character of the literal. Returns a pair (kind, int_base)
        where kind is "INTEGER" or "DECIMAL" and int_base is 2, 10
        or 16.
        """
        # lobster-trace: LRM.Integers
        # lobster-trace: LRM.Decimals
        kind = "INTEGER"

        if self.cc == "0" and self.nc == "b":
            digits_allowed   = "01"
            digits_forbidden = "23456789abcdefABCDEF"
            int_base         = 2
            require_digit    = True
            decimal_allowed  = False
            self.advance()
        elif self.cc == "0" and self.nc == "x":
            digits_allowed   = "0123456789abcdefABCDEF"
            digits_forbidden = ""
            int_base         = 16
            require_digit    = True
            decimal_allowed  = False
            self.advance()
        else:
            digits_allowed   = "0123456789"
            digits_forbidden = "abcdefABCDEF"
            int_base         = 10
            require_digit    = False
            decimal_allowed  = True

        while self.nc:
            if self.nc in digits_allowed:
                self.advance()
                require_digit = False

            elif self.nc in digits_forbidden:
                # The reference covers just the offending character
                self.mh.lex_error(
                    self._reference(start_line, start_col,
                                    self.lexpos + 1, self.lexpos + 1),
                    "%s is not a valid base %u digit" % (self.nc,
                                                         int_base))

            elif require_digit:
                self.mh.lex_error(
                    self._reference(start_line, start_col,
                                    self.lexpos + 1, self.lexpos + 1),
                    "base %u digit is required here" % int_base)

            elif self.nc == "_":
                # A digit separator must be followed by a digit
                self.advance()
                require_digit = True

            elif self.nc == "." and self.nnc == ".":
                # This is a range token, so that one can't be part
                # of our number anymore
                break

            elif self.nc == ".":
                self.advance()
                if not decimal_allowed:
                    if int_base == 10:
                        msg = "decimal point is not allowed here"
                    else:
                        msg = ("base %u integer may not contain a"
                               " decimal point" % int_base)
                    self.mh.lex_error(
                        self._reference(start_line, start_col,
                                        self.lexpos, self.lexpos),
                        msg)
                # Only one decimal point per literal, and it must be
                # followed by a digit
                decimal_allowed = False
                require_digit   = True
                kind            = "DECIMAL"

            else:  # pragma: no cover
                # This is actually a false alarm, this line is
                # covered (it's the only normal way to exit this
                # loop).
                break

        if require_digit:
            self.mh.lex_error(
                self._reference(start_line, start_col,
                                start_pos, self.lexpos),
                "unfinished base %u integer" % int_base)

        return kind, int_base

    def _token_value(self, kind, sref, int_base):
        """Compute the final kind and value of a token.

        sref is the Source_Reference covering the token text and
        int_base is the base determined while scanning a number (it is
        only used for INTEGER tokens). Returns a pair (kind, value);
        the kind differs from the input only for identifiers that are
        keywords.
        """
        if kind == "IDENTIFIER":
            value = sref.text()
            if value in TRLC_Lexer.KEYWORDS:
                # lobster-trace: LRM.TRLC_Keywords
                kind = "KEYWORD"

        elif kind == "OPERATOR":
            value = sref.text()

        elif kind == "STRING":
            value = sref.text()
            if value.startswith('"""'):
                value = triple_quoted_string_value(value)
            elif value.startswith('"'):
                # lobster-trace: LRM.Simple_String_Value
                value = value[1:-1]
                value = value.replace('\\"', '"')
            else:
                value = triple_quoted_string_value(value)

        elif kind == "INTEGER":
            # lobster-trace: LRM.Integer_Values
            base_text = sref.text().replace("_", "")
            if int_base == 10:
                value = int(base_text)
            elif int_base == 2:
                # Drop the 0b prefix
                value = int(base_text[2:], base=2)
            else:
                # Drop the 0x prefix
                value = int(base_text[2:], base=16)

        elif kind == "DECIMAL":
            # lobster-trace: LRM.Decimal_Values
            value = Fraction(sref.text().replace("_", ""))

        elif kind == "COMMENT":
            value = sref.text()
            if value.startswith("//"):
                value = value[2:].strip()
            else:
                value = value[2:]
                if value.endswith("*/"):
                    value = value[:-2]
                value = value.strip()

        else:
            value = None

        return kind, value

    def token(self):
        """Return the next Token, or None at the end of the input."""
        # Skip whitespace and move to the next char
        self.skip_whitespace()

        # Return if we're done
        if self.cc is None:
            return None

        start_pos  = self.lexpos
        start_line = self.line_no
        start_col  = self.col_no

        # Only set for numeric literals
        int_base = None

        if self.cc == "/" and self.nc == "/":
            # lobster-trace: LRM.Comments
            kind = "COMMENT"
            while self.cc and self.nc != "\n":
                self.advance()

        elif self.cc == "/" and self.nc == "*":
            # lobster-trace: LRM.Comments
            kind = "COMMENT"
            while self.nc and not (self.cc == "*" and self.nc == "/"):
                self.advance()
            self.advance()

        elif self.is_alpha(self.cc):
            # lobster-trace: LRM.Identifier
            kind = "IDENTIFIER"
            while self.nc and (self.is_alnum(self.nc) or
                               self.nc == "_"):
                self.advance()

        elif self.cc in TRLC_Lexer.PUNCTUATION:
            # lobster-trace: LRM.Single_Delimiters
            kind = TRLC_Lexer.PUNCTUATION[self.cc]

        elif self.cc == "=":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            if self.nc == ">":
                kind = "ARROW"
                self.advance()
            elif self.nc == "=":
                kind = "OPERATOR"
                self.advance()
            else:
                kind = "ASSIGN"

        elif self.cc == ".":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            if self.nc == ".":
                kind = "RANGE"
                self.advance()
            else:
                kind = "DOT"

        elif self.cc in ("<", ">"):
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            kind = "OPERATOR"
            if self.nc == "=":
                self.advance()

        elif self.cc == "!":
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            kind = "OPERATOR"
            if self.nc == "=":
                self.advance()
            else:
                self.mh.lex_error(self.current_location(),
                                  "malformed != operator")

        elif self.cc == "*":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            kind = "OPERATOR"
            if self.nc == "*":
                self.advance()

        elif self.cc == '"':
            # lobster-trace: LRM.Strings
            kind = "STRING"
            if self.nc == '"' and self.nnc == '"':
                # Step onto the third opening quote
                self.advance()
                self.advance()
                self._scan_triple_quoted('"',
                                         start_line, start_col, start_pos)
            else:
                while self.nc != '"':
                    if self.nc is None:
                        self.mh.lex_error(
                            self._reference(start_line, start_col,
                                            start_pos, self.lexpos),
                            "unterminated string")
                    elif self.nc == "\n":
                        self.mh.lex_error(
                            self._reference(start_line, start_col,
                                            start_pos, self.lexpos),
                            "double quoted strings cannot include newlines")

                    self.advance()
                    # Step over an escaped quote so that it does not
                    # terminate the string
                    if self.cc == "\\" and self.nc == '"':
                        self.advance()
                self.advance()

        elif self.cc == "'":
            # lobster-trace: LRM.Strings
            kind = "STRING"
            # A single quote can only start a ''' string, so the next
            # two characters must be quotes as well
            for _ in range(2):
                self.advance()
                if self.cc != "'":
                    self.mh.lex_error(
                        self._reference(start_line, start_col,
                                        start_pos, self.lexpos),
                        "malformed triple-quoted string")
            self._scan_triple_quoted("'",
                                     start_line, start_col, start_pos)

        elif self.is_numeric(self.cc):
            # lobster-trace: LRM.Integers
            # lobster-trace: LRM.Decimals
            kind, int_base = self._scan_number(start_line,
                                               start_col,
                                               start_pos)

        else:
            self.mh.lex_error(self.current_location(),
                              "unexpected character '%s'" % self.cc)

        # The cursor may be one past the end here (e.g. a line comment
        # at the end of the input); _reference clamps the end position.
        sref = self._reference(start_line, start_col,
                               start_pos, self.lexpos)

        kind, value = self._token_value(kind, sref, int_base)

        return Token(sref, kind, value)

634 

635 

class Token_Stream(TRLC_Lexer):
    """TRLC lexer that records every token it hands out in self.tokens."""

    def token(self):
        lexed = super().token()
        if lexed is None:
            # End of input: nothing to record
            return None
        self.tokens.append(lexed)
        return lexed