Coverage for trlc/lexer.py: 100%

324 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2025-03-27 00:52 +0000

1#!/usr/bin/env python3 

2# 

3# TRLC - Treat Requirements Like Code 

4# Copyright (C) 2022-2023 Bayerische Motoren Werke Aktiengesellschaft (BMW AG) 

5# 

6# This file is part of the TRLC Python Reference Implementation. 

7# 

8# TRLC is free software: you can redistribute it and/or modify it 

9# under the terms of the GNU General Public License as published by 

10# the Free Software Foundation, either version 3 of the License, or 

11# (at your option) any later version. 

12# 

13# TRLC is distributed in the hope that it will be useful, but WITHOUT 

14# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 

15# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 

16# License for more details. 

17# 

18# You should have received a copy of the GNU General Public License 

19# along with TRLC. If not, see <https://www.gnu.org/licenses/>. 

20 

21import sys 

22from fractions import Fraction 

23from abc import ABCMeta, abstractmethod 

24 

25from trlc.errors import Location, Message_Handler 

26 

27 

def triple_quoted_string_value(raw_value):
    """Return the cooked value of a triple-quoted string literal.

    ``raw_value`` must include the surrounding quote triples (either
    style).  The quotes are stripped, surrounding whitespace removed,
    and the common leading whitespace -- as measured on the second
    non-empty line -- is removed from every continuation line.
    """
    # lobster-trace: LRM.Complex_String_Value
    assert isinstance(raw_value, str)
    assert len(raw_value) >= 6
    assert raw_value.startswith("'''") or raw_value.startswith('"""')
    assert raw_value[:3] == raw_value[-3:]

    lines = raw_value[3:-3].strip().splitlines()
    if not lines:
        return ""

    non_empty_lines = [line for line in lines if line.strip()]

    value      = lines[0]
    common_ws  = ""
    common_len = 0
    if len(non_empty_lines) >= 2:
        # Candidate common prefix: the indentation of the second
        # non-empty line.  This loop always breaks, because a
        # non-empty line contains at least one non-blank character.
        for char in non_empty_lines[1]:  # pragma: no cover
            if char not in (" \t"):
                break
            common_ws  += char
            common_len += 1
        else:
            return value

        # Shrink the candidate prefix until it matches every
        # remaining non-empty line.
        for src_line in lines[2:]:
            if not src_line.strip():
                continue
            for col in range(common_len):
                if not (col < len(src_line) and
                        src_line[col] == common_ws[col]):
                    common_len = col
                    break

    # Re-assemble: first line verbatim, continuation lines with the
    # common prefix removed and trailing whitespace stripped.
    for src_line in lines[1:]:
        value += "\n" + src_line[common_len:].rstrip()

    return value

68 

69 

class Source_Reference(Location):
    """A Location that also records the exact character span
    (start_pos .. end_pos, inclusive) in the lexer's buffer, so the
    spanned text and its surrounding line can be reproduced in
    diagnostics.
    """

    def __init__(self, lexer, start_line, start_col, start_pos, end_pos):
        assert isinstance(lexer, TRLC_Lexer)
        assert isinstance(start_line, int)
        assert isinstance(start_col, int)
        assert isinstance(start_pos, int)
        assert isinstance(end_pos, int)
        assert 0 <= start_pos <= end_pos < lexer.length
        super().__init__(lexer.file_name,
                         start_line,
                         start_col)
        # Keep the lexer so the buffer can be sliced later.
        self.lexer     = lexer
        self.start_pos = start_pos
        self.end_pos   = end_pos

    def text(self):
        """Return the source text covered by this span (inclusive)."""
        return self.lexer.content[self.start_pos:self.end_pos + 1]

    def context_lines(self):
        """Return two strings: the (left-stripped) line containing
        start_pos, and a caret line underlining the span.
        """
        line = ""
        n = self.start_pos
        # Walk backwards to the previous newline (or buffer start),
        # collecting the characters up to and including start_pos.
        while n >= 0:
            if self.lexer.content[n] == "\n":
                break
            line = self.lexer.content[n] + line
            n -= 1
        # Column of start_pos within this line.
        offset = self.start_pos - n - 1
        # Walk forwards to the next newline (or buffer end).
        n = self.start_pos + 1
        while n < self.lexer.length:
            if self.lexer.content[n] == "\n":
                break
            line = line + self.lexer.content[n]
            n += 1
        # Characters available to the right of start_pos on this line.
        maxtrail = n - self.start_pos
        # Length of the span to underline.
        tlen = self.end_pos + 1 - self.start_pos

        # Strip leading whitespace and shift the caret accordingly.
        stripped_line = line.lstrip()
        stripped_offset = offset - (len(line) - len(stripped_line))

        return [stripped_line,
                " " * stripped_offset + "^" * min(tlen, maxtrail)]

    def get_end_location(self):
        """Return a plain Location for the end of the span."""
        # The number of newlines inside the span gives the line delta
        # relative to the (inherited) starting line number.
        lines_in_between = self.lexer.content[
            self.start_pos : self.end_pos + 1
        ].count("\n")
        end_line = self.line_no + lines_in_between

        end_col = self.end_pos + 1
        # Scan backwards for the newline preceding end_pos to compute
        # the column.
        # NOTE(review): range(.., 1, -1) stops at index 2, so a
        # newline at buffer index 0 or 1 is never seen -- presumably
        # harmless in practice; confirm.
        for n in range(self.end_pos, 1, -1):
            if self.lexer.content[n] == "\n":
                end_col = max(self.end_pos - n, 1)
                break

        return Location(self.file_name, end_line, end_col)

125 

126 

class Token_Base:
    """Root class for tokens: couples a source location with a kind
    tag and an optional, kind-specific value.
    """

    def __init__(self, location, kind, value):
        # Validate the two structured arguments; value is free-form
        # and checked by subclasses.
        assert isinstance(location, Location)
        assert isinstance(kind, str)
        self.location = location   # where the token was read
        self.kind     = kind       # token category tag
        self.value    = value      # kind-specific payload (may be None)

134 

135 

class Token(Token_Base):
    """Concrete TRLC token.

    KIND maps each kind tag to the human-readable description used
    in diagnostic messages.
    """
    KIND = {
        "COMMENT"    : "comment",
        "IDENTIFIER" : "identifier",
        "KEYWORD"    : "keyword",
        "BRA"        : "opening parenthesis '('",
        "KET"        : "closing parenthesis ')'",
        "S_BRA"      : "opening bracket '['",
        "S_KET"      : "closing bracket ']'",
        "C_BRA"      : "opening brace '{'",
        "C_KET"      : "closing brace '}'",
        "COMMA"      : "comma ','",
        # Fixed typo in diagnostic text: "separtor" -> "separator"
        # (now consistent with SEMICOLON and COLON below).
        "AT"         : "separator '@'",
        "SEMICOLON"  : "separator ';'",
        "COLON"      : "separator ':'",
        "DOT"        : ".",
        "RANGE"      : "..",
        "ASSIGN"     : "=",
        "OPERATOR"   : "operator",
        "ARROW"      : "->",
        "INTEGER"    : "integer literal",
        "DECIMAL"    : "decimal literal",
        "STRING"     : "string literal",
    }

    def __init__(self, location, kind, value=None, ast_link=None):
        """Create a token; the value's type must match the kind.

        Kinds not listed in the checks below carry no value.
        """
        assert kind in Token.KIND
        if kind in ("COMMENT", "IDENTIFIER",
                    "KEYWORD", "OPERATOR", "STRING"):
            assert isinstance(value, str)
        elif kind == "INTEGER":
            assert isinstance(value, int)
        elif kind == "DECIMAL":
            assert isinstance(value, Fraction)
        else:
            assert value is None
        super().__init__(location, kind, value)
        # Optional back-link to the AST node built from this token.
        self.ast_link = ast_link

    def __repr__(self):
        if self.value is None:
            return "%s_Token" % self.kind
        else:
            return "%s_Token(%s)" % (self.kind, self.value)

180 

181 

class Lexer_Base(metaclass=ABCMeta):
    """Abstract lexer base.

    Maintains a three-character lookahead window over a string
    buffer: cc (current), nc (next) and nnc (next-next), each a
    one-character string or None past the end of input.
    """

    def __init__(self, mh, content):
        assert isinstance(mh, Message_Handler)
        assert isinstance(content, str)
        self.mh      = mh
        self.content = content
        self.length  = len(self.content)
        self.tokens  = []

        # Window state; two priming advances below move cc onto the
        # first character of the buffer.
        self.lexpos  = -3
        self.line_no = 0
        self.col_no  = 0
        self.cc = self.nc = self.nnc = None

        for _ in range(2):
            self.advance()

    @staticmethod
    def is_alpha(char):
        # lobster-trace: LRM.Identifier
        # ASCII letters only; non-ASCII alphabetics are rejected.
        return char.isascii() and char.isalpha()

    @staticmethod
    def is_numeric(char):
        # lobster-trace: LRM.Integers
        # lobster-trace: LRM.Decimals
        # ASCII digits only.
        return char.isascii() and char.isdigit()

    @staticmethod
    def is_alnum(char):
        # lobster-trace: LRM.Identifier
        # ASCII letters and digits only.
        return char.isascii() and char.isalnum()

    @abstractmethod
    def file_location(self):
        pass

    @abstractmethod
    def token(self):
        pass

    def skip_whitespace(self):
        # lobster-trace: LRM.Whitespace
        # Consume whitespace by lookahead, then step once so cc lands
        # on the first non-whitespace character.
        while self.nc and self.nc.isspace():
            self.advance()
        self.advance()

    def advance(self):
        self.lexpos += 1
        # A consumed newline (or the very first advance onto the
        # buffer) starts a new line.
        if self.cc == "\n" or self.lexpos == 0:
            self.line_no += 1
            self.col_no = 0
        if self.nc is not None:
            self.col_no += 1
        # Shift the lookahead window left by one character.
        self.cc, self.nc = self.nc, self.nnc
        peek = self.lexpos + 2
        self.nnc = self.content[peek] if peek < self.length else None

243 

244 

class TRLC_Lexer(Lexer_Base):
    """Lexer for the TRLC language.

    Reads a file (or a supplied string) and produces Token objects
    through repeated calls to token(); lexical errors are reported
    via the message handler.
    """

    # Reserved words; an IDENTIFIER matching one of these is
    # re-classified as a KEYWORD token in token().
    KEYWORDS = frozenset([
        "abs",
        "abstract",
        "and",
        "checks",
        "else",
        "elsif",
        "enum",
        "error",
        "exists",
        "extends",
        "false",
        "fatal",
        "final",
        "forall",
        "freeze",
        "if",
        "implies",
        "import",
        "in",
        "not",
        "null",
        "optional",
        "or",
        "package",
        "section",
        "separator",
        "then",
        "true",
        "tuple",
        "type",
        "warning",
        "xor"
    ])

    # Single-character tokens that need no further lookahead.
    PUNCTUATION = {
        "(" : "BRA",
        ")" : "KET",
        "{" : "C_BRA",
        "}" : "C_KET",
        "[" : "S_BRA",
        "]" : "S_KET",
        "," : "COMMA",
        "@" : "AT",
        ":" : "COLON",
        ";" : "SEMICOLON",
        "/" : "OPERATOR",
        "%" : "OPERATOR",
        "+" : "OPERATOR",
        "-" : "OPERATOR",
    }

    def __init__(self, mh, file_name, file_content=None):
        """Create a lexer for file_name.

        If file_content is None the file is read from disk (UTF-8);
        otherwise the given string is lexed and file_name is used
        only in locations and error messages.
        """
        assert isinstance(file_name, str)
        assert isinstance(file_content, str) or file_content is None
        self.file_name = file_name
        if file_content is None:
            # lobster-trace: LRM.File_Encoding
            # lobster-trace: LRM.File_Encoding_Fixed
            with open(file_name, "r", encoding="UTF-8") as fd:
                try:
                    super().__init__(mh, fd.read())
                except UnicodeDecodeError as err:
                    # Undecodable input is reported as a lex error at
                    # the file level.
                    mh.lex_error(Location(file_name), str(err))
        else:
            super().__init__(mh, file_content)

    def current_location(self):
        """Return a one-character Source_Reference for the current
        lexer position."""
        # lobster-exclude: Utility function
        return Source_Reference(lexer      = self,
                                start_line = self.line_no,
                                start_col  = self.col_no,
                                start_pos  = self.lexpos,
                                end_pos    = self.lexpos)

    def file_location(self):
        """Return a Location pointing at the start of the file."""
        # lobster-exclude: Utility function
        return Location(self.file_name, 1, 1)

    def token(self):
        """Return the next Token, or None at end of input."""
        # Skip whitespace and move to the next char
        self.skip_whitespace()

        # Return if we're done
        if self.cc is None:
            return None

        # Remember where this token begins.
        start_pos  = self.lexpos
        start_line = self.line_no
        start_col  = self.col_no

        if self.cc == "/" and self.nc == "/":
            # lobster-trace: LRM.Comments
            # Line comment: consume up to (but not including) the
            # next newline.
            kind = "COMMENT"
            while self.cc and self.nc != "\n":
                self.advance()

        elif self.cc == "/" and self.nc == "*":
            # lobster-trace: LRM.Comments
            # Block comment: consume until the closing */ (the final
            # advance takes in the trailing '/').
            kind = "COMMENT"
            while self.nc and not (self.cc == "*" and self.nc == "/"):
                self.advance()
            self.advance()

        elif self.is_alpha(self.cc):
            # lobster-trace: LRM.Identifier
            # Identifier: a letter followed by letters, digits or
            # underscores.
            kind = "IDENTIFIER"
            while self.nc and (self.is_alnum(self.nc) or
                               self.nc == "_"):
                self.advance()

        elif self.cc in TRLC_Lexer.PUNCTUATION:
            # lobster-trace: LRM.Single_Delimiters
            kind = TRLC_Lexer.PUNCTUATION[self.cc]

        elif self.cc == "=":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            # '=>' (arrow), '==' (operator) or plain '=' (assign).
            if self.nc == ">":
                kind = "ARROW"
                self.advance()
            elif self.nc == "=":
                kind = "OPERATOR"
                self.advance()
            else:
                kind = "ASSIGN"

        elif self.cc == ".":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            # '..' (range) or plain '.' (dot).
            if self.nc == ".":
                kind = "RANGE"
                self.advance()
            else:
                kind = "DOT"

        elif self.cc in ("<", ">"):
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            # '<', '>', '<=' or '>='.
            kind = "OPERATOR"
            if self.nc == "=":
                self.advance()

        elif self.cc == "!":
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            # Only '!=' is valid; a lone '!' is an error.
            kind = "OPERATOR"
            if self.nc == "=":
                self.advance()
            else:
                self.mh.lex_error(self.current_location(),
                                  "malformed != operator")

        elif self.cc == "*":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            # '*' (multiply) or '**' (power).
            kind = "OPERATOR"
            if self.nc == "*":
                self.advance()

        elif self.cc == '"':
            # lobster-trace: LRM.Strings
            kind = "STRING"
            if self.nc == '"' and self.nnc == '"':
                # Triple-quoted """ string: scan until three
                # consecutive quotes have been seen.
                self.advance()
                self.advance()
                quotes_seen = 0
                while quotes_seen < 3:
                    self.advance()
                    if self.cc == '"':
                        quotes_seen += 1
                    else:
                        quotes_seen = 0
                    # NOTE(review): this check also fires when the
                    # closing quote is the very last character of the
                    # buffer (nc is None even though quotes_seen has
                    # just reached 3) -- confirm intended.
                    if self.nc is None:
                        self.mh.lex_error(
                            Source_Reference(lexer      = self,
                                             start_line = start_line,
                                             start_col  = start_col,
                                             start_pos  = start_pos,
                                             end_pos    = self.lexpos),
                            "unterminated triple-quoted string")
            else:
                # Simple "..." string: single line, with \" as the
                # only recognised escape.
                while self.nc != '"':
                    if self.nc is None:
                        self.mh.lex_error(
                            Source_Reference(lexer      = self,
                                             start_line = start_line,
                                             start_col  = start_col,
                                             start_pos  = start_pos,
                                             end_pos    = self.lexpos),
                            "unterminated string")
                    elif self.nc == "\n":
                        self.mh.lex_error(
                            Source_Reference(lexer      = self,
                                             start_line = start_line,
                                             start_col  = start_col,
                                             start_pos  = start_pos,
                                             end_pos    = self.lexpos),
                            "double quoted strings cannot include newlines")

                    self.advance()
                    # Skip over an escaped quote so it does not
                    # terminate the string.
                    if self.cc == "\\" and self.nc == '"':
                        self.advance()
                # Consume the closing quote.
                self.advance()

        elif self.cc == "'":
            # lobster-trace: LRM.Strings
            # Only the triple-quoted ''' form exists for single
            # quotes; anything else is malformed.
            kind = "STRING"
            for _ in range(2):
                self.advance()
                if self.cc != "'":
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = start_pos,
                                         end_pos    = self.lexpos),
                        "malformed triple-quoted string")
            quotes_seen = 0
            while quotes_seen < 3:
                self.advance()
                if self.cc == "'":
                    quotes_seen += 1
                else:
                    quotes_seen = 0
                # NOTE(review): same end-of-buffer behaviour as the
                # double-quoted variant above -- confirm intended.
                if self.nc is None:
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = start_pos,
                                         end_pos    = self.lexpos),
                        "unterminated triple-quoted string")

        elif self.is_numeric(self.cc):
            # lobster-trace: LRM.Integers
            # lobster-trace: LRM.Decimals
            kind = "INTEGER"

            # Select the digit alphabet from an optional 0b/0x
            # prefix; only plain base-10 numbers may later contain a
            # decimal point.
            if self.cc == "0" and self.nc == "b":
                digits_allowed   = "01"
                digits_forbidden = "23456789abcdefABCDEF"
                int_base         = 2
                require_digit    = True
                decimal_allowed  = False
                self.advance()
            elif self.cc == "0" and self.nc == "x":
                digits_allowed   = "0123456789abcdefABCDEF"
                digits_forbidden = ""
                int_base         = 16
                require_digit    = True
                decimal_allowed  = False
                self.advance()
            else:
                digits_allowed   = "0123456789"
                digits_forbidden = "abcdefABCDEF"
                int_base         = 10
                require_digit    = False
                decimal_allowed  = True

            while self.nc:
                if self.nc in digits_allowed:
                    self.advance()
                    require_digit = False

                elif self.nc in digits_forbidden:
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = self.lexpos + 1,
                                         end_pos    = self.lexpos + 1),
                        "%s is not a valid base %u digit" % (self.nc,
                                                             int_base))

                elif require_digit:
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = self.lexpos + 1,
                                         end_pos    = self.lexpos + 1),
                        "base %u digit is required here" % int_base)

                elif self.nc == "_":
                    # Underscore separators are allowed, but must be
                    # followed by another digit.
                    self.advance()
                    require_digit = True

                elif self.nc == "." and self.nnc == ".":
                    # This is a range token, so that one can't be part
                    # of our number anymore
                    break

                elif self.nc == ".":
                    self.advance()
                    if not decimal_allowed:
                        if int_base == 10:
                            msg = "decimal point is not allowed here"
                        else:
                            msg = ("base %u integer may not contain a"
                                   " decimal point" % int_base)
                        self.mh.lex_error(
                            Source_Reference(lexer      = self,
                                             start_line = start_line,
                                             start_col  = start_col,
                                             start_pos  = self.lexpos,
                                             end_pos    = self.lexpos),
                            msg)
                    # At most one decimal point: further ones are
                    # rejected by the branch above.
                    decimal_allowed = False
                    require_digit   = True
                    kind            = "DECIMAL"

                else:  # pragma: no cover
                    # This is actually a false alarm: this line is
                    # covered (it's the only normal way to exit this
                    # loop).
                    break

            # A trailing '_' or '.' (or a bare 0b/0x prefix) leaves
            # require_digit set.
            if require_digit:
                self.mh.lex_error(
                    Source_Reference(lexer      = self,
                                     start_line = start_line,
                                     start_col  = start_col,
                                     start_pos  = start_pos,
                                     end_pos    = self.lexpos),
                    "unfinished base %u integer" % int_base)

        else:
            self.mh.lex_error(self.current_location(),
                              "unexpected character '%s'" % self.cc)

        # end_pos is clamped in case the scan ran to end of buffer.
        sref = Source_Reference(lexer      = self,
                                start_line = start_line,
                                start_col  = start_col,
                                start_pos  = start_pos,
                                end_pos    = min(self.lexpos,
                                                 self.length - 1))

        # Compute the semantic value from the raw token text.
        if kind == "IDENTIFIER":
            value = sref.text()
            if value in TRLC_Lexer.KEYWORDS:
                # lobster-trace: LRM.TRLC_Keywords
                kind = "KEYWORD"

        elif kind == "OPERATOR":
            value = sref.text()

        elif kind == "STRING":
            value = sref.text()
            if value.startswith('"""'):
                value = triple_quoted_string_value(value)
            elif value.startswith('"'):
                # lobster-trace: LRM.Simple_String_Value
                # Strip the quotes and undo the \" escape.
                value = value[1:-1]
                value = value.replace('\\"', '"')
            else:
                # '''-quoted string.
                value = triple_quoted_string_value(value)

        elif kind == "INTEGER":
            # lobster-trace: LRM.Integer_Values
            # Underscore separators carry no meaning; the 0b/0x
            # prefix is stripped before conversion.
            base_text = sref.text().replace("_", "")
            if int_base == 10:
                value = int(base_text)
            elif int_base == 2:
                value = int(base_text[2:], base=2)
            else:
                value = int(base_text[2:], base=16)

        elif kind == "DECIMAL":
            # lobster-trace: LRM.Decimal_Values
            value = Fraction(sref.text().replace("_", ""))

        elif kind == "COMMENT":
            # Strip the comment markers and surrounding whitespace.
            value = sref.text()
            if value.startswith("//"):
                value = value[2:].strip()
            else:
                value = value[2:]
                if value.endswith("*/"):
                    value = value[:-2]
                value = value.strip()

        else:
            value = None

        return Token(sref, kind, value)

635 

636 

class Token_Stream(TRLC_Lexer):
    """A TRLC_Lexer that additionally records every token it hands
    out in self.tokens."""

    def token(self):
        result = super().token()
        if result is None:
            return None
        self.tokens.append(result)
        return result

644 

645 

def sanity_test():
    """Tokenise the file named on the command line and report every
    token as a warning (developer aid)."""
    # lobster-exclude: Developer test function
    mh = Message_Handler()
    lexer = TRLC_Lexer(mh, sys.argv[1])

    # token() returns None at end of input, which iter() uses as the
    # stop sentinel.
    for token in iter(lexer.token, None):
        mh.warning(token.location, str(token))

657 

658 

if __name__ == "__main__":
    # Allow running this module directly as a quick lexer smoke test.
    sanity_test()