Coverage for trlc/lexer.py: 100%

324 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2025-03-27 00:52 +0000

1#!/usr/bin/env python3 

2# 

3# TRLC - Treat Requirements Like Code 

4# Copyright (C) 2022-2023 Bayerische Motoren Werke Aktiengesellschaft (BMW AG) 

5# 

6# This file is part of the TRLC Python Reference Implementation. 

7# 

8# TRLC is free software: you can redistribute it and/or modify it 

9# under the terms of the GNU General Public License as published by 

10# the Free Software Foundation, either version 3 of the License, or 

11# (at your option) any later version. 

12# 

13# TRLC is distributed in the hope that it will be useful, but WITHOUT 

14# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 

15# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 

16# License for more details. 

17# 

18# You should have received a copy of the GNU General Public License 

19# along with TRLC. If not, see <https://www.gnu.org/licenses/>. 

20 

21import sys 

22from fractions import Fraction 

23from abc import ABCMeta, abstractmethod 

24 

25from trlc.errors import Location, Message_Handler 

26 

27 

def triple_quoted_string_value(raw_value):
    """Return the cooked value of a triple-quoted string literal.

    ``raw_value`` must include the surrounding quote triples (either
    style).  The quotes are stripped, surrounding whitespace removed,
    and the common leading whitespace -- as measured on the second
    non-empty line -- is removed from every continuation line.
    """
    # lobster-trace: LRM.Complex_String_Value
    assert isinstance(raw_value, str)
    assert len(raw_value) >= 6
    assert raw_value.startswith("'''") or raw_value.startswith('"""')
    assert raw_value[:3] == raw_value[-3:]

    lines = raw_value[3:-3].strip().splitlines()
    if not lines:
        return ""

    non_empty_lines = [line for line in lines if line.strip()]

    value      = lines[0]
    common_ws  = ""
    common_len = 0
    if len(non_empty_lines) >= 2:
        # Candidate common prefix: the indentation of the second
        # non-empty line.  This loop always breaks, because a
        # non-empty line contains at least one non-blank character.
        for char in non_empty_lines[1]:  # pragma: no cover
            if char not in (" \t"):
                break
            common_ws  += char
            common_len += 1
        else:
            return value

        # Shrink the candidate prefix until it matches every
        # remaining non-empty line.
        for src_line in lines[2:]:
            if not src_line.strip():
                continue
            for col in range(common_len):
                if not (col < len(src_line) and
                        src_line[col] == common_ws[col]):
                    common_len = col
                    break

    # Re-assemble: first line verbatim, continuation lines with the
    # common prefix removed and trailing whitespace stripped.
    for src_line in lines[1:]:
        value += "\n" + src_line[common_len:].rstrip()

    return value

68 

69 

class Source_Reference(Location):
    """A Location that also records the exact character span
    (start_pos .. end_pos, inclusive) in the lexer's buffer, so the
    spanned text and its surrounding line can be reproduced in
    diagnostics.
    """

    def __init__(self, lexer, start_line, start_col, start_pos, end_pos):
        assert isinstance(lexer, TRLC_Lexer)
        assert isinstance(start_line, int)
        assert isinstance(start_col, int)
        assert isinstance(start_pos, int)
        assert isinstance(end_pos, int)
        assert 0 <= start_pos <= end_pos < lexer.length
        super().__init__(lexer.file_name,
                         start_line,
                         start_col)
        # Keep the lexer so the buffer can be sliced later.
        self.lexer     = lexer
        self.start_pos = start_pos
        self.end_pos   = end_pos

    def text(self):
        """Return the source text covered by this span (inclusive)."""
        return self.lexer.content[self.start_pos:self.end_pos + 1]

    def context_lines(self):
        """Return two strings: the (left-stripped) line containing
        start_pos, and a caret line underlining the span.
        """
        line = ""
        n = self.start_pos
        # Walk backwards to the previous newline (or buffer start),
        # collecting the characters up to and including start_pos.
        while n >= 0:
            if self.lexer.content[n] == "\n":
                break
            line = self.lexer.content[n] + line
            n -= 1
        # Column of start_pos within this line.
        offset = self.start_pos - n - 1
        # Walk forwards to the next newline (or buffer end).
        n = self.start_pos + 1
        while n < self.lexer.length:
            if self.lexer.content[n] == "\n":
                break
            line = line + self.lexer.content[n]
            n += 1
        # Characters available to the right of start_pos on this line.
        maxtrail = n - self.start_pos
        # Length of the span to underline.
        tlen = self.end_pos + 1 - self.start_pos

        # Strip leading whitespace and shift the caret accordingly.
        stripped_line = line.lstrip()
        stripped_offset = offset - (len(line) - len(stripped_line))

        return [stripped_line,
                " " * stripped_offset + "^" * min(tlen, maxtrail)]

    def get_end_location(self):
        """Return a plain Location for the end of the span."""
        # The number of newlines inside the span gives the line delta
        # relative to the (inherited) starting line number.
        lines_in_between = self.lexer.content[
            self.start_pos : self.end_pos + 1
        ].count("\n")
        end_line = self.line_no + lines_in_between

        end_col = self.end_pos + 1
        # Scan backwards for the newline preceding end_pos to compute
        # the column.
        # NOTE(review): range(.., 1, -1) stops at index 2, so a
        # newline at buffer index 0 or 1 is never seen -- presumably
        # harmless in practice; confirm.
        for n in range(self.end_pos, 1, -1):
            if self.lexer.content[n] == "\n":
                end_col = max(self.end_pos - n, 1)
                break

        return Location(self.file_name, end_line, end_col)

125 

126 

class Token_Base:
    """Root class for tokens: couples a source location with a kind
    tag and an optional, kind-specific value.
    """

    def __init__(self, location, kind, value):
        # Validate the two structured arguments; value is free-form
        # and checked by subclasses.
        assert isinstance(location, Location)
        assert isinstance(kind, str)
        self.location = location   # where the token was read
        self.kind     = kind       # token category tag
        self.value    = value      # kind-specific payload (may be None)

134 

135 

class Token(Token_Base):
    """Concrete TRLC token.

    KIND maps each kind tag to the human-readable description used
    in diagnostic messages.
    """
    KIND = {
        "COMMENT"    : "comment",
        "IDENTIFIER" : "identifier",
        "KEYWORD"    : "keyword",
        "BRA"        : "opening parenthesis '('",
        "KET"        : "closing parenthesis ')'",
        "S_BRA"      : "opening bracket '['",
        "S_KET"      : "closing bracket ']'",
        "C_BRA"      : "opening brace '{'",
        "C_KET"      : "closing brace '}'",
        "COMMA"      : "comma ','",
        # Fixed typo in diagnostic text: "separtor" -> "separator"
        # (now consistent with SEMICOLON and COLON below).
        "AT"         : "separator '@'",
        "SEMICOLON"  : "separator ';'",
        "COLON"      : "separator ':'",
        "DOT"        : ".",
        "RANGE"      : "..",
        "ASSIGN"     : "=",
        "OPERATOR"   : "operator",
        "ARROW"      : "->",
        "INTEGER"    : "integer literal",
        "DECIMAL"    : "decimal literal",
        "STRING"     : "string literal",
    }

    def __init__(self, location, kind, value=None, ast_link=None):
        """Create a token; the value's type must match the kind.

        Kinds not listed in the checks below carry no value.
        """
        assert kind in Token.KIND
        if kind in ("COMMENT", "IDENTIFIER",
                    "KEYWORD", "OPERATOR", "STRING"):
            assert isinstance(value, str)
        elif kind == "INTEGER":
            assert isinstance(value, int)
        elif kind == "DECIMAL":
            assert isinstance(value, Fraction)
        else:
            assert value is None
        super().__init__(location, kind, value)
        # Optional back-link to the AST node built from this token.
        self.ast_link = ast_link

    def __repr__(self):
        if self.value is None:
            return "%s_Token" % self.kind
        else:
            return "%s_Token(%s)" % (self.kind, self.value)

180 

181 

class Lexer_Base(metaclass=ABCMeta):
    """Abstract lexer base.

    Maintains a three-character lookahead window over a string
    buffer: cc (current), nc (next) and nnc (next-next), each a
    one-character string or None past the end of input.
    """

    def __init__(self, mh, content):
        assert isinstance(mh, Message_Handler)
        assert isinstance(content, str)
        self.mh      = mh
        self.content = content
        self.length  = len(self.content)
        self.tokens  = []

        # Window state; two priming advances below move cc onto the
        # first character of the buffer.
        self.lexpos  = -3
        self.line_no = 0
        self.col_no  = 0
        self.cc = self.nc = self.nnc = None

        for _ in range(2):
            self.advance()

    @staticmethod
    def is_alpha(char):
        # lobster-trace: LRM.Identifier
        # ASCII letters only; non-ASCII alphabetics are rejected.
        return char.isascii() and char.isalpha()

    @staticmethod
    def is_numeric(char):
        # lobster-trace: LRM.Integers
        # lobster-trace: LRM.Decimals
        # ASCII digits only.
        return char.isascii() and char.isdigit()

    @staticmethod
    def is_alnum(char):
        # lobster-trace: LRM.Identifier
        # ASCII letters and digits only.
        return char.isascii() and char.isalnum()

    @abstractmethod
    def file_location(self):
        pass

    @abstractmethod
    def token(self):
        pass

    def skip_whitespace(self):
        # lobster-trace: LRM.Whitespace
        # Consume whitespace by lookahead, then step once so cc lands
        # on the first non-whitespace character.
        while self.nc and self.nc.isspace():
            self.advance()
        self.advance()

    def advance(self):
        self.lexpos += 1
        # A consumed newline (or the very first advance onto the
        # buffer) starts a new line.
        if self.cc == "\n" or self.lexpos == 0:
            self.line_no += 1
            self.col_no = 0
        if self.nc is not None:
            self.col_no += 1
        # Shift the lookahead window left by one character.
        self.cc, self.nc = self.nc, self.nnc
        peek = self.lexpos + 2
        self.nnc = self.content[peek] if peek < self.length else None

243 

244 

class TRLC_Lexer(Lexer_Base):
    """Lexer for the TRLC language.

    Reads a file (or a supplied string) and produces Token objects
    through repeated calls to token(); lexical errors are reported
    via the message handler.
    """

    # Reserved words; an IDENTIFIER matching one of these is
    # re-classified as a KEYWORD token in token().
    KEYWORDS = frozenset([
        "abs",
        "abstract",
        "and",
        "checks",
        "else",
        "elsif",
        "enum",
        "error",
        "exists",
        "extends",
        "false",
        "fatal",
        "final",
        "forall",
        "freeze",
        "if",
        "implies",
        "import",
        "in",
        "not",
        "null",
        "optional",
        "or",
        "package",
        "section",
        "separator",
        "then",
        "true",
        "tuple",
        "type",
        "warning",
        "xor"
    ])

    # Single-character tokens that need no further lookahead.
    PUNCTUATION = {
        "(" : "BRA",
        ")" : "KET",
        "{" : "C_BRA",
        "}" : "C_KET",
        "[" : "S_BRA",
        "]" : "S_KET",
        "," : "COMMA",
        "@" : "AT",
        ":" : "COLON",
        ";" : "SEMICOLON",
        "/" : "OPERATOR",
        "%" : "OPERATOR",
        "+" : "OPERATOR",
        "-" : "OPERATOR",
    }

    def __init__(self, mh, file_name, file_content=None):
        """Create a lexer for file_name.

        If file_content is None the file is read from disk (UTF-8);
        otherwise the given string is lexed and file_name is used
        only in locations and error messages.
        """
        assert isinstance(file_name, str)
        assert isinstance(file_content, str) or file_content is None
        self.file_name = file_name
        if file_content is None:
            # lobster-trace: LRM.File_Encoding
            # lobster-trace: LRM.File_Encoding_Fixed
            with open(file_name, "r", encoding="UTF-8") as fd:
                try:
                    super().__init__(mh, fd.read())
                except UnicodeDecodeError as err:
                    # Undecodable input is reported as a lex error at
                    # the file level.
                    mh.lex_error(Location(file_name), str(err))
        else:
            super().__init__(mh, file_content)

    def current_location(self):
        """Return a one-character Source_Reference for the current
        lexer position."""
        # lobster-exclude: Utility function
        return Source_Reference(lexer      = self,
                                start_line = self.line_no,
                                start_col  = self.col_no,
                                start_pos  = self.lexpos,
                                end_pos    = self.lexpos)

    def file_location(self):
        """Return a Location pointing at the start of the file."""
        # lobster-exclude: Utility function
        return Location(self.file_name, 1, 1)

    def token(self):
        """Return the next Token, or None at end of input."""
        # Skip whitespace and move to the next char
        self.skip_whitespace()

        # Return if we're done
        if self.cc is None:
            return None

        # Remember where this token begins.
        start_pos  = self.lexpos
        start_line = self.line_no
        start_col  = self.col_no

        if self.cc == "/" and self.nc == "/":
            # lobster-trace: LRM.Comments
            # Line comment: consume up to (but not including) the
            # next newline.
            kind = "COMMENT"
            while self.cc and self.nc != "\n":
                self.advance()

        elif self.cc == "/" and self.nc == "*":
            # lobster-trace: LRM.Comments
            # Block comment: consume until the closing */ (the final
            # advance takes in the trailing '/').
            kind = "COMMENT"
            while self.nc and not (self.cc == "*" and self.nc == "/"):
                self.advance()
            self.advance()

        elif self.is_alpha(self.cc):
            # lobster-trace: LRM.Identifier
            # Identifier: a letter followed by letters, digits or
            # underscores.
            kind = "IDENTIFIER"
            while self.nc and (self.is_alnum(self.nc) or
                               self.nc == "_"):
                self.advance()

        elif self.cc in TRLC_Lexer.PUNCTUATION:
            # lobster-trace: LRM.Single_Delimiters
            kind = TRLC_Lexer.PUNCTUATION[self.cc]

        elif self.cc == "=":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            # '=>' (arrow), '==' (operator) or plain '=' (assign).
            if self.nc == ">":
                kind = "ARROW"
                self.advance()
            elif self.nc == "=":
                kind = "OPERATOR"
                self.advance()
            else:
                kind = "ASSIGN"

        elif self.cc == ".":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            # '..' (range) or plain '.' (dot).
            if self.nc == ".":
                kind = "RANGE"
                self.advance()
            else:
                kind = "DOT"

        elif self.cc in ("<", ">"):
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            # '<', '>', '<=' or '>='.
            kind = "OPERATOR"
            if self.nc == "=":
                self.advance()

        elif self.cc == "!":
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            # Only '!=' is valid; a lone '!' is an error.
            kind = "OPERATOR"
            if self.nc == "=":
                self.advance()
            else:
                self.mh.lex_error(self.current_location(),
                                  "malformed != operator")

        elif self.cc == "*":
            # lobster-trace: LRM.Single_Delimiters
            # lobster-trace: LRM.Double_Delimiters
            # lobster-trace: LRM.Lexing_Disambiguation
            # '*' (multiply) or '**' (power).
            kind = "OPERATOR"
            if self.nc == "*":
                self.advance()

        elif self.cc == '"':
            # lobster-trace: LRM.Strings
            kind = "STRING"
            if self.nc == '"' and self.nnc == '"':
                # Triple-quoted """ string: scan until three
                # consecutive quotes have been seen.
                self.advance()
                self.advance()
                quotes_seen = 0
                while quotes_seen < 3:
                    self.advance()
                    if self.cc == '"':
                        quotes_seen += 1
                    else:
                        quotes_seen = 0
                    # NOTE(review): this check also fires when the
                    # closing quote is the very last character of the
                    # buffer (nc is None even though quotes_seen has
                    # just reached 3) -- confirm intended.
                    if self.nc is None:
                        self.mh.lex_error(
                            Source_Reference(lexer      = self,
                                             start_line = start_line,
                                             start_col  = start_col,
                                             start_pos  = start_pos,
                                             end_pos    = self.lexpos),
                            "unterminated triple-quoted string")
            else:
                # Simple "..." string: single line, with \" as the
                # only recognised escape.
                while self.nc != '"':
                    if self.nc is None:
                        self.mh.lex_error(
                            Source_Reference(lexer      = self,
                                             start_line = start_line,
                                             start_col  = start_col,
                                             start_pos  = start_pos,
                                             end_pos    = self.lexpos),
                            "unterminated string")
                    elif self.nc == "\n":
                        self.mh.lex_error(
                            Source_Reference(lexer      = self,
                                             start_line = start_line,
                                             start_col  = start_col,
                                             start_pos  = start_pos,
                                             end_pos    = self.lexpos),
                            "double quoted strings cannot include newlines")

                    self.advance()
                    # Skip over an escaped quote so it does not
                    # terminate the string.
                    if self.cc == "\\" and self.nc == '"':
                        self.advance()
                # Consume the closing quote.
                self.advance()

        elif self.cc == "'":
            # lobster-trace: LRM.Strings
            # Only the triple-quoted ''' form exists for single
            # quotes; anything else is malformed.
            kind = "STRING"
            for _ in range(2):
                self.advance()
                if self.cc != "'":
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = start_pos,
                                         end_pos    = self.lexpos),
                        "malformed triple-quoted string")
            quotes_seen = 0
            while quotes_seen < 3:
                self.advance()
                if self.cc == "'":
                    quotes_seen += 1
                else:
                    quotes_seen = 0
                # NOTE(review): same end-of-buffer behaviour as the
                # double-quoted variant above -- confirm intended.
                if self.nc is None:
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = start_pos,
                                         end_pos    = self.lexpos),
                        "unterminated triple-quoted string")

        elif self.is_numeric(self.cc):
            # lobster-trace: LRM.Integers
            # lobster-trace: LRM.Decimals
            kind = "INTEGER"

            # Select the digit alphabet from an optional 0b/0x
            # prefix; only plain base-10 numbers may later contain a
            # decimal point.
            if self.cc == "0" and self.nc == "b":
                digits_allowed   = "01"
                digits_forbidden = "23456789abcdefABCDEF"
                int_base         = 2
                require_digit    = True
                decimal_allowed  = False
                self.advance()
            elif self.cc == "0" and self.nc == "x":
                digits_allowed   = "0123456789abcdefABCDEF"
                digits_forbidden = ""
                int_base         = 16
                require_digit    = True
                decimal_allowed  = False
                self.advance()
            else:
                digits_allowed   = "0123456789"
                digits_forbidden = "abcdefABCDEF"
                int_base         = 10
                require_digit    = False
                decimal_allowed  = True

            while self.nc:
                if self.nc in digits_allowed:
                    self.advance()
                    require_digit = False

                elif self.nc in digits_forbidden:
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = self.lexpos + 1,
                                         end_pos    = self.lexpos + 1),
                        "%s is not a valid base %u digit" % (self.nc,
                                                             int_base))

                elif require_digit:
                    self.mh.lex_error(
                        Source_Reference(lexer      = self,
                                         start_line = start_line,
                                         start_col  = start_col,
                                         start_pos  = self.lexpos + 1,
                                         end_pos    = self.lexpos + 1),
                        "base %u digit is required here" % int_base)

                elif self.nc == "_":
                    # Underscore separators are allowed, but must be
                    # followed by another digit.
                    self.advance()
                    require_digit = True

                elif self.nc == "." and self.nnc == ".":
                    # This is a range token, so that one can't be part
                    # of our number anymore
                    break

                elif self.nc == ".":
                    self.advance()
                    if not decimal_allowed:
                        if int_base == 10:
                            msg = "decimal point is not allowed here"
                        else:
                            msg = ("base %u integer may not contain a"
                                   " decimal point" % int_base)
                        self.mh.lex_error(
                            Source_Reference(lexer      = self,
                                             start_line = start_line,
                                             start_col  = start_col,
                                             start_pos  = self.lexpos,
                                             end_pos    = self.lexpos),
                            msg)
                    # At most one decimal point: further ones are
                    # rejected by the branch above.
                    decimal_allowed = False
                    require_digit   = True
                    kind            = "DECIMAL"

                else:  # pragma: no cover
                    # This is actually a false alarm: this line is
                    # covered (it's the only normal way to exit this
                    # loop).
                    break

            # A trailing '_' or '.' (or a bare 0b/0x prefix) leaves
            # require_digit set.
            if require_digit:
                self.mh.lex_error(
                    Source_Reference(lexer      = self,
                                     start_line = start_line,
                                     start_col  = start_col,
                                     start_pos  = start_pos,
                                     end_pos    = self.lexpos),
                    "unfinished base %u integer" % int_base)

        else:
            self.mh.lex_error(self.current_location(),
                              "unexpected character '%s'" % self.cc)

        # end_pos is clamped in case the scan ran to end of buffer.
        sref = Source_Reference(lexer      = self,
                                start_line = start_line,
                                start_col  = start_col,
                                start_pos  = start_pos,
                                end_pos    = min(self.lexpos,
                                                 self.length - 1))

        # Compute the semantic value from the raw token text.
        if kind == "IDENTIFIER":
            value = sref.text()
            if value in TRLC_Lexer.KEYWORDS:
                # lobster-trace: LRM.TRLC_Keywords
                kind = "KEYWORD"

        elif kind == "OPERATOR":
            value = sref.text()

        elif kind == "STRING":
            value = sref.text()
            if value.startswith('"""'):
                value = triple_quoted_string_value(value)
            elif value.startswith('"'):
                # lobster-trace: LRM.Simple_String_Value
                # Strip the quotes and undo the \" escape.
                value = value[1:-1]
                value = value.replace('\\"', '"')
            else:
                # '''-quoted string.
                value = triple_quoted_string_value(value)

        elif kind == "INTEGER":
            # lobster-trace: LRM.Integer_Values
            # Underscore separators carry no meaning; the 0b/0x
            # prefix is stripped before conversion.
            base_text = sref.text().replace("_", "")
            if int_base == 10:
                value = int(base_text)
            elif int_base == 2:
                value = int(base_text[2:], base=2)
            else:
                value = int(base_text[2:], base=16)

        elif kind == "DECIMAL":
            # lobster-trace: LRM.Decimal_Values
            value = Fraction(sref.text().replace("_", ""))

        elif kind == "COMMENT":
            # Strip the comment markers and surrounding whitespace.
            value = sref.text()
            if value.startswith("//"):
                value = value[2:].strip()
            else:
                value = value[2:]
                if value.endswith("*/"):
                    value = value[:-2]
                value = value.strip()

        else:
            value = None

        return Token(sref, kind, value)

635 

636 

class Token_Stream(TRLC_Lexer):
    """A TRLC_Lexer that additionally records every token it hands
    out in self.tokens."""

    def token(self):
        result = super().token()
        if result is None:
            return None
        self.tokens.append(result)
        return result

644 

645 

def sanity_test():
    """Tokenise the file named on the command line and report every
    token as a warning (developer aid)."""
    # lobster-exclude: Developer test function
    mh = Message_Handler()
    lexer = TRLC_Lexer(mh, sys.argv[1])

    # token() returns None at end of input, which iter() uses as the
    # stop sentinel.
    for token in iter(lexer.token, None):
        mh.warning(token.location, str(token))

657 

658 

if __name__ == "__main__":
    # Allow running this module directly as a quick lexer smoke test.
    sanity_test()