#! /usr/bin/python

# SPDX-License-Identifier: GPL-3.0-or-later
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""This class implements a lexical analyser capable
|
|
of consuming BASIC statements and commands and returning
|
|
a corresponding list of tokens.
|
|
|
|
>>> lexer = Lexer()
|
|
>>> tokenlist = lexer.tokenize('100 LET I = 10')
|
|
>>> tokenlist[0].pretty_print()
|
|
Column: 0 Category: UNSIGNEDINT Lexeme: 100
|
|
>>> tokenlist = lexer.tokenize('100 IF I <> 10')
|
|
>>> tokenlist[3].pretty_print()
|
|
Column: 9 Category: NOTEQUAL Lexeme: <>
|
|
>>> tokenlist = lexer.tokenize('100 LET I = 3.45')
|
|
>>> tokenlist[4].pretty_print()
|
|
Column: 12 Category: UNSIGNEDFLOAT Lexeme: 3.45
|
|
>>> tokenlist = lexer.tokenize('100 LET I = "HELLO"')
|
|
>>> tokenlist[4].pretty_print()
|
|
Column: 12 Category: STRING Lexeme: HELLO
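
String variable names may carry a trailing $ (the expected output
below assumes the category-name formatting shown in the examples
above):

>>> tokenlist = lexer.tokenize('100 LET A$ = "HELLO"')
>>> tokenlist[2].pretty_print()
Column: 8 Category: NAME Lexeme: A$

An unterminated string constant is reported as an error:

>>> lexer.tokenize('100 PRINT "HELLO')
Traceback (most recent call last):
    ...
SyntaxError: Mismatched quotes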
"""

from basictoken import BASICToken as Token


class Lexer:

    def __init__(self):

        self.__column = 0   # Current column number
        self.__stmt = ''    # Statement string being processed

    def tokenize(self, stmt):
        """Returns a list of tokens obtained by
        lexical analysis of the specified
        statement.

        """
        self.__stmt = stmt
        self.__column = 0

        # Establish a list of tokens to be
        # derived from the statement
        tokenlist = []

        # Process every character until we
        # reach the end of the statement string
        c = self.__get_next_char()
        while c != '':
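
            # At this point c always holds the next character
            # that has not yet been consumed into any lexeme,
            # or '' at the end of the statement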

            # Skip any preceding whitespace
            while c.isspace():
                c = self.__get_next_char()

            # If the whitespace ran to the end of the statement
            # there is no further token to construct
            if c == '':
                break

            # Construct an empty token. __get_next_char() has already
            # advanced the column one place past c, hence the -1
            token = Token(self.__column - 1, None, '')

            # Process strings
            if c == '"':
                token.category = Token.STRING

                # Consume all of the characters
                # until we reach the terminating
                # quote. Do not store the quotes
                # in the lexeme
                c = self.__get_next_char()  # Advance past opening quote

                # We explicitly support empty strings
                if c == '"':
                    # String is empty, leave lexeme as ''
                    # and advance past terminating quote
                    c = self.__get_next_char()

                else:
                    while True:
                        token.lexeme += c  # Append the current char to the lexeme
                        c = self.__get_next_char()

                        if c == '':
                            raise SyntaxError("Mismatched quotes")

                        if c == '"':
                            c = self.__get_next_char()  # Advance past terminating quote
                            break

            # Process numbers
            elif c.isdigit() or c == '.':
                token.category = Token.UNSIGNEDINT
                found_point = False
                if c == '.':
                    token.category = Token.UNSIGNEDFLOAT
                    found_point = True

                # Consume all of the digits, including any decimal point
                while True:
                    token.lexeme += c  # Append the current char to the lexeme
                    c = self.__get_next_char()

                    # Break if the next character is not a digit
                    # and not the first decimal point
                    if not c.isdigit():
                        if c == '.':
                            if not found_point:
                                found_point = True
                                token.category = Token.UNSIGNEDFLOAT

                            else:
                                # Another decimal point found
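                                # (the second point is left unconsumed,
                                # so it will start a new token on the
                                # next pass of the outer loop)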
                                break

                        else:
                            break

            # Process keywords and names
            elif c.isalpha():
                # Consume all of the letters
                while True:
                    token.lexeme += c  # Append the current char to the lexeme
                    c = self.__get_next_char()

                    # Break if not a letter, digit, underscore or dollar
                    # symbol (the latter is used for string variable names)
                    if not (c.isalpha() or c.isdigit() or c == '_' or c == '$'):
                        break

                # Normalise keywords and names to upper case
                token.lexeme = token.lexeme.upper()

                # Determine if the lexeme is a variable name or a
                # reserved word
                if token.lexeme in Token.keywords:
                    token.category = Token.keywords[token.lexeme]

                else:
                    token.category = Token.NAME

                # Remark statements - process the rest of the
                # statement without checks
                if token.lexeme == "REM":
                    while c != '':
                        token.lexeme += c  # Append the current char to the lexeme
                        c = self.__get_next_char()

            # Process operator symbols
            elif c in Token.smalltokens:
                save = c
                c = self.__get_next_char()  # c might be '' (end of stmt)
                twochar = save + c
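
                # Prefer the longer match ("maximal munch"): a two-character
                # operator such as <> is recognised as a single token rather
                # than as two one-character tokens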
                if twochar in Token.smalltokens:
                    token.category = Token.smalltokens[twochar]
                    token.lexeme = twochar
                    c = self.__get_next_char()  # Move past end of token

                else:
                    # One char token
                    token.category = Token.smalltokens[save]
                    token.lexeme = save

            # We do not recognise this character
            elif c != '':
                raise SyntaxError('Syntax error')

            # Append the new token to the list
            tokenlist.append(token)

        return tokenlist

    def __get_next_char(self):
        """Returns the next character in the
        statement, unless the last character has already
        been processed, in which case, the empty string is
        returned.

        """
        if self.__column < len(self.__stmt):
            next_char = self.__stmt[self.__column]
            self.__column = self.__column + 1

            return next_char

        else:
            return ''


if __name__ == "__main__":
    import doctest
    doctest.testmod()
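
    # Optional interactive demonstration, gated behind a hypothetical
    # --demo flag so that a plain run still just executes the doctests:
    # type a BASIC statement to see its token stream; an empty line
    # or end-of-file exits
    import sys
    if '--demo' in sys.argv:
        lexer = Lexer()
        while True:
            try:
                stmt = input('BASIC> ')
            except EOFError:
                break
            if stmt == '':
                break
            try:
                for token in lexer.tokenize(stmt):
                    token.pretty_print()
            except SyntaxError as e:
                print(e)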