#! /usr/bin/python
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""This class implements a lexical analyser capable
of consuming BASIC statements and commands and returning
a corresponding list of tokens.
>>> lexer = Lexer()
>>> tokenlist = lexer.tokenize('100 LET I = 10')
>>> tokenlist[0].pretty_print()
Column: 0 Category: UNSIGNEDINT Lexeme: 100
>>> tokenlist = lexer.tokenize('100 IF I <> 10')
>>> tokenlist[3].pretty_print()
Column: 9 Category: NOTEQUAL Lexeme: <>
>>> tokenlist = lexer.tokenize('100 LET I = 3.45')
>>> tokenlist[4].pretty_print()
Column: 12 Category: UNSIGNEDFLOAT Lexeme: 3.45
>>> tokenlist = lexer.tokenize('100 LET I = "HELLO"')
>>> tokenlist[4].pretty_print()
Column: 12 Category: STRING Lexeme: HELLO
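
An identifier that is not a reserved word is categorised as a
NAME token (an added example; the expected output assumes that
pretty_print prints the category name, as in the examples above):
>>> tokenlist = lexer.tokenize('100 LET I = 10')
>>> tokenlist[2].pretty_print()
Column: 8 Category: NAME Lexeme: I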
"""
from basictoken import BASICToken as Token


class Lexer:
def __init__(self):
self.__column = 0 # Current column number
self.__stmt = '' # Statement string being processed

    def tokenize(self, stmt):
"""Returns a list of tokens obtained by
lexical analysis of the specified
statement.
"""
self.__stmt = stmt
self.__column = 0
# Establish a list of tokens to be
# derived from the statement
tokenlist = []
# Process every character until we
# reach the end of the statement string
c = self.__get_next_char()
while c != '':
            # Skip any preceding whitespace
            while c.isspace():
                c = self.__get_next_char()
            # The statement may end with trailing whitespace,
            # in which case there is no further token to construct
            if c == '':
                break
            # Construct a token, column count already
            # incremented
            token = Token(self.__column - 1, None, '')
# Process strings
if c == '"':
token.category = Token.STRING
# Consume all of the characters
# until we reach the terminating
# quote. Do not store the quotes
# in the lexeme
c = self.__get_next_char() # Advance past opening quote
# We explicitly support empty strings
if c == '"':
# String is empty, leave lexeme as ''
# and advance past terminating quote
c = self.__get_next_char()
else:
while True:
token.lexeme += c # Append the current char to the lexeme
c = self.__get_next_char()
if c == '':
raise SyntaxError("Mismatched quotes")
if c == '"':
c = self.__get_next_char() # Advance past terminating quote
break
# Process numbers
elif c.isdigit() or c == '.':
token.category = Token.UNSIGNEDINT
found_point = False
if c == '.':
token.category = Token.UNSIGNEDFLOAT
found_point = True
# Consume all of the digits, including any decimal point
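                # e.g. 3.45.6 lexes as 3.45 followed by .6, since a
                # second decimal point ends the current token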
while True:
token.lexeme += c # Append the current char to the lexeme
c = self.__get_next_char()
                    # Stop consuming unless the next character is
                    # a digit or the first decimal point
if not c.isdigit():
if c == '.':
if found_point is False:
found_point = True
token.category = Token.UNSIGNEDFLOAT
else:
# Another decimal point found
break
else:
break
# Process keywords and names
elif c.isalpha():
# Consume all of the letters
while True:
                    token.lexeme += c # Append the current char to the lexeme
                    c = self.__get_next_char()
                    # Break unless the character is a letter, digit,
                    # underscore or dollar symbol (the latter is
                    # used for string variable names)
if not ((c.isalpha() or c.isdigit()) or c == '_' or c == '$'):
break
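                # e.g. A$ and COUNT_1 are each consumed as a single name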
# Normalise keywords and names to upper case
token.lexeme = token.lexeme.upper()
# Determine if the lexeme is a variable name or a
# reserved word
if token.lexeme in Token.keywords:
token.category = Token.keywords[token.lexeme]
else:
token.category = Token.NAME
                # Remark statements: consume the rest of the
                # statement without any further lexical checks
                if token.lexeme == "REM":
                    while c != '':
token.lexeme += c # Append the current char to the lexeme
c = self.__get_next_char()
# Process operator symbols
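            # Look ahead one character so that a two character token
            # such as <> is matched in preference to a one character token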
elif c in Token.smalltokens:
save = c
c = self.__get_next_char() # c might be '' (end of stmt)
twochar = save + c
if twochar in Token.smalltokens:
token.category = Token.smalltokens[twochar]
token.lexeme = twochar
c = self.__get_next_char() # Move past end of token
else:
# One char token
token.category = Token.smalltokens[save]
token.lexeme = save
            # We do not recognise this character
            elif c != '':
                raise SyntaxError("Unrecognised character '" + c +
                                  "' in column " + str(self.__column - 1))
# Append the new token to the list
tokenlist.append(token)
return tokenlist

    def __get_next_char(self):
        """Returns the next character in the
        statement, unless the last character has already
        been processed, in which case the empty string is
        returned.
        """
if self.__column < len(self.__stmt):
next_char = self.__stmt[self.__column]
            self.__column += 1
return next_char
else:
return ''


if __name__ == "__main__":
import doctest
doctest.testmod()
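
    # As a small extra check beyond the doctests, lex a sample
    # statement and pretty print every token (this relies only on
    # the tokenize and pretty_print calls already used above)
    lexer = Lexer()
    for token in lexer.tokenize('10 PRINT "HELLO WORLD"'):
        token.pretty_print()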