commit 8520d2b4bcd1d0a0d85806c035a2b3884fec4030 Author: Amit Patel Date: Wed Sep 9 13:52:11 2009 -0700 Initial version in git diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..ceaa0ca --- /dev/null +++ b/ChangeLog @@ -0,0 +1,108 @@ +2003-08-27 Amit Patel + + * *: (VERSION) Release 2.1.1 + + * *: Added a test/ directory for test cases; I had previously put + tests in the examples/ directory, which is a bad place to put + them. Examples are useful for learning how Yapps works. Tests + are for testing specific features of Yapps. + + * parsetree.py (Plus.update): Fixed a long-standing bug in which + the FOLLOW set of 'a'+ would include 'a'. In theory this makes no + practical difference because the 'a'+ rule eats up all the 'a' + tokens anyway. However, it makes error messages a little bit more + confusing because they imply that an 'a' can follow. + + * yappsrt.py (print_error): Incorporated the context object into + the printing of error messages. + +2003-08-12 Amit Patel + + * *: (VERSION) Release 2.1.0 + + * parsetree.py: Improved error message generation. Instead of + relying on the scanner to produce errors, the parser now checks + things explicitly and produces errors directly. The parser has + better knowledge of the context, so its error messages are more + precise and helpful. + + * yapps_grammar.g: Instead of setting self.rule in the setup() + method, pass it in the constructor. To make it available at + construction time, pass it along as another attribute in the + attribute grammar. + +2003-08-11 Amit Patel + + * parsetree.py: Generated parsers now include a context object + that describes the parse rule stack. For example, while parsing + rule A, called from rule B, called from rule D, the context object + will let you reconstruct the path D > B > A. [Thanks David Morley] + + * *: Removed all output when things are working + properly; all warnings/errors now go to stderr. + + * yapps_grammar.g: Added support for A? meaning an optional A. + This is equivalent to [A]. + + * yapps2.py: Design - refactored yapps2.py into yapps2.py + + grammar.py + parsetree.py. grammar.py is automatically generated + from grammar.g. Added lots of docstrings. + +2003-08-09 Amit Patel + + * yapps2.py: Documentation - added doctest tests to some of the + set algorithms in class Generator. + + * yapps2.py: Style - removed "import *" everywhere. + + * yapps2.py: Style - moved to Python 2 -- string methods, + list comprehensions, inline syntax for apply + +2003-07-28 Amit Patel + + * *: (VERSION) Release 2.0.4 + + * yappsrt.py: Style - replaced raising string exceptions + with raising class exceptions. [Thanks Alex Verstak] + + * yappsrt.py: (SyntaxError) Bug fix - SyntaxError.__init__ should + call Exception.__init__ + + * yapps2.py: Bug fix - identifiers in grammar rules that had + digits in them were not accessible in the {{python code}} sections + of the grammar. + + * yapps2.py: Style - changed "b >= a and b < c" to "a <= b < c" + + * yapps2.py: Style - change "`expr`" to "repr(expr)" + +2002-08-00 Amit Patel + + * *: (VERSION) Release 2.0.3 + + * yapps2.py: Bug fix - inline tokens using the r"" syntax weren't + treated properly. + +2002-04-00 Amit Patel + + * *: (VERSION) Release 2.0.2 + + * yapps2.py: Bug fix - when generating the "else" clause, if the + comment was too long, Yapps was not emitting a newline. 
[Thanks
+	Steven Engelhardt]
+
+2001-10-00  Amit Patel
+
+	* *: (VERSION) Release 2.0.1
+
+	* yappsrt.py: (SyntaxError) Style - the exception classes now
+	inherit from Exception.  [Thanks Rich Salz]
+
+	* yappsrt.py: (Scanner) Performance - instead of passing the set
+	of tokens into the scanner at initialization time, we build the
+	list at compile time.  You can still override the default list per
+	instance of the scanner, but in the common case, we don't have to
+	rebuild the token list.  [Thanks Amaury Forgeot d'Arc]
+
+
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..87fd179
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,20 @@
+
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/NOTES b/NOTES
new file mode 100644
index 0000000..b6fe941
--- /dev/null
+++ b/NOTES
@@ -0,0 +1,78 @@
+[Last updated August 11, 2003]
+
+Notes for myself:
+
+Document the LINENO trick
+
+Add a way to have a self-contained mode that doesn't require yappsrt?
+
+Add a debugging mode that helps you understand how the grammar
+  is constructed and how things are being parsed
+
+Optimize (remove) unused variables
+
+Yapps produces a bunch of inline list literals.  We should be able to
+  instead create these lists as class variables (but this makes it
+  harder to read the code).  Also, 'A in X' could be written
+  'X.has_key(A)' if we can convert the lists into dictionaries ahead
+  of time.
+
+Add a convenience to automatically gather up the values returned
+  from subpatterns, put them into a list, and return them
+
+"Gather" mode that simply outputs the return values for certain nodes.
+  For example, if you just want all expressions, you could ask yapps
+  to gather the results of the 'expr' rule into a list.  This would
+  ignore all the higher level structure.
+
+Improve the documentation
+
+Write some larger examples (probably XML/HTML)
+
+EOF needs to be dealt with.  It's probably a token that can match anywhere.
+
+Get rid of old-style regex support
+
+Use SRE's lex support to speed up lexing (this may be hard given that
+  yapps allows for context-sensitive lexers)
+
+Look over Dan Connolly's experience with Yapps (bugs, frustrations, etc.)
+  and see what improvements could be made
+
+Add something to pretty-print the grammar (without the actions)
+
+Maybe conditionals?  Follow this rule only if <condition> holds.
+  But this would be useful mainly when multiple rules match, and we
+  want the first matching rule.  The conditional would mean we skip to
+  the next rule.  Maybe this is part of the attribute grammar system,
+  where rule X<0> can be specified separately from X.
+
+Convenience functions that could build return values for all rules
+  without specifying the code for each rule individually
+
+Patterns (abstractions over rules) -- for example, comma separated values
+  have a certain rule pattern that gets replicated all over the place
+
+  These are rules that take other rules as parameters.
+
+  rule list: {{ result = [] }}
+    [ element {{ result.append(element) }}
+      ( separator element {{ result.append(element) }}
+      )*
+    ] {{ return result }}
+
+Inheritance of parser and scanner classes.  The base class (Parser)
+  may define common tokens like ID, STR, NUM, space, comments, EOF,
+  etc., and common rules/patterns like optional, sequence,
+  delimiter-separated sequence.
+
+Why do A? and (A | ) produce different code?  It seems that they
+should produce the very same code.
+
+Look at everyone's Yapps grammars, and come up with larger examples
+   http://www.w3.org/2000/10/swap/SemEnglish.g
+   http://www.w3.org/2000/10/swap/kifExpr.g
+   http://www.w3.org/2000/10/swap/rdfn3.g
+
+Construct lots of erroneous grammars and see what Yapps does with them
+  (improve error reporting)
diff --git a/examples/calc.g b/examples/calc.g
new file mode 100644
index 0000000..8bdea43
--- /dev/null
+++ b/examples/calc.g
@@ -0,0 +1,64 @@
+# This calculator supports the usual (numbers, add, subtract,
+# multiply, divide), global variables (stored in a global variable in
+# Python), and local variables (stored in an attribute passed around
+# in the grammar).
+
+
+globalvars = {}       # We will store the calculator's variables here
+
+def lookup(map, name):
+    for x,v in map:
+        if x == name: return v
+    if not globalvars.has_key(name): print 'Undefined (defaulting to 0):', name
+    return globalvars.get(name, 0)
+
+%%
+parser Calculator:
+    ignore:    "[ \r\t\n]+"
+    token END: "$"
+    token NUM: "[0-9]+"
+    token VAR: "[a-zA-Z_]+"
+
+    # Each line can either be an expression or an assignment statement
+    rule goal:   expr<<[]>> END            {{ print '=', expr }}
+                                           {{ return expr }}
+               | "set" VAR expr<<[]>> END  {{ globalvars[VAR] = expr }}
+                                           {{ print VAR, '=', expr }}
+                                           {{ return expr }}
+
+    # An expression is the sum and difference of factors
+    rule expr<<V>>:   factor<<V>>        {{ n = factor }}
+                    ( "[+]" factor<<V>>  {{ n = n+factor }}
+                    | "-"   factor<<V>>  {{ n = n-factor }}
+                    )*                   {{ return n }}
+
+    # A factor is the product and division of terms
+    rule factor<<V>>:   term<<V>>        {{ v = term }}
+                      ( "[*]" term<<V>>  {{ v = v*term }}
+                      | "/"   term<<V>>  {{ v = v/term }}
+                      )*                 {{ return v }}
+
+    # A term is a number, variable, or an expression surrounded by parentheses
+    rule term<<V>>:
+                 NUM                     {{ return int(NUM) }}
+               | VAR                     {{ return lookup(V, VAR) }}
+               | "\\(" expr<<V>> "\\)"   {{ return expr }}
+               | "let" VAR "=" expr<<V>> {{ V = [(VAR, expr)] + V }}
+                 "in" expr<<V>>          {{ return expr }}
+%%
+if __name__=='__main__':
+    print 'Welcome to the calculator sample for Yapps 2.'
+    print '  Enter either "<expression>" or "set <variable> <expression>",'
+    print '  or just press return to exit.  An expression can have'
+    print '  local variables:  let x = expr in expr'
+    # We could have put this loop into the parser, by making the
+    # `goal' rule use (expr | set var expr)*, but by putting the
+    # loop into Python code, we can make it interactive (i.e., enter
+    # one expression, get the result, enter another expression, etc.)
+    while 1:
+        try: s = raw_input('>>> ')
+        except EOFError: break
+        if not s.strip(): break
+        parse('goal', s)
+    print 'Bye.'
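
A minimal sketch of how this example is used, assuming yapps2.py, yappsrt.py, and
calc.g sit in the current directory (the transcript below is illustrative, not
part of the commit):

    $ python yapps2.py calc.g      # compiles the grammar into calc.py
    $ python calc.py
    >>> 1 + 2 * 3
    = 7
    >>> set x 10
    x = 10
    >>> let y = 2 in x * y
    = 20

The generator copies the %% sections verbatim around the generated scanner and
parser classes, so the interactive loop above comes straight from the
postparser section of calc.g.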
+
diff --git a/examples/expr.g b/examples/expr.g
new file mode 100644
index 0000000..ae807b7
--- /dev/null
+++ b/examples/expr.g
@@ -0,0 +1,21 @@
+parser Calculator:
+    token END: "$"
+    token NUM: "[0-9]+"
+
+    rule goal: expr END          {{ return expr }}
+
+    # An expression is the sum and difference of factors
+    rule expr: factor            {{ v = factor }}
+             ( "[+]" factor      {{ v = v+factor }}
+             | "-"   factor      {{ v = v-factor }}
+             )*                  {{ return v }}
+
+    # A factor is the product and division of terms
+    rule factor: term            {{ v = term }}
+               ( "[*]" term      {{ v = v*term }}
+               | "/"   term      {{ v = v/term }}
+               )*                {{ return v }}
+
+    # A term is either a number or an expression surrounded by parentheses
+    rule term: NUM               {{ return int(NUM) }}
+             | "\\(" expr "\\)"  {{ return expr }}
diff --git a/examples/lisp.g b/examples/lisp.g
new file mode 100644
index 0000000..634195c
--- /dev/null
+++ b/examples/lisp.g
@@ -0,0 +1,15 @@
+# This parser can parse a simple subset of Lisp's syntax.
+
+parser Lisp:
+    ignore:    r'\s+'
+    token NUM: r'[0-9]+'
+    token ID:  r'[-+*/!@$%^&=.a-zA-Z0-9_]+'
+    token STR: r'"([^\\"]+|\\.)*"'
+
+    rule expr: ID     {{ return ('id',ID) }}
+             | STR    {{ return ('str',eval(STR)) }}
+             | NUM    {{ return ('num',int(NUM)) }}
+             | r"\("
+                  {{ e = [] }}                   # initialize the list
+               ( expr {{ e.append(expr) }} ) *   # put each expr into the list
+               r"\)"  {{ return e }}             # return the list
diff --git a/examples/xml.g b/examples/xml.g
new file mode 100644
index 0000000..cec2709
--- /dev/null
+++ b/examples/xml.g
@@ -0,0 +1,66 @@
+#!/usr/bin/python2
+
+# xml.g
+#
+# Amit J. Patel, August 2003
+#
+# Simple (non-conforming, non-validating) parsing of XML documents,
+# based on Robert D. Cameron's "REX" shallow parser.  It doesn't
+# handle CDATA and lots of other stuff; it's meant to demonstrate
+# Yapps, not replace a proper XML parser.
+
+%%
+
+parser xml:
+    token nodetext: r'[^<>]+'
+    token attrtext_singlequote: "[^']*"
+    token attrtext_doublequote: '[^"]*'
+    token SP: r'\s'
+    token id: r'[a-zA-Z_:][a-zA-Z0-9_:.-]*'
+
+    rule node:
+          r'<!--.*?-->'                   {{ return ['!--comment'] }}
+        | r'<!\[CDATA\[.*?\]\]>'          {{ return ['![CDATA['] }}
+        | r'<!DOCTYPE[^>]*>'              {{ return ['!doctype'] }}
+        | '<' SP* id SP* attributes SP*   {{ startid = id }}
+          ( '>' nodes '</' SP* id SP* '>' {{ assert startid == id, 'Mismatched tags <%s> ... </%s>' % (startid, id) }}
+                                          {{ return [id, attributes] + nodes }}
+          | '/\s*>'                       {{ return [id, attributes] }}
+          )
+        | nodetext                        {{ return nodetext }}
+
+    rule nodes: {{ result = [] }}
+        ( node  {{ result.append(node) }}
+        ) *     {{ return result }}
+
+    rule attribute: id SP* '=' SP*
+        ( '"' attrtext_doublequote '"' {{ return (id, attrtext_doublequote) }}
+        | "'" attrtext_singlequote "'" {{ return (id, attrtext_singlequote) }}
+        )
+
+    rule attributes: {{ result = {} }}
+        ( attribute SP* {{ result[attribute[0]] = attribute[1] }}
+        ) *             {{ return result }}
+
+%%
+
+if __name__ == '__main__':
+    tests = ['',
+             'some text',
+             '< bad xml',
+             '
', + '< spacey a = "foo" / >', + 'text ... ', + ' middle ', + ' foo bar ', + ] + print + print '____Running tests_______________________________________' + for test in tests: + print + try: + parser = xml(xmlScanner(test)) + output = '%s ==> %s' % (repr(test), repr(parser.node())) + except (yappsrt.SyntaxError, AssertionError), e: + output = '%s ==> FAILED ==> %s' % (repr(test), e) + print output diff --git a/grammar.py b/grammar.py new file mode 100644 index 0000000..3bd2a86 --- /dev/null +++ b/grammar.py @@ -0,0 +1,234 @@ +#!/usr/bin/python2 +# +# grammar.py, part of Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. Patel +# +# This version of the Yapps 2 grammar can be distributed under the +# terms of the MIT open source license, either found in the LICENSE +# file included with the Yapps distribution +# or at +# +# + +"""Parser for Yapps grammars. + +This file defines the grammar of Yapps grammars. Naturally, it is +implemented in Yapps. The grammar.py module needed by Yapps is built +by running Yapps on yapps_grammar.g. (Holy circularity, Batman!) + +""" + +import sys, re +import parsetree + +###################################################################### +def cleanup_choice(rule, lst): + if len(lst) == 0: return Sequence(rule, []) + if len(lst) == 1: return lst[0] + return parsetree.Choice(rule, *tuple(lst)) + +def cleanup_sequence(rule, lst): + if len(lst) == 1: return lst[0] + return parsetree.Sequence(rule, *tuple(lst)) + +def resolve_name(rule, tokens, id, args): + if id in [x[0] for x in tokens]: + # It's a token + if args: + print 'Warning: ignoring parameters on TOKEN %s<<%s>>' % (id, args) + return parsetree.Terminal(rule, id) + else: + # It's a name, so assume it's a nonterminal + return parsetree.NonTerminal(rule, id, args) + + +# Begin -- grammar generated by Yapps +import sys, re +import yappsrt + +class ParserDescriptionScanner(yappsrt.Scanner): + patterns = [ + ('"rule"', re.compile('rule')), + ('"ignore"', re.compile('ignore')), + ('"token"', re.compile('token')), + ('"option"', re.compile('option')), + ('":"', re.compile(':')), + ('"parser"', re.compile('parser')), + ('[ \t\r\n]+', re.compile('[ \t\r\n]+')), + ('#.*?\r?\n', re.compile('#.*?\r?\n')), + ('EOF', re.compile('$')), + ('ATTR', re.compile('<<.+?>>')), + ('STMT', re.compile('{{.+?}}')), + ('ID', re.compile('[a-zA-Z_][a-zA-Z_0-9]*')), + ('STR', re.compile('[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"')), + ('LP', re.compile('\\(')), + ('RP', re.compile('\\)')), + ('LB', re.compile('\\[')), + ('RB', re.compile('\\]')), + ('OR', re.compile('[|]')), + ('STAR', re.compile('[*]')), + ('PLUS', re.compile('[+]')), + ('QUEST', re.compile('[?]')), + ('COLON', re.compile(':')), + ] + def __init__(self, str): + yappsrt.Scanner.__init__(self,None,['[ \t\r\n]+', '#.*?\r?\n'],str) + +class ParserDescription(yappsrt.Parser): + Context = yappsrt.Context + def LINENO(self, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'LINENO', []) + return 1 + self._scanner.get_input_scanned().count('\n') + + def Parser(self, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'Parser', []) + self._scan('"parser"') + ID = self._scan('ID') + self._scan('":"') + Options = self.Options(_context) + Tokens = self.Tokens(_context) + Rules = self.Rules(Tokens, _context) + EOF = self._scan('EOF') + return parsetree.Generator(ID,Options,Tokens,Rules) + + def Options(self, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 
'Options', []) + opt = {} + while self._peek() == '"option"': + self._scan('"option"') + self._scan('":"') + Str = self.Str(_context) + opt[Str] = 1 + if self._peek() not in ['"option"', '"token"', '"ignore"', 'EOF', '"rule"']: + raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['"option"', '"token"', '"ignore"', 'EOF', '"rule"'])) + return opt + + def Tokens(self, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'Tokens', []) + tok = [] + while self._peek() in ['"token"', '"ignore"']: + _token = self._peek() + if _token == '"token"': + self._scan('"token"') + ID = self._scan('ID') + self._scan('":"') + Str = self.Str(_context) + tok.append( (ID,Str) ) + elif _token == '"ignore"': + self._scan('"ignore"') + self._scan('":"') + Str = self.Str(_context) + tok.append( ('#ignore',Str) ) + else: + raise yappsrt.SyntaxError(_token[0], 'Could not match Tokens') + if self._peek() not in ['"token"', '"ignore"', 'EOF', '"rule"']: + raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['"token"', '"ignore"', 'EOF', '"rule"'])) + return tok + + def Rules(self, tokens, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'Rules', [tokens]) + rul = [] + while self._peek() == '"rule"': + LINENO = self.LINENO(_context) + self._scan('"rule"') + ID = self._scan('ID') + OptParam = self.OptParam(_context) + self._scan('":"') + ClauseA = self.ClauseA(ID, tokens, _context) + rul.append( (ID, OptParam, ClauseA) ) + if self._peek() not in ['"rule"', 'EOF']: + raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['"rule"', 'EOF'])) + return rul + + def ClauseA(self, rule, tokens, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'ClauseA', [rule, tokens]) + ClauseB = self.ClauseB(rule, tokens, _context) + v = [ClauseB] + while self._peek() == 'OR': + OR = self._scan('OR') + ClauseB = self.ClauseB(rule, tokens, _context) + v.append(ClauseB) + if self._peek() not in ['OR', 'RP', 'RB', '"rule"', 'EOF']: + raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['OR', 'RP', 'RB', '"rule"', 'EOF'])) + return cleanup_choice(rule, v) + + def ClauseB(self, rule, tokens, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'ClauseB', [rule, tokens]) + v = [] + while self._peek() in ['STR', 'ID', 'LP', 'LB', 'STMT']: + ClauseC = self.ClauseC(rule, tokens, _context) + v.append(ClauseC) + if self._peek() not in ['STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'EOF']: + raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'EOF'])) + return cleanup_sequence(rule, v) + + def ClauseC(self, rule, tokens, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'ClauseC', [rule, tokens]) + ClauseD = self.ClauseD(rule, tokens, _context) + _token = self._peek() + if _token == 'PLUS': + PLUS = self._scan('PLUS') + return parsetree.Plus(rule, ClauseD) + elif _token == 'STAR': + STAR = self._scan('STAR') + return parsetree.Star(rule, ClauseD) + elif _token == 'QUEST': + QUEST = self._scan('QUEST') + return parsetree.Option(rule, ClauseD) + elif _token not in ['"ignore"', '"token"', '"option"', '":"', '"parser"', 'ATTR', 
'COLON']: + return ClauseD + else: + raise yappsrt.SyntaxError(_token[0], 'Could not match ClauseC') + + def ClauseD(self, rule, tokens, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'ClauseD', [rule, tokens]) + _token = self._peek() + if _token == 'STR': + STR = self._scan('STR') + t = (STR, eval(STR,{},{})) + if t not in tokens: tokens.insert( 0, t ) + return parsetree.Terminal(rule, STR) + elif _token == 'ID': + ID = self._scan('ID') + OptParam = self.OptParam(_context) + return resolve_name(rule, tokens, ID, OptParam) + elif _token == 'LP': + LP = self._scan('LP') + ClauseA = self.ClauseA(rule, tokens, _context) + RP = self._scan('RP') + return ClauseA + elif _token == 'LB': + LB = self._scan('LB') + ClauseA = self.ClauseA(rule, tokens, _context) + RB = self._scan('RB') + return parsetree.Option(rule, ClauseA) + elif _token == 'STMT': + STMT = self._scan('STMT') + return parsetree.Eval(rule, STMT[2:-2]) + else: + raise yappsrt.SyntaxError(_token[0], 'Could not match ClauseD') + + def OptParam(self, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'OptParam', []) + _token = self._peek() + if _token == 'ATTR': + ATTR = self._scan('ATTR') + return ATTR[2:-2] + elif _token not in ['"ignore"', '"token"', '"option"', '"parser"', 'COLON']: + return '' + else: + raise yappsrt.SyntaxError(_token[0], 'Could not match OptParam') + + def Str(self, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'Str', []) + STR = self._scan('STR') + return eval(STR,{},{}) + + +def parse(rule, text): + P = ParserDescription(ParserDescriptionScanner(text)) + return yappsrt.wrap_error_reporter(P, rule) + +# End -- grammar generated by Yapps + + diff --git a/parsetree.py b/parsetree.py new file mode 100644 index 0000000..8affd60 --- /dev/null +++ b/parsetree.py @@ -0,0 +1,645 @@ +#!/usr/bin/python2 +# +# parsetree.py, part of Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. Patel +# +# This version of the Yapps 2 Runtime can be distributed under the +# terms of the MIT open source license, either found in the LICENSE file +# included with the Yapps distribution +# or at +# +# + +"""Classes used to represent parse trees and generate output. + +This module defines the Generator class, which drives the generation +of Python output from a grammar parse tree. It also defines nodes +used to represent the parse tree; they are derived from class Node. + +The main logic of Yapps is in this module. +""" + +import sys, re + +###################################################################### +INDENT = ' '*4 +class Generator: + + # TODO: many of the methods here should be class methods, not instance methods + + def __init__(self, name, options, tokens, rules): + self.change_count = 0 + self.name = name + self.options = options + self.preparser = '' + self.postparser = None + + self.tokens = {} # Map from tokens to regexps + self.ignore = [] # List of token names to ignore in parsing + self.terminals = [] # List of token names (to maintain ordering) + for n, t in tokens: + if n == '#ignore': + n = t + self.ignore.append(n) + if n in self.tokens.keys() and self.tokens[n] != t: + print >>sys.stderr, 'Warning: token %s defined more than once.' 
% n + self.tokens[n] = t + self.terminals.append(n) + + self.rules = {} # Map from rule names to parser nodes + self.params = {} # Map from rule names to parameters + self.goals = [] # List of rule names (to maintain ordering) + for n,p,r in rules: + self.params[n] = p + self.rules[n] = r + self.goals.append(n) + + self.output = sys.stdout + + def has_option(self, name): + return self.options.get(name, 0) + + def non_ignored_tokens(self): + return [x for x in self.terminals if x not in self.ignore] + + def changed(self): + """Increments the change count. + + >>> t = Generator('', [], [], []) + >>> old_count = t.change_count + >>> t.changed() + >>> assert t.change_count == old_count + 1 + """ + self.change_count = 1+self.change_count + + def set_subtract(self, a, b): + """Returns the elements of a that are not in b. + + >>> t = Generator('', [], [], []) + >>> t.set_subtract([], []) + [] + >>> t.set_subtract([1, 2], [1, 2]) + [] + >>> t.set_subtract([1, 2, 3], [2]) + [1, 3] + >>> t.set_subtract([1], [2, 3, 4]) + [1] + """ + result = [] + for x in a: + if x not in b: + result.append(x) + return result + + def subset(self, a, b): + """True iff all elements of sequence a are inside sequence b + + >>> t = Generator('', [], [], []) + >>> t.subset([], [1, 2, 3]) + 1 + >>> t.subset([1, 2, 3], []) + 0 + >>> t.subset([1], [1, 2, 3]) + 1 + >>> t.subset([3, 2, 1], [1, 2, 3]) + 1 + >>> t.subset([1, 1, 1], [1, 2, 3]) + 1 + >>> t.subset([1, 2, 3], [1, 1, 1]) + 0 + """ + for x in a: + if x not in b: + return 0 + return 1 + + def equal_set(self, a, b): + """True iff subset(a, b) and subset(b, a) + + >>> t = Generator('', [], [], []) + >>> a_set = [1, 2, 3] + >>> t.equal_set(a_set, a_set) + 1 + >>> t.equal_set(a_set, a_set[:]) + 1 + >>> t.equal_set([], a_set) + 0 + >>> t.equal_set([1, 2, 3], [3, 2, 1]) + 1 + """ + if len(a) != len(b): return 0 + if a == b: return 1 + return self.subset(a, b) and self.subset(b, a) + + def add_to(self, parent, additions): + "Modify _parent_ to include all elements in _additions_" + for x in additions: + if x not in parent: + parent.append(x) + self.changed() + + def equate(self, a, b): + """Extend (a) and (b) so that they contain each others' elements. + + >>> t = Generator('', [], [], []) + >>> a = [1, 2] + >>> b = [2, 3] + >>> t.equate(a, b) + >>> a + [1, 2, 3] + >>> b + [2, 3, 1] + """ + self.add_to(a, b) + self.add_to(b, a) + + def write(self, *args): + for a in args: + self.output.write(a) + + def in_test(self, expr, full, set): + """Generate a test of (expr) being in (set), where (set) is a subset of (full) + + expr is a string (Python expression) + set is a list of values (which will be converted with repr) + full is the list of all values expr could possibly evaluate to + + >>> t = Generator('', [], [], []) + >>> t.in_test('x', [1,2,3,4], []) + '0' + >>> t.in_test('x', [1,2,3,4], [1,2,3,4]) + '1' + >>> t.in_test('x', [1,2,3,4], [1]) + 'x == 1' + >>> t.in_test('a+b', [1,2,3,4], [1,2]) + 'a+b in [1, 2]' + >>> t.in_test('x', [1,2,3,4,5], [1,2,3]) + 'x not in [4, 5]' + >>> t.in_test('x', [1,2,3,4,5], [1,2,3,4]) + 'x != 5' + """ + + if not set: return '0' + if len(set) == 1: return '%s == %s' % (expr, repr(set[0])) + if full and len(set) > len(full)/2: + # Reverse the sense of the test. 
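+            # (The complement set is smaller, so its membership list is shorter.)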
+ not_set = [x for x in full if x not in set] + return self.not_in_test(expr, full, not_set) + return '%s in %s' % (expr, repr(set)) + + def not_in_test(self, expr, full, set): + """Like in_test, but the reverse test.""" + if not set: return '1' + if len(set) == 1: return '%s != %s' % (expr, repr(set[0])) + return '%s not in %s' % (expr, repr(set)) + + def peek_call(self, a): + """Generate a call to scan for a token in the set 'a'""" + assert type(a) == type([]) + a_set = (repr(a)[1:-1]) + if self.equal_set(a, self.non_ignored_tokens()): a_set = '' + if self.has_option('context-insensitive-scanner'): a_set = '' + return 'self._peek(%s)' % a_set + + def peek_test(self, a, b): + """Generate a call to test whether the next token (which could be any of + the elements in a) is in the set b.""" + if self.subset(a, b): return '1' + if self.has_option('context-insensitive-scanner'): a = self.non_ignored_tokens() + return self.in_test(self.peek_call(a), a, b) + + def not_peek_test(self, a, b): + """Like peek_test, but the opposite sense.""" + if self.subset(a, b): return '0' + return self.not_in_test(self.peek_call(a), a, b) + + def calculate(self): + """The main loop to compute the epsilon, first, follow sets. + The loop continues until the sets converge. This works because + each set can only get larger, so when they stop getting larger, + we're done.""" + # First we determine whether a rule accepts epsilon (the empty sequence) + while 1: + for r in self.goals: + self.rules[r].setup(self) + if self.change_count == 0: break + self.change_count = 0 + + # Now we compute the first/follow sets + while 1: + for r in self.goals: + self.rules[r].update(self) + if self.change_count == 0: break + self.change_count = 0 + + def dump_information(self): + """Display the grammar in somewhat human-readable form.""" + self.calculate() + for r in self.goals: + print ' _____' + '_'*len(r) + print ('___/Rule '+r+'\\' + '_'*80)[:79] + queue = [self.rules[r]] + while queue: + top = queue[0] + del queue[0] + + print 'Rule', repr(top), 'of class', top.__class__.__name__ + top.first.sort() + top.follow.sort() + eps = [] + if top.accepts_epsilon: eps = ['(null)'] + print ' FIRST:', ', '.join(top.first+eps) + print ' FOLLOW:', ', '.join(top.follow) + for x in top.get_children(): queue.append(x) + + def generate_output(self): + self.calculate() + self.write(self.preparser) + self.write("# Begin -- grammar generated by Yapps\n") + self.write("import sys, re\n") + self.write("import yappsrt\n") + self.write("\n") + self.write("class ", self.name, "Scanner(yappsrt.Scanner):\n") + self.write(" patterns = [\n") + for p in self.terminals: + self.write(" (%s, re.compile(%s)),\n" % ( + repr(p), repr(self.tokens[p]))) + self.write(" ]\n") + self.write(" def __init__(self, str):\n") + self.write(" yappsrt.Scanner.__init__(self,None,%s,str)\n" % + repr(self.ignore)) + self.write("\n") + + self.write("class ", self.name, "(yappsrt.Parser):\n") + self.write(INDENT, "Context = yappsrt.Context\n") + for r in self.goals: + self.write(INDENT, "def ", r, "(self") + if self.params[r]: self.write(", ", self.params[r]) + self.write(", _parent=None):\n") + self.write(INDENT+INDENT, "_context = self.Context(_parent, self._scanner, self._pos, %s, [%s])\n" % + (repr(r), self.params.get(r, ''))) + self.rules[r].output(self, INDENT+INDENT) + self.write("\n") + + self.write("\n") + self.write("def parse(rule, text):\n") + self.write(" P = ", self.name, "(", self.name, "Scanner(text))\n") + self.write(" return yappsrt.wrap_error_reporter(P, rule)\n") + 
self.write("\n") + if self.postparser is not None: + self.write("# End -- grammar generated by Yapps\n") + self.write(self.postparser) + else: + self.write("if __name__ == '__main__':\n") + self.write(INDENT, "from sys import argv, stdin\n") + self.write(INDENT, "if len(argv) >= 2:\n") + self.write(INDENT*2, "if len(argv) >= 3:\n") + self.write(INDENT*3, "f = open(argv[2],'r')\n") + self.write(INDENT*2, "else:\n") + self.write(INDENT*3, "f = stdin\n") + self.write(INDENT*2, "print parse(argv[1], f.read())\n") + self.write(INDENT, "else: print >>sys.stderr, 'Args: []'\n") + self.write("# End -- grammar generated by Yapps\n") + +###################################################################### +class Node: + """This is the base class for all components of a grammar.""" + def __init__(self, rule): + self.rule = rule # name of the rule containing this node + self.first = [] + self.follow = [] + self.accepts_epsilon = 0 + + def setup(self, gen): + # Setup will change accepts_epsilon, + # sometimes from 0 to 1 but never 1 to 0. + # It will take a finite number of steps to set things up + pass + + def used(self, vars): + "Return two lists: one of vars used, and the other of vars assigned" + return vars, [] + + def get_children(self): + "Return a list of sub-nodes" + return [] + + def __repr__(self): + return str(self) + + def update(self, gen): + if self.accepts_epsilon: + gen.add_to(self.first, self.follow) + + def output(self, gen, indent): + "Write out code to _gen_ with _indent_:string indentation" + gen.write(indent, "assert 0 # Invalid parser node\n") + +class Terminal(Node): + """This class stores terminal nodes, which are tokens.""" + def __init__(self, rule, token): + Node.__init__(self, rule) + self.token = token + self.accepts_epsilon = 0 + + def __str__(self): + return self.token + + def update(self, gen): + Node.update(self, gen) + if self.first != [self.token]: + self.first = [self.token] + gen.changed() + + def output(self, gen, indent): + gen.write(indent) + if re.match('[a-zA-Z_][a-zA-Z_0-9]*$', self.token): + gen.write(self.token, " = ") + gen.write("self._scan(%s)\n" % repr(self.token)) + +class Eval(Node): + """This class stores evaluation nodes, from {{ ... 
}} clauses.""" + def __init__(self, rule, expr): + Node.__init__(self, rule) + self.expr = expr + + def setup(self, gen): + Node.setup(self, gen) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '{{ %s }}' % self.expr.strip() + + def output(self, gen, indent): + gen.write(indent, self.expr.strip(), '\n') + +class NonTerminal(Node): + """This class stores nonterminal nodes, which are rules with arguments.""" + def __init__(self, rule, name, args): + Node.__init__(self, rule) + self.name = name + self.args = args + + def setup(self, gen): + Node.setup(self, gen) + try: + self.target = gen.rules[self.name] + if self.accepts_epsilon != self.target.accepts_epsilon: + self.accepts_epsilon = self.target.accepts_epsilon + gen.changed() + except KeyError: # Oops, it's nonexistent + print >>sys.stderr, 'Error: no rule <%s>' % self.name + self.target = self + + def __str__(self): + return '%s' % self.name + + def update(self, gen): + Node.update(self, gen) + gen.equate(self.first, self.target.first) + gen.equate(self.follow, self.target.follow) + + def output(self, gen, indent): + gen.write(indent) + gen.write(self.name, " = ") + args = self.args + if args: args += ', ' + args += '_context' + gen.write("self.", self.name, "(", args, ")\n") + +class Sequence(Node): + """This class stores a sequence of nodes (A B C ...)""" + def __init__(self, rule, *children): + Node.__init__(self, rule) + self.children = children + + def setup(self, gen): + Node.setup(self, gen) + for c in self.children: c.setup(gen) + + if not self.accepts_epsilon: + # If it's not already accepting epsilon, it might now do so. + for c in self.children: + # any non-epsilon means all is non-epsilon + if not c.accepts_epsilon: break + else: + self.accepts_epsilon = 1 + gen.changed() + + def get_children(self): + return self.children + + def __str__(self): + return '( %s )' % ' '.join(map(str, self.children)) + + def update(self, gen): + Node.update(self, gen) + for g in self.children: + g.update(gen) + + empty = 1 + for g_i in range(len(self.children)): + g = self.children[g_i] + + if empty: gen.add_to(self.first, g.first) + if not g.accepts_epsilon: empty = 0 + + if g_i == len(self.children)-1: + next = self.follow + else: + next = self.children[1+g_i].first + gen.add_to(g.follow, next) + + if self.children: + gen.add_to(self.follow, self.children[-1].follow) + + def output(self, gen, indent): + if self.children: + for c in self.children: + c.output(gen, indent) + else: + # Placeholder for empty sequences, just in case + gen.write(indent, 'pass\n') + +class Choice(Node): + """This class stores a choice between nodes (A | B | C | ...)""" + def __init__(self, rule, *children): + Node.__init__(self, rule) + self.children = children + + def setup(self, gen): + Node.setup(self, gen) + for c in self.children: c.setup(gen) + + if not self.accepts_epsilon: + for c in self.children: + if c.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def get_children(self): + return self.children + + def __str__(self): + return '( %s )' % ' | '.join(map(str, self.children)) + + def update(self, gen): + Node.update(self, gen) + for g in self.children: + g.update(gen) + + for g in self.children: + gen.add_to(self.first, g.first) + gen.add_to(self.follow, g.follow) + for g in self.children: + gen.add_to(g.follow, self.follow) + if self.accepts_epsilon: + gen.add_to(self.first, self.follow) + + def output(self, gen, indent): + test = "if" + gen.write(indent, "_token = ", 
gen.peek_call(self.first), "\n") + tokens_seen = [] + tokens_unseen = self.first[:] + if gen.has_option('context-insensitive-scanner'): + # Context insensitive scanners can return ANY token, + # not only the ones in first. + tokens_unseen = gen.non_ignored_tokens() + for c in self.children: + testset = c.first[:] + removed = [] + for x in testset: + if x in tokens_seen: + testset.remove(x) + removed.append(x) + if x in tokens_unseen: tokens_unseen.remove(x) + tokens_seen = tokens_seen + testset + if removed: + if not testset: + print >>sys.stderr, 'Error in rule', self.rule+':' + else: + print >>sys.stderr, 'Warning in rule', self.rule+':' + print >>sys.stderr, ' *', self + print >>sys.stderr, ' * These tokens could be matched by more than one clause:' + print >>sys.stderr, ' *', ' '.join(removed) + + if testset: + if not tokens_unseen: # context sensitive scanners only! + if test == 'if': + # if it's the first AND last test, then + # we can simply put the code without an if/else + c.output(gen, indent) + else: + gen.write(indent, "else:") + t = gen.in_test('', [], testset) + if len(t) < 70-len(indent): + gen.write(' #', t) + gen.write("\n") + c.output(gen, indent+INDENT) + else: + gen.write(indent, test, " ", + gen.in_test('_token', tokens_unseen, testset), + ":\n") + c.output(gen, indent+INDENT) + test = "elif" + + if tokens_unseen: + gen.write(indent, "else:\n") + gen.write(indent, INDENT, "raise yappsrt.SyntaxError(_token[0], ") + gen.write("'Could not match ", self.rule, "')\n") + +class Wrapper(Node): + """This is a base class for nodes that modify a single child.""" + def __init__(self, rule, child): + Node.__init__(self, rule) + self.child = child + + def setup(self, gen): + Node.setup(self, gen) + self.child.setup(gen) + + def get_children(self): + return [self.child] + + def update(self, gen): + Node.update(self, gen) + self.child.update(gen) + gen.add_to(self.first, self.child.first) + gen.equate(self.follow, self.child.follow) + +class Option(Wrapper): + """This class represents an optional clause of the form [A]""" + def setup(self, gen): + Wrapper.setup(self, gen) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '[ %s ]' % str(self.child) + + def output(self, gen, indent): + if self.child.accepts_epsilon: + print >>sys.stderr, 'Warning in rule', self.rule+': contents may be empty.' + gen.write(indent, "if %s:\n" % + gen.peek_test(self.first, self.child.first)) + self.child.output(gen, indent+INDENT) + +class Plus(Wrapper): + """This class represents a 1-or-more repetition clause of the form A+""" + def setup(self, gen): + Wrapper.setup(self, gen) + if self.accepts_epsilon != self.child.accepts_epsilon: + self.accepts_epsilon = self.child.accepts_epsilon + gen.changed() + + def __str__(self): + return '%s+' % str(self.child) + + def update(self, gen): + Wrapper.update(self, gen) + gen.add_to(self.child.follow, self.child.first) + + def output(self, gen, indent): + if self.child.accepts_epsilon: + print >>sys.stderr, 'Warning in rule', self.rule+':' + print >>sys.stderr, ' * The repeated pattern could be empty. The resulting parser may not work properly.' 
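+        # Emit "while 1: <child>; break when the lookahead can no longer
+        # start another repetition" -- so the child runs at least once.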
+ gen.write(indent, "while 1:\n") + self.child.output(gen, indent+INDENT) + union = self.first[:] + gen.add_to(union, self.follow) + gen.write(indent+INDENT, "if %s: break\n" % + gen.not_peek_test(union, self.child.first)) + +class Star(Wrapper): + """This class represents a 0-or-more repetition clause of the form A*""" + def setup(self, gen): + Wrapper.setup(self, gen) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '%s*' % str(self.child) + + def update(self, gen): + Wrapper.update(self, gen) + gen.add_to(self.child.follow, self.child.first) + + def output(self, gen, indent): + if self.child.accepts_epsilon: + print >>sys.stderr, 'Warning in rule', self.rule+':' + print >>sys.stderr, ' * The repeated pattern could be empty. The resulting parser probably will not work properly.' + gen.write(indent, "while %s:\n" % + gen.peek_test(self.follow, self.child.first)) + self.child.output(gen, indent+INDENT) + + # TODO: need to generate tests like this in lots of rules + # TODO: do we need to do this only when it's a context-insensitive scanner? + gen.write(indent, "if %s:\n" % + gen.not_peek_test(gen.non_ignored_tokens(), self.follow)) + gen.write(indent+INDENT, "raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(%s))\n" % + repr(self.first)) + diff --git a/test/empty_clauses.g b/test/empty_clauses.g new file mode 100644 index 0000000..3b435f2 --- /dev/null +++ b/test/empty_clauses.g @@ -0,0 +1,10 @@ +# This parser tests the use of OR clauses with one of them being empty +# +# The output of --dump should indicate the FOLLOW set for (A | ) is 'c'. + +parser Test: + rule TestPlus: ( A | ) 'c' + rule A: 'a'+ + + rule TestStar: ( B | ) 'c' + rule B: 'b'* \ No newline at end of file diff --git a/test/line_numbers.g b/test/line_numbers.g new file mode 100644 index 0000000..19eee9a --- /dev/null +++ b/test/line_numbers.g @@ -0,0 +1,10 @@ +# +# The error messages produced by Yapps have a line number. +# The line number should take the Python code section into account. + +# The line number should be 10. + +%% + +parser error_1: + this_is_an_error; diff --git a/test/option.g b/test/option.g new file mode 100644 index 0000000..5980362 --- /dev/null +++ b/test/option.g @@ -0,0 +1,17 @@ + +%% + +parser test_option: + ignore: r'\s+' + token a: 'a' + token b: 'b' + token EOF: r'$' + + rule test_brackets: a [b] EOF + + rule test_question_mark: a b? EOF + +%% + +# The generated code for test_brackets and test_question_mark should +# be the same. diff --git a/yapps2.py b/yapps2.py new file mode 100755 index 0000000..6c4db20 --- /dev/null +++ b/yapps2.py @@ -0,0 +1,111 @@ +#!/usr/bin/python2 + +# +# Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. 
Patel +# +# This version of Yapps 2 can be distributed under the +# terms of the MIT open source license, either found in the LICENSE file +# included with the Yapps distribution +# or at +# +# + +import sys, re + +import yappsrt, parsetree + +def generate(inputfilename, outputfilename='', dump=0, **flags): + """Generate a grammar, given an input filename (X.g) + and an output filename (defaulting to X.py).""" + + if not outputfilename: + if inputfilename.endswith('.g'): + outputfilename = inputfilename[:-2] + '.py' + else: + raise Exception('Must specify output filename if input filename is not *.g') + + DIVIDER = '\n%%\n' # This pattern separates the pre/post parsers + preparser, postparser = None, None # Code before and after the parser desc + + # Read the entire file + s = open(inputfilename,'r').read() + + # See if there's a separation between the pre-parser and parser + f = s.find(DIVIDER) + if f >= 0: preparser, s = s[:f]+'\n\n', s[f+len(DIVIDER):] + + # See if there's a separation between the parser and post-parser + f = s.find(DIVIDER) + if f >= 0: s, postparser = s[:f], '\n\n'+s[f+len(DIVIDER):] + + # Create the parser and scanner and parse the text + scanner = grammar.ParserDescriptionScanner(s) + if preparser: scanner.first_line_number = 1 + preparser.count('\n') + parser = grammar.ParserDescription(scanner) + t = yappsrt.wrap_error_reporter(parser, 'Parser') + if t is None: return # Failure + if preparser is not None: t.preparser = preparser + if postparser is not None: t.postparser = postparser + + # Check the options + for f in t.options.keys(): + for opt,_,_ in yapps_options: + if f == opt: break + else: + print >>sys.stderr, 'Warning: unrecognized option', f + # Add command line options to the set + for f in flags.keys(): t.options[f] = flags[f] + + # Generate the output + if dump: + t.dump_information() + else: + t.output = open(outputfilename, 'w') + t.generate_output() + +if __name__ == '__main__': + import doctest + doctest.testmod(sys.modules['__main__']) + doctest.testmod(parsetree) + + # Someday I will use optparse, but Python 2.3 is too new at the moment. 
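+    # Each tuple is (name accepted by the -f flag, option key stored on
+    # the grammar, help text shown in the usage message).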
+ yapps_options = [ + ('context-insensitive-scanner', + 'context-insensitive-scanner', + 'Scan all tokens (see docs)'), + ] + + import getopt + optlist, args = getopt.getopt(sys.argv[1:], 'f:', ['help', 'dump', 'use-devel-grammar']) + if not args or len(args) > 2: + print >>sys.stderr, 'Usage:' + print >>sys.stderr, ' python', sys.argv[0], '[flags] input.g [output.py]' + print >>sys.stderr, 'Flags:' + print >>sys.stderr, (' --dump' + ' '*40)[:35] + 'Dump out grammar information' + print >>sys.stderr, (' --use-devel-grammar' + ' '*40)[:35] + 'Use the devel grammar parser from yapps_grammar.py instead of the stable grammar from grammar.py' + for flag, _, doc in yapps_options: + print >>sys.stderr, (' -f' + flag + ' '*40)[:35] + doc + else: + # Read in the options and create a list of flags + flags = {} + use_devel_grammar = 0 + for opt in optlist: + for flag, name, _ in yapps_options: + if opt == ('-f', flag): + flags[name] = 1 + break + else: + if opt == ('--dump', ''): + flags['dump'] = 1 + elif opt == ('--use-devel-grammar', ''): + use_devel_grammar = 1 + else: + print >>sys.stderr, 'Warning: unrecognized option', opt[0], opt[1] + + if use_devel_grammar: + import yapps_grammar as grammar + else: + import grammar + + generate(*tuple(args), **flags) diff --git a/yapps_grammar.g b/yapps_grammar.g new file mode 100755 index 0000000..d60ffe0 --- /dev/null +++ b/yapps_grammar.g @@ -0,0 +1,126 @@ +#!/usr/bin/python2 +# +# grammar.py, part of Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. Patel +# +# This version of the Yapps 2 grammar can be distributed under the +# terms of the MIT open source license, either found in the LICENSE +# file included with the Yapps distribution +# or at +# +# + +"""Parser for Yapps grammars. + +This file defines the grammar of Yapps grammars. Naturally, it is +implemented in Yapps. The grammar.py module needed by Yapps is built +by running Yapps on yapps_grammar.g. (Holy circularity, Batman!) + +""" + +import sys, re +import parsetree + +###################################################################### +def cleanup_choice(rule, lst): + if len(lst) == 0: return Sequence(rule, []) + if len(lst) == 1: return lst[0] + return parsetree.Choice(rule, *tuple(lst)) + +def cleanup_sequence(rule, lst): + if len(lst) == 1: return lst[0] + return parsetree.Sequence(rule, *tuple(lst)) + +def resolve_name(rule, tokens, id, args): + if id in [x[0] for x in tokens]: + # It's a token + if args: + print 'Warning: ignoring parameters on TOKEN %s<<%s>>' % (id, args) + return parsetree.Terminal(rule, id) + else: + # It's a name, so assume it's a nonterminal + return parsetree.NonTerminal(rule, id, args) + +%% +parser ParserDescription: + option: "context-insensitive-scanner" + + ignore: "[ \t\r\n]+" + ignore: "#.*?\r?\n" + token EOF: "$" + token ATTR: "<<.+?>>" + token STMT: "{{.+?}}" + token ID: '[a-zA-Z_][a-zA-Z_0-9]*' + token STR: '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"' + token LP: '\\(' + token RP: '\\)' + token LB: '\\[' + token RB: '\\]' + token OR: '[|]' + token STAR: '[*]' + token PLUS: '[+]' + token QUEST: '[?]' + token COLON: ':' + + rule LINENO: # This is a pseudotoken. 
It matches nothing; returns the line number + {{ return 1 + self._scanner.get_input_scanned().count('\n') }} + + rule Parser: "parser" ID ":" + Options + Tokens + Rules<> + EOF + {{ return parsetree.Generator(ID,Options,Tokens,Rules) }} + + rule Options: {{ opt = {} }} + ( "option" ":" Str {{ opt[Str] = 1 }} )* + {{ return opt }} + + rule Tokens: {{ tok = [] }} + ( + "token" ID ":" Str {{ tok.append( (ID,Str) ) }} + | "ignore" ":" Str {{ tok.append( ('#ignore',Str) ) }} + )* + {{ return tok }} + + rule Rules<>: + {{ rul = [] }} + ( LINENO + "rule" ID OptParam ":" ClauseA<> + # TODO: save LINENO somewhere? + {{ rul.append( (ID, OptParam, ClauseA) ) }} + )* + {{ return rul }} + + rule ClauseA<>: + ClauseB<> + {{ v = [ClauseB] }} + ( OR ClauseB<> {{ v.append(ClauseB) }} )* + {{ return cleanup_choice(rule, v) }} + + rule ClauseB<>: + {{ v = [] }} + ( ClauseC<> {{ v.append(ClauseC) }} )* + {{ return cleanup_sequence(rule, v) }} + + rule ClauseC<>: + ClauseD<> + ( PLUS {{ return parsetree.Plus(rule, ClauseD) }} + | STAR {{ return parsetree.Star(rule, ClauseD) }} + | QUEST {{ return parsetree.Option(rule, ClauseD) }} + | {{ return ClauseD }} ) + + rule ClauseD<>: + STR {{ t = (STR, eval(STR,{},{})) }} + {{ if t not in tokens: tokens.insert( 0, t ) }} + {{ return parsetree.Terminal(rule, STR) }} + | ID OptParam {{ return resolve_name(rule, tokens, ID, OptParam) }} + | LP ClauseA<> RP {{ return ClauseA }} + | LB ClauseA<> RB {{ return parsetree.Option(rule, ClauseA) }} + | STMT {{ return parsetree.Eval(rule, STMT[2:-2]) }} + + rule OptParam: + ATTR {{ return ATTR[2:-2] }} + | {{ return '' }} + rule Str: STR {{ return eval(STR,{},{}) }} +%% diff --git a/yapps_grammar.py b/yapps_grammar.py new file mode 100644 index 0000000..3bd2a86 --- /dev/null +++ b/yapps_grammar.py @@ -0,0 +1,234 @@ +#!/usr/bin/python2 +# +# grammar.py, part of Yapps 2 - yet another python parser system +# Copyright 1999-2003 by Amit J. Patel +# +# This version of the Yapps 2 grammar can be distributed under the +# terms of the MIT open source license, either found in the LICENSE +# file included with the Yapps distribution +# or at +# +# + +"""Parser for Yapps grammars. + +This file defines the grammar of Yapps grammars. Naturally, it is +implemented in Yapps. The grammar.py module needed by Yapps is built +by running Yapps on yapps_grammar.g. (Holy circularity, Batman!) 
+ +""" + +import sys, re +import parsetree + +###################################################################### +def cleanup_choice(rule, lst): + if len(lst) == 0: return Sequence(rule, []) + if len(lst) == 1: return lst[0] + return parsetree.Choice(rule, *tuple(lst)) + +def cleanup_sequence(rule, lst): + if len(lst) == 1: return lst[0] + return parsetree.Sequence(rule, *tuple(lst)) + +def resolve_name(rule, tokens, id, args): + if id in [x[0] for x in tokens]: + # It's a token + if args: + print 'Warning: ignoring parameters on TOKEN %s<<%s>>' % (id, args) + return parsetree.Terminal(rule, id) + else: + # It's a name, so assume it's a nonterminal + return parsetree.NonTerminal(rule, id, args) + + +# Begin -- grammar generated by Yapps +import sys, re +import yappsrt + +class ParserDescriptionScanner(yappsrt.Scanner): + patterns = [ + ('"rule"', re.compile('rule')), + ('"ignore"', re.compile('ignore')), + ('"token"', re.compile('token')), + ('"option"', re.compile('option')), + ('":"', re.compile(':')), + ('"parser"', re.compile('parser')), + ('[ \t\r\n]+', re.compile('[ \t\r\n]+')), + ('#.*?\r?\n', re.compile('#.*?\r?\n')), + ('EOF', re.compile('$')), + ('ATTR', re.compile('<<.+?>>')), + ('STMT', re.compile('{{.+?}}')), + ('ID', re.compile('[a-zA-Z_][a-zA-Z_0-9]*')), + ('STR', re.compile('[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"')), + ('LP', re.compile('\\(')), + ('RP', re.compile('\\)')), + ('LB', re.compile('\\[')), + ('RB', re.compile('\\]')), + ('OR', re.compile('[|]')), + ('STAR', re.compile('[*]')), + ('PLUS', re.compile('[+]')), + ('QUEST', re.compile('[?]')), + ('COLON', re.compile(':')), + ] + def __init__(self, str): + yappsrt.Scanner.__init__(self,None,['[ \t\r\n]+', '#.*?\r?\n'],str) + +class ParserDescription(yappsrt.Parser): + Context = yappsrt.Context + def LINENO(self, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'LINENO', []) + return 1 + self._scanner.get_input_scanned().count('\n') + + def Parser(self, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'Parser', []) + self._scan('"parser"') + ID = self._scan('ID') + self._scan('":"') + Options = self.Options(_context) + Tokens = self.Tokens(_context) + Rules = self.Rules(Tokens, _context) + EOF = self._scan('EOF') + return parsetree.Generator(ID,Options,Tokens,Rules) + + def Options(self, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'Options', []) + opt = {} + while self._peek() == '"option"': + self._scan('"option"') + self._scan('":"') + Str = self.Str(_context) + opt[Str] = 1 + if self._peek() not in ['"option"', '"token"', '"ignore"', 'EOF', '"rule"']: + raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['"option"', '"token"', '"ignore"', 'EOF', '"rule"'])) + return opt + + def Tokens(self, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'Tokens', []) + tok = [] + while self._peek() in ['"token"', '"ignore"']: + _token = self._peek() + if _token == '"token"': + self._scan('"token"') + ID = self._scan('ID') + self._scan('":"') + Str = self.Str(_context) + tok.append( (ID,Str) ) + elif _token == '"ignore"': + self._scan('"ignore"') + self._scan('":"') + Str = self.Str(_context) + tok.append( ('#ignore',Str) ) + else: + raise yappsrt.SyntaxError(_token[0], 'Could not match Tokens') + if self._peek() not in ['"token"', '"ignore"', 'EOF', '"rule"']: + raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), 
context=_context, msg='Need one of ' + ', '.join(['"token"', '"ignore"', 'EOF', '"rule"'])) + return tok + + def Rules(self, tokens, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'Rules', [tokens]) + rul = [] + while self._peek() == '"rule"': + LINENO = self.LINENO(_context) + self._scan('"rule"') + ID = self._scan('ID') + OptParam = self.OptParam(_context) + self._scan('":"') + ClauseA = self.ClauseA(ID, tokens, _context) + rul.append( (ID, OptParam, ClauseA) ) + if self._peek() not in ['"rule"', 'EOF']: + raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['"rule"', 'EOF'])) + return rul + + def ClauseA(self, rule, tokens, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'ClauseA', [rule, tokens]) + ClauseB = self.ClauseB(rule, tokens, _context) + v = [ClauseB] + while self._peek() == 'OR': + OR = self._scan('OR') + ClauseB = self.ClauseB(rule, tokens, _context) + v.append(ClauseB) + if self._peek() not in ['OR', 'RP', 'RB', '"rule"', 'EOF']: + raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['OR', 'RP', 'RB', '"rule"', 'EOF'])) + return cleanup_choice(rule, v) + + def ClauseB(self, rule, tokens, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'ClauseB', [rule, tokens]) + v = [] + while self._peek() in ['STR', 'ID', 'LP', 'LB', 'STMT']: + ClauseC = self.ClauseC(rule, tokens, _context) + v.append(ClauseC) + if self._peek() not in ['STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'EOF']: + raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'EOF'])) + return cleanup_sequence(rule, v) + + def ClauseC(self, rule, tokens, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'ClauseC', [rule, tokens]) + ClauseD = self.ClauseD(rule, tokens, _context) + _token = self._peek() + if _token == 'PLUS': + PLUS = self._scan('PLUS') + return parsetree.Plus(rule, ClauseD) + elif _token == 'STAR': + STAR = self._scan('STAR') + return parsetree.Star(rule, ClauseD) + elif _token == 'QUEST': + QUEST = self._scan('QUEST') + return parsetree.Option(rule, ClauseD) + elif _token not in ['"ignore"', '"token"', '"option"', '":"', '"parser"', 'ATTR', 'COLON']: + return ClauseD + else: + raise yappsrt.SyntaxError(_token[0], 'Could not match ClauseC') + + def ClauseD(self, rule, tokens, _parent=None): + _context = self.Context(_parent, self._scanner, self._pos, 'ClauseD', [rule, tokens]) + _token = self._peek() + if _token == 'STR': + STR = self._scan('STR') + t = (STR, eval(STR,{},{})) + if t not in tokens: tokens.insert( 0, t ) + return parsetree.Terminal(rule, STR) + elif _token == 'ID': + ID = self._scan('ID') + OptParam = self.OptParam(_context) + return resolve_name(rule, tokens, ID, OptParam) + elif _token == 'LP': + LP = self._scan('LP') + ClauseA = self.ClauseA(rule, tokens, _context) + RP = self._scan('RP') + return ClauseA + elif _token == 'LB': + LB = self._scan('LB') + ClauseA = self.ClauseA(rule, tokens, _context) + RB = self._scan('RB') + return parsetree.Option(rule, ClauseA) + elif _token == 'STMT': + STMT = self._scan('STMT') + return parsetree.Eval(rule, STMT[2:-2]) + else: + raise yappsrt.SyntaxError(_token[0], 'Could not match ClauseD') + + def OptParam(self, _parent=None): + _context = self.Context(_parent, 
+
+class ParserDescription(yappsrt.Parser):
+    Context = yappsrt.Context
+    def LINENO(self, _parent=None):
+        _context = self.Context(_parent, self._scanner, self._pos, 'LINENO', [])
+        return 1 + self._scanner.get_input_scanned().count('\n')
+
+    def Parser(self, _parent=None):
+        _context = self.Context(_parent, self._scanner, self._pos, 'Parser', [])
+        self._scan('"parser"')
+        ID = self._scan('ID')
+        self._scan('":"')
+        Options = self.Options(_context)
+        Tokens = self.Tokens(_context)
+        Rules = self.Rules(Tokens, _context)
+        EOF = self._scan('EOF')
+        return parsetree.Generator(ID,Options,Tokens,Rules)
+
+    def Options(self, _parent=None):
+        _context = self.Context(_parent, self._scanner, self._pos, 'Options', [])
+        opt = {}
+        while self._peek() == '"option"':
+            self._scan('"option"')
+            self._scan('":"')
+            Str = self.Str(_context)
+            opt[Str] = 1
+            if self._peek() not in ['"option"', '"token"', '"ignore"', 'EOF', '"rule"']:
+                raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['"option"', '"token"', '"ignore"', 'EOF', '"rule"']))
+        return opt
+
+    def Tokens(self, _parent=None):
+        _context = self.Context(_parent, self._scanner, self._pos, 'Tokens', [])
+        tok = []
+        while self._peek() in ['"token"', '"ignore"']:
+            _token = self._peek()
+            if _token == '"token"':
+                self._scan('"token"')
+                ID = self._scan('ID')
+                self._scan('":"')
+                Str = self.Str(_context)
+                tok.append( (ID,Str) )
+            elif _token == '"ignore"':
+                self._scan('"ignore"')
+                self._scan('":"')
+                Str = self.Str(_context)
+                tok.append( ('#ignore',Str) )
+            else:
+                raise yappsrt.SyntaxError(_token[0], 'Could not match Tokens')
+            if self._peek() not in ['"token"', '"ignore"', 'EOF', '"rule"']:
+                raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['"token"', '"ignore"', 'EOF', '"rule"']))
+        return tok
+
+    def Rules(self, tokens, _parent=None):
+        _context = self.Context(_parent, self._scanner, self._pos, 'Rules', [tokens])
+        rul = []
+        while self._peek() == '"rule"':
+            LINENO = self.LINENO(_context)
+            self._scan('"rule"')
+            ID = self._scan('ID')
+            OptParam = self.OptParam(_context)
+            self._scan('":"')
+            ClauseA = self.ClauseA(ID, tokens, _context)
+            rul.append( (ID, OptParam, ClauseA) )
+            if self._peek() not in ['"rule"', 'EOF']:
+                raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['"rule"', 'EOF']))
+        return rul
+
+    def ClauseA(self, rule, tokens, _parent=None):
+        _context = self.Context(_parent, self._scanner, self._pos, 'ClauseA', [rule, tokens])
+        ClauseB = self.ClauseB(rule, tokens, _context)
+        v = [ClauseB]
+        while self._peek() == 'OR':
+            OR = self._scan('OR')
+            ClauseB = self.ClauseB(rule, tokens, _context)
+            v.append(ClauseB)
+            if self._peek() not in ['OR', 'RP', 'RB', '"rule"', 'EOF']:
+                raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['OR', 'RP', 'RB', '"rule"', 'EOF']))
+        return cleanup_choice(rule, v)
+
+    def ClauseB(self, rule, tokens, _parent=None):
+        _context = self.Context(_parent, self._scanner, self._pos, 'ClauseB', [rule, tokens])
+        v = []
+        while self._peek() in ['STR', 'ID', 'LP', 'LB', 'STMT']:
+            ClauseC = self.ClauseC(rule, tokens, _context)
+            v.append(ClauseC)
+            if self._peek() not in ['STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'EOF']:
+                raise yappsrt.SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'EOF']))
+        return cleanup_sequence(rule, v)
+
+    def ClauseC(self, rule, tokens, _parent=None):
+        _context = self.Context(_parent, self._scanner, self._pos, 'ClauseC', [rule, tokens])
+        ClauseD = self.ClauseD(rule, tokens, _context)
+        _token = self._peek()
+        if _token == 'PLUS':
+            PLUS = self._scan('PLUS')
+            return parsetree.Plus(rule, ClauseD)
+        elif _token == 'STAR':
+            STAR = self._scan('STAR')
+            return parsetree.Star(rule, ClauseD)
+        elif _token == 'QUEST':
+            QUEST = self._scan('QUEST')
+            return parsetree.Option(rule, ClauseD)
+        elif _token not in ['"ignore"', '"token"', '"option"', '":"', '"parser"', 'ATTR', 'COLON']:
+            return ClauseD
+        else:
+            raise yappsrt.SyntaxError(_token[0], 'Could not match ClauseC')
+
+    def ClauseD(self, rule, tokens, _parent=None):
+        _context = self.Context(_parent, self._scanner, self._pos, 'ClauseD', [rule, tokens])
+        _token = self._peek()
+        if _token == 'STR':
+            STR = self._scan('STR')
+            t = (STR, eval(STR,{},{}))
+            if t not in tokens: tokens.insert( 0, t )
+            return parsetree.Terminal(rule, STR)
+        elif _token == 'ID':
+            ID = self._scan('ID')
+            OptParam = self.OptParam(_context)
+            return resolve_name(rule, tokens, ID, OptParam)
+        elif _token == 'LP':
+            LP = self._scan('LP')
+            ClauseA = self.ClauseA(rule, tokens, _context)
+            RP = self._scan('RP')
+            return ClauseA
+        elif _token == 'LB':
+            LB = self._scan('LB')
+            ClauseA = self.ClauseA(rule, tokens, _context)
+            RB = self._scan('RB')
+            return parsetree.Option(rule, ClauseA)
+        elif _token == 'STMT':
+            STMT = self._scan('STMT')
+            return parsetree.Eval(rule, STMT[2:-2])
+        else:
+            raise yappsrt.SyntaxError(_token[0], 'Could not match ClauseD')
+
+    def OptParam(self, _parent=None):
+        _context = self.Context(_parent, self._scanner, self._pos, 'OptParam', [])
+        _token = self._peek()
+        if _token == 'ATTR':
+            ATTR = self._scan('ATTR')
+            return ATTR[2:-2]
+        elif _token not in ['"ignore"', '"token"', '"option"', '"parser"', 'COLON']:
+            return ''
+        else:
+            raise yappsrt.SyntaxError(_token[0], 'Could not match OptParam')
+
+    def Str(self, _parent=None):
+        _context = self.Context(_parent, self._scanner, self._pos, 'Str', [])
+        STR = self._scan('STR')
+        return eval(STR,{},{})
+
+
+def parse(rule, text):
+    P = ParserDescription(ParserDescriptionScanner(text))
+    return yappsrt.wrap_error_reporter(P, rule)
+
+# End -- grammar generated by Yapps
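+
+# Example of feeding a grammar description to the generated parser
+# (sketch; the grammar text is hypothetical):
+#
+#   g = parse('Parser', '''
+#   parser Calc:
+#       token NUM: "[0-9]+"
+#       rule goal: NUM {{ return int(NUM) }}
+#   ''')
+#   # On success g is a parsetree.Generator; on a syntax error the
+#   # diagnostics go to stderr and g is None.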
+
+
diff --git a/yappsrt.py b/yappsrt.py
new file mode 100644
index 0000000..7485ca7
--- /dev/null
+++ b/yappsrt.py
@@ -0,0 +1,304 @@
+#
+# Yapps 2 Runtime, part of Yapps 2 - yet another python parser system
+# Copyright 1999-2003 by Amit J. Patel
+#
+# This version of the Yapps 2 Runtime can be distributed under the
+# terms of the MIT open source license, either found in the LICENSE file
+# included with the Yapps distribution
+# <http://theory.stanford.edu/~amitp/yapps/> or at
+# <http://www.opensource.org/licenses/mit-license.php>
+#
+
+"""Run time libraries needed to run parsers generated by Yapps.
+
+This module defines parse-time exception classes, a scanner class, a
+base class for parsers produced by Yapps, and a context class that
+keeps track of the parse stack.
+
+"""
+
+# TODO: it should be possible to embed yappsrt into the generated
+# grammar to make a standalone module.
+
+import sys, re
+
+class SyntaxError(Exception):
+    """When we run into an unexpected token, this is the exception to use"""
+    def __init__(self, charpos=-1, msg="Bad Token", context=None):
+        Exception.__init__(self)
+        self.charpos = charpos
+        self.msg = msg
+        self.context = context
+
+    def __str__(self):
+        if self.charpos < 0: return 'SyntaxError'
+        else: return 'SyntaxError@char%s(%s)' % (repr(self.charpos), self.msg)
+
+class NoMoreTokens(Exception):
+    """Another exception object, for when we run out of tokens"""
+    pass
+
+class Scanner:
+    """Yapps scanner.
+
+    The Yapps scanner can work in context sensitive or context
+    insensitive modes. The token(i) method is used to retrieve the
+    i-th token. It takes a restrict set that limits the set of tokens
+    it is allowed to return. In context sensitive mode, this restrict
+    set guides the scanner. In context insensitive mode, there is no
+    restriction (the set is always the full set of tokens).
+
+    """
+
+    def __init__(self, patterns, ignore, input):
+        """Initialize the scanner.
+
+        Parameters:
+          patterns : [(terminal, uncompiled regex), ...] or None
+          ignore : [terminal,...]
+          input : string
+
+        If patterns is None, we assume that the subclass has
+        defined self.patterns : [(terminal, compiled regex), ...].
+        Note that the patterns parameter expects uncompiled regexes,
+        whereas the self.patterns field expects compiled regexes.
+        """
+        self.tokens = []  # [(begin char pos, end char pos, token name, matched text), ...]
+        self.restrictions = []
+        self.input = input
+        self.pos = 0
+        self.ignore = ignore
+        self.first_line_number = 1
+
+        if patterns is not None:
+            # Compile the regex strings into regex objects
+            self.patterns = []
+            for terminal, regex in patterns:
+                self.patterns.append( (terminal, re.compile(regex)) )
+
+    def get_token_pos(self):
+        """Get the current token position in the input text."""
+        return len(self.tokens)
+
+    def get_char_pos(self):
+        """Get the current char position in the input text."""
+        return self.pos
+
+    def get_prev_char_pos(self, i=None):
+        """Get the previous position (one token back) in the input text."""
+        if self.pos == 0: return 0
+        if i is None: i = -1
+        return self.tokens[i][0]
+
+    def get_line_number(self):
+        """Get the line number of the current position in the input text."""
+        # TODO: make this work at any token/char position
+        return self.first_line_number + self.get_input_scanned().count('\n')
+
+    def get_column_number(self):
+        """Get the column number of the current position in the input text."""
+        s = self.get_input_scanned()
+        i = s.rfind('\n') # may be -1, but that's okay in this case
+        return len(s) - (i+1)
+
+    def get_input_scanned(self):
+        """Get the portion of the input that has been tokenized."""
+        return self.input[:self.pos]
+
+    def get_input_unscanned(self):
+        """Get the portion of the input that has not yet been tokenized."""
+        return self.input[self.pos:]
+
+    def token(self, i, restrict=None):
+        """Get the i'th token in the input.
+
+        If i is one past the end, then scan for another token.
+
+        Args:
+
+        i : int; token index.  May be at most one past the end of the
+        tokens scanned so far; anything beyond that raises NoMoreTokens.
+
+        restrict : [token, ...] or None; if restrict is None, then any
+        token is allowed. You may call token(i) more than once.
+        However, the restrict set may never be larger than what was
+        passed in on the first call to token(i).
+
+        """
+        if i == len(self.tokens):
+            self.scan(restrict)
+        if i < len(self.tokens):
+            # Make sure the restriction is more restricted. This
+            # invariant is needed to avoid ruining tokenization at
+            # position i+1 and higher.
+            if restrict and self.restrictions[i]:
+                for r in restrict:
+                    if r not in self.restrictions[i]:
+                        raise NotImplementedError("Unimplemented: restriction set changed")
+            return self.tokens[i]
+        raise NoMoreTokens()
+
+    def __repr__(self):
+        """Return a string showing the last 10 tokens that have been scanned in"""
+        output = ''
+        for t in self.tokens[-10:]:
+            output = '%s\n (@%s) %s = %s' % (output,t[0],t[2],repr(t[3]))
+        return output
+
+    def scan(self, restrict):
+        """Should scan another token and add it to the list, self.tokens,
+        and add the restriction to self.restrictions"""
+        # Keep looking for a token, ignoring any in self.ignore
+        while 1:
+            # Search the patterns for the longest match, with earlier
+            # tokens in the list having preference
+            best_match = -1
+            best_pat = '(error)'
+            for p, regexp in self.patterns:
+                # First check to see if we're ignoring this token
+                if restrict and p not in restrict and p not in self.ignore:
+                    continue
+                m = regexp.match(self.input, self.pos)
+                if m and len(m.group(0)) > best_match:
+                    # We got a match that's better than the previous one
+                    best_pat = p
+                    best_match = len(m.group(0))
+
+            # If we didn't find anything, raise an error
+            if best_pat == '(error)' and best_match < 0:
+                msg = 'Bad Token'
+                if restrict:
+                    msg = 'Trying to find one of '+', '.join(restrict)
+                raise SyntaxError(self.pos, msg)
+
+            # If we found something that isn't to be ignored, return it
+            if best_pat not in self.ignore:
+                # Create a token with this data
+                token = (self.pos, self.pos+best_match, best_pat,
+                         self.input[self.pos:self.pos+best_match])
+                self.pos = self.pos + best_match
+                # Only add this token if it's not in the list
+                # (to prevent looping)
+                if not self.tokens or token != self.tokens[-1]:
+                    self.tokens.append(token)
+                    self.restrictions.append(restrict)
+                return
+            else:
+                # This token should be ignored...
+                self.pos = self.pos + best_match
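+
+# A minimal sketch of driving Scanner directly (hypothetical token set;
+# the tuples in comments are the values token() would return):
+#
+#   s = Scanner([('NUM', '[0-9]+'), ('PLUS', '[+]'),
+#                ('WS', '[ ]+'), ('EOF', '$')], ['WS'], '1 + 2')
+#   s.token(0)   # (0, 1, 'NUM', '1')
+#   s.token(1)   # (2, 3, 'PLUS', '+')
+#   s.token(2)   # (4, 5, 'NUM', '2')
+#   s.token(3)   # (5, 5, 'EOF', '')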
+
+class Parser:
+    """Base class for Yapps-generated parsers.
+
+    """
+
+    def __init__(self, scanner):
+        self._scanner = scanner
+        self._pos = 0
+
+    def _peek(self, *types):
+        """Returns the token type for lookahead; if there are any args
+        then the list of args is the set of token types to allow"""
+        tok = self._scanner.token(self._pos, types)
+        return tok[2]
+
+    def _scan(self, type):
+        """Returns the matched text, and moves to the next token"""
+        tok = self._scanner.token(self._pos, [type])
+        if tok[2] != type:
+            raise SyntaxError(tok[0], 'Trying to find '+type+': '+', '.join(self._scanner.restrictions[self._pos]))
+        self._pos = 1 + self._pos
+        return tok[3]
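+
+# What a generated rule method boils down to (sketch; Calc and its
+# token set are hypothetical):
+#
+#   class Calc(Parser):
+#       def goal(self):
+#           NUM = self._scan('NUM')
+#           return int(NUM)
+#
+#   Calc(Scanner([('NUM', '[0-9]+'), ('EOF', '$')], [], '42')).goal()  # 42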
+
+class Context:
+    """Class to represent the parser's call stack.
+
+    Every rule creates a Context that links to its parent rule. The
+    contexts can be used for debugging.
+
+    """
+
+    def __init__(self, parent, scanner, tokenpos, rule, args=()):
+        """Create a new context.
+
+        Args:
+          parent: Context object or None
+          scanner: Scanner object
+          tokenpos: integer (scanner token position)
+          rule: string (name of the rule)
+          args: tuple listing parameters to the rule
+
+        """
+        self.parent = parent
+        self.scanner = scanner
+        self.tokenpos = tokenpos
+        self.rule = rule
+        self.args = args
+
+    def __str__(self):
+        output = ''
+        if self.parent: output = str(self.parent) + ' > '
+        output += self.rule
+        return output
+
+def print_line_with_pointer(text, p):
+    """Print the line of 'text' that includes position 'p',
+    along with a second line with a single caret (^) at position p"""
+
+    # TODO: separate out the logic for determining the line/character
+    # location from the logic for determining how to display an
+    # 80-column line to stderr.
+
+    # Now try printing part of the line
+    text = text[max(p-80, 0):p+80]
+    p = p - max(p-80, 0)
+
+    # Strip to the left
+    i = text[:p].rfind('\n')
+    j = text[:p].rfind('\r')
+    if i < 0 or (0 <= j < i): i = j
+    if 0 <= i < p:
+        p = p - i - 1
+        text = text[i+1:]
+
+    # Strip to the right
+    i = text.find('\n', p)
+    j = text.find('\r', p)
+    if i < 0 or (0 <= j < i): i = j
+    if i >= 0:
+        text = text[:i]
+
+    # Now shorten the text
+    while len(text) > 70 and p > 60:
+        # Cut off 10 chars
+        text = "..." + text[10:]
+        p = p - 7
+
+    # Now print the string, along with an indicator
+    print >>sys.stderr, '> ',text
+    print >>sys.stderr, '> ',' '*p + '^'
+
+def print_error(input, err, scanner):
+    """Print error messages, the parser stack, and the input text -- for human-readable error messages."""
+    # NOTE: this function assumes 80 columns :-(
+    # Figure out the line number
+    line_number = scanner.get_line_number()
+    column_number = scanner.get_column_number()
+    print >>sys.stderr, '%d:%d: %s' % (line_number, column_number, err.msg)
+
+    context = err.context
+    if not context:
+        print_line_with_pointer(input, err.charpos)
+
+    while context:
+        # TODO: add line number
+        print >>sys.stderr, 'while parsing %s%s:' % (context.rule, tuple(context.args))
+        print_line_with_pointer(input, context.scanner.get_prev_char_pos(context.tokenpos))
+        context = context.parent
+
+def wrap_error_reporter(parser, rule):
+    try:
+        return getattr(parser, rule)()
+    except SyntaxError, e:
+        input = parser._scanner.input
+        print_error(input, e, parser._scanner)
+    except NoMoreTokens:
+        print >>sys.stderr, 'Could not complete parsing; stopped around here:'
+        print >>sys.stderr, parser._scanner
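+
+# End-to-end sketch of the error-reporting path (Calc and CalcScanner
+# are hypothetical stand-ins for a generated parser and scanner):
+#
+#   P = Calc(CalcScanner('1 + '))
+#   result = wrap_error_reporter(P, 'goal')
+#   # On a SyntaxError, print_error writes 'line:column: message', the
+#   # rule stack ('while parsing ...'), and a caret under the offending
+#   # character to stderr; wrap_error_reporter then returns None.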