author | Tomas Kukosa <tomas.kukosa@siemens.com> | 2006-10-09 06:24:03 +0000
committer | Tomas Kukosa <tomas.kukosa@siemens.com> | 2006-10-09 06:24:03 +0000
commit | 410830c4e3e47fb506c2d792ab10f9ddd38e747d (patch)
tree | 2f401b4d2d304d85bf6ed7dd8a28df2fa5a0ed7e /tools
parent | cf201496162307772c31c19bba651bcbbf6e5be4 (diff)
Ply updated to version 2.1
svn path=/trunk/; revision=19459
Diffstat (limited to 'tools')
-rwxr-xr-x | tools/lex.py | 398
-rwxr-xr-x | tools/yacc.py | 1057
2 files changed, 653 insertions, 802 deletions
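Before the patch itself: the headline user-visible changes in lex.py are a fourth `lexpos` field on every `LexToken`, pass-through single-character `literals`, and a new `TOKEN` decorator for attaching a regex to a rule function. Below is a minimal sketch of how a 2.1-style tokenizer is driven; the token names and the sample input are hypothetical, not taken from this commit.

```python
# Hypothetical tokenizer sketch against the updated tools/lex.py API;
# NUMBER/ID and the sample input are illustrative only.
import lex
from lex import TOKEN

tokens = ('NUMBER', 'ID')
literals = ['+', '-', '(', ')']   # new in 2.1: single chars passed through as-is

number_re = r'\d+'

@TOKEN(number_re)                 # new in 2.1: sets t_NUMBER.__doc__ to number_re
def t_NUMBER(t):
    t.value = int(t.value)
    return t

t_ID = r'[A-Za-z_][A-Za-z0-9_]*'
t_ignore = ' \t'

def t_error(t):
    print "Illegal character '%s'" % t.value[0]
    t.skip(1)

lexer = lex.lex()
lexer.input("spam + 42")
while 1:
    tok = lexer.token()
    if not tok: break
    print tok.type, tok.value, tok.lineno, tok.lexpos   # lexpos is new in 2.1
```

The `lex()` entry point also grew an `object=` keyword so rules can live on a class instance, and the old `__copy__` method became `clone()`, which can rebind rule methods onto a new object.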
diff --git a/tools/lex.py b/tools/lex.py index 15af21357b..beaace02df 100755 --- a/tools/lex.py +++ b/tools/lex.py @@ -5,8 +5,6 @@ # # Copyright (C) 2001-2006, David M. Beazley # -# $Header: /cvs/projects/PLY/lex.py,v 1.1.1.1 2004/05/21 15:34:10 beazley Exp $ -# # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either @@ -41,20 +39,6 @@ # - The interface is geared towards LALR(1) and LR(1) parser # generators. That is tokens are generated one at a time # rather than being generated in advanced all in one step. -# -# There are a few limitations of this module -# -# - The module interface makes it somewhat awkward to support more -# than one lexer at a time. Although somewhat inelegant from a -# design perspective, this is rarely a practical concern for -# most compiler projects. -# -# - The lexer requires that the entire input text be read into -# a string before scanning. I suppose that most machines have -# enough memory to make this a minor issues, but it makes -# the lexer somewhat difficult to use in interactive sessions -# or with streaming data. -# #----------------------------------------------------------------------------- r""" @@ -186,18 +170,19 @@ scanner you have defined. # ----------------------------------------------------------------------------- -__version__ = "1.8" +__version__ = "2.1" import re, types, sys, copy # Available instance types. This is used when lexers are defined by a class. -# it's a little funky because I want to preserve backwards compatibility +# It's a little funky because I want to preserve backwards compatibility # with Python 2.0 where types.ObjectType is undefined. try: _INSTANCETYPE = (types.InstanceType, types.ObjectType) except AttributeError: _INSTANCETYPE = types.InstanceType + class object: pass # Note: needed if no new-style classes present # Exception thrown when invalid token encountered and no default class LexError(Exception): @@ -206,9 +191,9 @@ class LexError(Exception): self.text = s # Token class -class LexToken: +class LexToken(object): def __str__(self): - return "LexToken(%s,%r,%d)" % (self.type,self.value,self.lineno) + return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos) def __repr__(self): return str(self) def skip(self,n): @@ -220,42 +205,66 @@ class LexToken: # ----------------------------------------------------------------------------- # Lexer class # +# This class encapsulates all of the methods and data associated with a lexer. +# # input() - Store a new string in the lexer # token() - Get the next token # ----------------------------------------------------------------------------- class Lexer: def __init__(self): - self.lexre = None # Master regular expression - self.lexreflags = 0 # Option re compile flags - self.lexdata = None # Actual input data (as a string) - self.lexpos = 0 # Current position in input text - self.lexlen = 0 # Length of the input text - self.lexindexfunc = [ ] # Reverse mapping of groups to functions and types - self.lexerrorf = None # Error rule (if any) - self.lextokens = None # List of valid tokens - self.lexignore = None # Ignored characters - self.lineno = 1 # Current line number - self.debug = 0 # Debugging mode - self.optimize = 0 # Optimized mode - self.token = self.errtoken - - def __copy__(self): + self.lexre = None # Master regular expression. 
This is a list of + # tuples (re,findex) where re is a compiled + # regular expression and findex is a list + # mapping regex group numbers to rules + + self.lexreflags = 0 # Optional re compile flags + self.lexdata = None # Actual input data (as a string) + self.lexpos = 0 # Current position in input text + self.lexlen = 0 # Length of the input text + self.lexerrorf = None # Error rule (if any) + self.lextokens = None # List of valid tokens + self.lexignore = None # Ignored characters + self.lexliterals = "" # Literal characters that can be passed through + self.lexmodule = None # Module + self.lineno = 1 # Current line number + self.lexdebug = 0 # Debugging mode + self.lexoptimize = 0 # Optimized mode + + def clone(self,object=None): c = Lexer() c.lexre = self.lexre - c.lexreflags = self.lexreflags + c.lexreflags = self.lexreflags c.lexdata = self.lexdata c.lexpos = self.lexpos c.lexlen = self.lexlen - c.lexindexfunc = self.lexindexfunc c.lexerrorf = self.lexerrorf c.lextokens = self.lextokens c.lexignore = self.lexignore - c.debug = self.debug + c.lexdebug = self.lexdebug c.lineno = self.lineno - c.optimize = self.optimize - c.token = c.realtoken - return c + c.lexoptimize = self.lexoptimize + c.lexliterals = self.lexliterals + c.lexmodule = self.lexmodule + + # If the object parameter has been supplied, it means we are attaching the + # lexer to a new object. In this case, we have to rebind the methods + + if object: + newre = [] + for cre, findex in c.lexre: + # Loop over findex and adjust methods + newfindex = [] + for f in findex: + if not f or not f[0]: + newfindex.append(f) + continue + newfindex.append((getattr(object,f[0].__name__),f[1])) + newre.append((cre,newfindex)) + c.lexre = newre + c.lexmodule = object + + return c # ------------------------------------------------------------ # input() - Push a new string into the lexer @@ -266,33 +275,21 @@ class Lexer: self.lexdata = s self.lexpos = 0 self.lexlen = len(s) - self.token = self.realtoken - - # Change the token routine to point to realtoken() - global token - if token == self.errtoken: - token = self.token # ------------------------------------------------------------ - # errtoken() - Return error if token is called with no data - # ------------------------------------------------------------ - def errtoken(self): - raise RuntimeError, "No input string given with input()" - - # ------------------------------------------------------------ # token() - Return the next token from the Lexer # # Note: This function has been carefully implemented to be as fast # as possible. 
Don't make changes unless you really know what # you are doing # ------------------------------------------------------------ - def realtoken(self): + def token(self): # Make local copies of frequently referenced attributes lexpos = self.lexpos lexlen = self.lexlen lexignore = self.lexignore lexdata = self.lexdata - + while lexpos < lexlen: # This code provides some short-circuit code for whitespace, tabs, and other ignored characters if lexdata[lexpos] in lexignore: @@ -300,61 +297,78 @@ class Lexer: continue # Look for a regular expression match - m = self.lexre.match(lexdata,lexpos) - if m: - i = m.lastindex - lexpos = m.end() + for lexre,lexindexfunc in self.lexre: + m = lexre.match(lexdata,lexpos) + if not m: continue + tok = LexToken() tok.value = m.group() tok.lineno = self.lineno + tok.lexpos = lexpos tok.lexer = self - func,tok.type = self.lexindexfunc[i] + + lexpos = m.end() + i = m.lastindex + func,tok.type = lexindexfunc[i] + self.lexpos = lexpos + if not func: - self.lexpos = lexpos return tok # If token is processed by a function, call it - self.lexpos = lexpos newtok = func(tok) - self.lineno = tok.lineno # Update line number # Every function must return a token, if nothing, we just move to next token - if not newtok: continue + if not newtok: + lexpos = self.lexpos # This is here in case user has updated lexpos. + break # Verify type of the token. If not in the token map, raise an error - if not self.optimize: + if not self.lexoptimize: if not self.lextokens.has_key(newtok.type): raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( func.func_code.co_filename, func.func_code.co_firstlineno, func.__name__, newtok.type),lexdata[lexpos:]) return newtok - - # No match. Call t_error() if defined. - if self.lexerrorf: - tok = LexToken() - tok.value = self.lexdata[lexpos:] - tok.lineno = self.lineno - tok.type = "error" - tok.lexer = self - oldpos = lexpos - newtok = self.lexerrorf(tok) - lexpos += getattr(tok,"_skipn",0) - if oldpos == lexpos: - # Error method didn't change text position at all. This is an error. + else: + # No match, see if in literals + if lexdata[lexpos] in self.lexliterals: + tok = LexToken() + tok.value = lexdata[lexpos] + tok.lineno = self.lineno + tok.lexer = self + tok.type = tok.value + tok.lexpos = lexpos + self.lexpos = lexpos + 1 + return tok + + # No match. Call t_error() if defined. + if self.lexerrorf: + tok = LexToken() + tok.value = self.lexdata[lexpos:] + tok.lineno = self.lineno + tok.type = "error" + tok.lexer = self + tok.lexpos = lexpos + oldpos = lexpos + newtok = self.lexerrorf(tok) + lexpos += getattr(tok,"_skipn",0) + if oldpos == lexpos: + # Error method didn't change text position at all. This is an error. + self.lexpos = lexpos + raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) + if not newtok: continue self.lexpos = lexpos - raise LexError, ("Scanning error. 
Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) - if not newtok: continue - self.lexpos = lexpos - return newtok + return newtok - self.lexpos = lexpos - raise LexError, ("No match found", lexdata[lexpos:]) + self.lexpos = lexpos + raise LexError, ("No match found", lexdata[lexpos:]) - # No more input data self.lexpos = lexpos + 1 + if self.lexdata is None: + raise RuntimeError, "No input string given with input()" return None - # ----------------------------------------------------------------------------- # validate_file() @@ -404,34 +418,74 @@ def validate_file(filename): def _read_lextab(lexer, fdict, module): exec "import %s as lextab" % module - lexer.lexre = re.compile(lextab._lexre, re.VERBOSE | lextab._lexreflags) + lexre = [] + for regex,ltab in lextab._lexre: + ftab = [] + for t in ltab: + if t and t[0]: + ftab.append((fdict[t[0]], t[1])) + else: + ftab.append(t) + lexre.append((re.compile(regex, re.VERBOSE | lextab._lexreflags), ftab)) + lexer.lexre = lexre lexer.lexreflags = lextab._lexreflags - lexer.lexindexfunc = lextab._lextab - for i in range(len(lextab._lextab)): - t = lexer.lexindexfunc[i] - if t: - if t[0]: - lexer.lexindexfunc[i] = (fdict[t[0]],t[1]) lexer.lextokens = lextab._lextokens lexer.lexignore = lextab._lexignore if lextab._lexerrorf: lexer.lexerrorf = fdict[lextab._lexerrorf] - + lexer.lexliterals = lextab._lexliterals + +# ----------------------------------------------------------------------------- +# _form_master_re() +# +# This function takes a list of all of the regex components and attempts to +# form the master regular expression. Given limitations in the Python re +# module, it may be necessary to break the master regex into separate expressions. +# ----------------------------------------------------------------------------- + +def _form_master_re(relist,reflags,ldict): + if not relist: return [] + regex = "|".join(relist) + try: + lexre = re.compile(regex,re.VERBOSE | reflags) + + # Build the index to function map for the matching engine + lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1) + for f,i in lexre.groupindex.items(): + handle = ldict.get(f,None) + if type(handle) in (types.FunctionType, types.MethodType): + lexindexfunc[i] = (handle,handle.__name__[2:]) + elif handle is not None: + # If rule was specified as a string, we build an anonymous + # callback function to carry out the action + lexindexfunc[i] = (None,f[2:]) + + return [(lexre,lexindexfunc)],[regex] + except Exception,e: + m = int(len(relist)/2) + if m == 0: m = 1 + llist, lre = _form_master_re(relist[:m],reflags,ldict) + rlist, rre = _form_master_re(relist[m:],reflags,ldict) + return llist+rlist, lre+rre + # ----------------------------------------------------------------------------- # lex(module) # # Build all of the regular expression rules from definitions in the supplied module # ----------------------------------------------------------------------------- -def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0): +def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0): + global lexer ldict = None - regex = "" + regex_list = [] error = 0 files = { } - lexer = Lexer() - lexer.debug = debug - lexer.optimize = optimize + lexobj = Lexer() + lexobj.lexdebug = debug + lexobj.lexoptimize = optimize global token,input + if object: module = object + if module: # User supplied a module object. 
if isinstance(module, types.ModuleType): @@ -443,6 +497,7 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0): ldict[i] = v else: raise ValueError,"Expected a module or instance" + lexobj.lexmodule = module else: # No module given. We might be able to get information from the caller. @@ -456,11 +511,12 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0): if optimize and lextab: try: - _read_lextab(lexer,ldict, lextab) - if not lexer.lexignore: lexer.lexignore = "" - token = lexer.token - input = lexer.input - return lexer + _read_lextab(lexobj,ldict, lextab) + if not lexobj.lexignore: lexobj.lexignore = "" + token = lexobj.token + input = lexobj.input + lexer = lexobj + return lexobj except ImportError: pass @@ -480,7 +536,7 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0): raise SyntaxError,"lex: tokens must be a list or tuple." # Build a dictionary of valid token names - lexer.lextokens = { } + lexobj.lextokens = { } if not optimize: # Utility function for verifying tokens @@ -493,15 +549,26 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0): if not is_identifier(n): print "lex: Bad token name '%s'" % n error = 1 - if lexer.lextokens.has_key(n): + if lexobj.lextokens.has_key(n): print "lex: Warning. Token '%s' multiply defined." % n - lexer.lextokens[n] = None + lexobj.lextokens[n] = None else: - for n in tokens: lexer.lextokens[n] = None + for n in tokens: lexobj.lextokens[n] = None if debug: - print "lex: tokens = '%s'" % lexer.lextokens.keys() + print "lex: tokens = '%s'" % lexobj.lextokens.keys() + + # Get literals + if (module and isinstance(module,_INSTANCETYPE)): + literals = getattr(module,"literals","") + else: + try: + literals = ldict["literals"] + except KeyError: + literals = "" + + lexobj.lexliterals = literals # Get a list of symbols with the t_ prefix tsymbols = [f for f in ldict.keys() if f[:2] == 't_'] @@ -559,15 +626,21 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0): continue if f.__name__ == 't_error': - lexer.lexerrorf = f + lexobj.lexerrorf = f continue if f.__doc__: if not optimize: try: - c = re.compile(f.__doc__, re.VERBOSE | reflags) + c = re.compile("(?P<%s>%s)" % (f.__name__,f.__doc__), re.VERBOSE | reflags) + if c.match(""): + print "%s:%d: Regular expression for rule '%s' matches empty string." % (file,line,f.__name__) + error = 1 + continue except re.error,e: print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e) + if '#' in f.__doc__: + print "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'." % (file,line, f.__name__) error = 1 continue @@ -577,8 +650,7 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0): # Okay. The regular expression seemed okay. Let's append it to the master regular # expression we're building - if (regex): regex += "|" - regex += "(?P<%s>%s)" % (f.__name__,f.__doc__) + regex_list.append("(?P<%s>%s)" % (f.__name__,f.__doc__)) else: print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__) @@ -586,7 +658,7 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0): for name,r in ssymbols: if name == 't_ignore': - lexer.lexignore = r + lexobj.lexignore = r continue if not optimize: @@ -595,41 +667,38 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0): error = 1 continue - if not lexer.lextokens.has_key(name[2:]): + if not lexobj.lextokens.has_key(name[2:]): print "lex: Rule '%s' defined for an unspecified token %s." 
% (name,name[2:]) error = 1 continue try: - c = re.compile(r,re.VERBOSE) + c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | reflags) + if (c.match("")): + print "lex: Regular expression for rule '%s' matches empty string." % name + error = 1 + continue except re.error,e: print "lex: Invalid regular expression for rule '%s'. %s" % (name,e) + if '#' in r: + print "lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name + error = 1 continue if debug: print "lex: Adding rule %s -> '%s'" % (name,r) - if regex: regex += "|" - regex += "(?P<%s>%s)" % (name,r) + regex_list.append("(?P<%s>%s)" % (name,r)) if not optimize: for f in files.keys(): if not validate_file(f): error = 1 try: - if debug: - print "lex: regex = '%s'" % regex - lexer.lexre = re.compile(regex, re.VERBOSE | reflags) - # Build the index to function map for the matching engine - lexer.lexindexfunc = [ None ] * (max(lexer.lexre.groupindex.values())+1) - for f,i in lexer.lexre.groupindex.items(): - handle = ldict[f] - if type(handle) in (types.FunctionType, types.MethodType): - lexer.lexindexfunc[i] = (handle,handle.__name__[2:]) - else: - # If rule was specified as a string, we build an anonymous - # callback function to carry out the action - lexer.lexindexfunc[i] = (None,f[2:]) + lexobj.lexre, re_groups = _form_master_re(regex_list,reflags,ldict) + if debug: + for i in range(len(re_groups)): + print "lex: regex[%d] = '%s'" % (i, re_groups[i]) # If a lextab was specified, we create a file containing the precomputed # regular expression and index table @@ -637,26 +706,32 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0): if lextab and optimize: lt = open(lextab+".py","w") lt.write("# %s.py. This file automatically created by PLY. Don't edit.\n" % lextab) - lt.write("_lexre = %s\n" % repr(regex)) + + tabre = [] + for i in range(len(re_groups)): + indexf = [] + for t in lexobj.lexre[i][1]: + if t: + if t[0]: + indexf.append((t[0].__name__,t[1])) + else: + indexf.append((None,t[1])) + else: + indexf.append(None) + tabre.append((re_groups[i],indexf)) + + lt.write("_lexre = %s\n" % repr(tabre)) + + # Create an alternative lexre representation + lt.write("_lexreflags = %d\n" % reflags) - lt.write("_lextab = [\n"); - for i in range(0,len(lexer.lexindexfunc)): - t = lexer.lexindexfunc[i] - if t: - if t[0]: - lt.write(" ('%s',%s),\n"% (t[0].__name__, repr(t[1]))) - else: - lt.write(" (None,%s),\n" % repr(t[1])) - else: - lt.write(" None,\n") - - lt.write("]\n"); - lt.write("_lextokens = %s\n" % repr(lexer.lextokens)) - lt.write("_lexignore = %s\n" % repr(lexer.lexignore)) - if (lexer.lexerrorf): - lt.write("_lexerrorf = %s\n" % repr(lexer.lexerrorf.__name__)) + lt.write("_lextokens = %s\n" % repr(lexobj.lextokens)) + lt.write("_lexignore = %s\n" % repr(lexobj.lexignore)) + if (lexobj.lexerrorf): + lt.write("_lexerrorf = %s\n" % repr(lexobj.lexerrorf.__name__)) else: lt.write("_lexerrorf = None\n") + lt.write("_lexliterals = %s\n" % repr(lexobj.lexliterals)) lt.close() except re.error,e: @@ -664,16 +739,17 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0): error = 1 if error: raise SyntaxError,"lex: Unable to build lexer." - if not lexer.lexerrorf: + if not lexobj.lexerrorf: print "lex: Warning. no t_error rule is defined." 
- if not lexer.lexignore: lexer.lexignore = "" + if not lexobj.lexignore: lexobj.lexignore = "" # Create global versions of the token() and input() functions - token = lexer.token - input = lexer.input + token = lexobj.token + input = lexobj.input + lexer = lexobj - return lexer + return lexobj # ----------------------------------------------------------------------------- # run() @@ -705,8 +781,22 @@ def runmain(lexer=None,data=None): while 1: tok = _token() if not tok: break - print "(%s,%r,%d)" % (tok.type, tok.value, tok.lineno) + print "(%s,%r,%d,%d)" % (tok.type, tok.value, tok.lineno,tok.lexpos) + +# ----------------------------------------------------------------------------- +# @TOKEN(regex) +# +# This decorator function can be used to set the regex expression on a function +# when its docstring might need to be set in an alternative way +# ----------------------------------------------------------------------------- + +def TOKEN(r): + def set_doc(f): + f.__doc__ = r + return f + return set_doc + diff --git a/tools/yacc.py b/tools/yacc.py index cc5840c139..bd63d8d949 100755 --- a/tools/yacc.py +++ b/tools/yacc.py @@ -5,8 +5,6 @@ # # Copyright (C) 2001-2006, David M. Beazley # -# $Header: /cvs/projects/PLY/yacc.py,v 1.6 2004/05/26 20:51:34 beazley Exp $ -# # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either @@ -25,8 +23,10 @@ # # # This implements an LR parser that is constructed from grammar rules defined -# as Python functions. Roughly speaking, this module is a cross between -# John Aycock's Spark system and the GNU bison utility. +# as Python functions. The grammer is specified by supplying the BNF inside +# Python documentation strings. The inspiration for this technique was borrowed +# from John Aycock's Spark parsing system. PLY might be viewed as cross between +# Spark and the GNU bison utility. # # The current implementation is only somewhat object-oriented. The # LR parser itself is defined in terms of an object (which allows multiple @@ -36,8 +36,10 @@ # time using threads (in which case they should have their head examined). # # This implementation supports both SLR and LALR(1) parsing. LALR(1) -# support was implemented by Elias Ioup (ezioup@alumni.uchicago.edu) -# and hacked abit by Dave to run faster. +# support was originally implemented by Elias Ioup (ezioup@alumni.uchicago.edu), +# using the algorithm found in Aho, Sethi, and Ullman "Compilers: Principles, +# Techniques, and Tools" (The Dragon Book). LALR(1) has since been replaced +# by the more efficient DeRemer and Pennello algorithm. # # :::::::: WARNING ::::::: # @@ -48,7 +50,7 @@ # own risk! # ---------------------------------------------------------------------------- -__version__ = "1.8" +__version__ = "2.1" import types @@ -63,7 +65,7 @@ yaccdebug = 1 # Debugging mode. If set, yacc generates a debug_file = 'parser.out' # Default name of the debugging file tab_module = 'parsetab' # Default name of the table module -default_lr = 'SLR' # Default LR table generation method +default_lr = 'LALR' # Default LR table generation method error_count = 3 # Number of symbols that must be shifted to leave recovery mode @@ -100,13 +102,15 @@ class YaccSymbol: # for a symbol. 
class YaccProduction: - def __init__(self,s): + def __init__(self,s,stack=None): self.slice = s self.pbstack = [] + self.stack = stack def __getitem__(self,n): if type(n) == types.IntType: - return self.slice[n].value + if n >= 0: return self.slice[n].value + else: return self.stack[n].value else: return [s.value for s in self.slice[n.start:n.stop:n.step]] @@ -161,7 +165,7 @@ class Parser: del self.statestack[:] del self.symstack[:] sym = YaccSymbol() - sym.type = '$' + sym.type = '$end' self.symstack.append(sym) self.statestack.append(0) @@ -177,7 +181,8 @@ class Parser: # If no lexer was given, we will try to use the lex module if not lexer: - import lex as lexer + import lex + lexer = lex.lexer pslice.lexer = lexer @@ -193,12 +198,13 @@ class Parser: symstack = [ ] # Stack of grammar symbols self.symstack = symstack + pslice.stack = symstack # Put in the production errtoken = None # Err token - # The start state is assumed to be (0,$) + # The start state is assumed to be (0,$end) statestack.append(0) sym = YaccSymbol() - sym.type = '$' + sym.type = '$end' symstack.append(sym) while 1: @@ -214,7 +220,7 @@ class Parser: lookahead = lookaheadstack.pop() if not lookahead: lookahead = YaccSymbol() - lookahead.type = '$' + lookahead.type = '$end' if debug: errorlead = ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip() @@ -225,14 +231,10 @@ class Parser: if debug > 1: print 'action', t - if t is None: - t = actions.get((s,'#'),None) - if debug > 1: - print 'default action', t if t is not None: if t > 0: # shift a symbol on the stack - if ltype == '$': + if ltype == '$end': # Error, end of input sys.stderr.write("yacc: Parse error. EOF\n") return @@ -311,7 +313,7 @@ class Parser: if not self.errorcount: self.errorcount = error_count errtoken = lookahead - if errtoken.type == '$': + if errtoken.type == '$end': errtoken = None # End of file! if self.errorfunc: global errok,token,restart @@ -347,7 +349,7 @@ class Parser: # entire parse has been rolled back and we're completely hosed. The token is # discarded and we just keep going. - if len(statestack) <= 1 and lookahead.type != '$': + if len(statestack) <= 1 and lookahead.type != '$end': lookahead = None errtoken = None # Nuke the pushback stack @@ -358,7 +360,7 @@ class Parser: # at the end of the file. nuke the top entry and generate an error token # Start nuking entries on the stack - if lookahead.type == '$': + if lookahead.type == '$end': # Whoa. We're really hosed here. Bail out return @@ -465,9 +467,6 @@ def initialize_vars(): global Nonterminals, First, Follow, Precedence, LRitems global Errorfunc, Signature, Requires - # LALR(1) globals - global Prodempty, TReductions, NTReductions, GotoSetNum, Canonical - Productions = [None] # A list of all of the productions. The first # entry is always reserved for the purpose of # building an augmented grammar @@ -502,22 +501,6 @@ def initialize_vars(): Requires = { } # Requires list - # LALR(1) Initialization - Prodempty = { } # A dictionary of all productions that have an empty rule - # of the form P : <empty> - - TReductions = { } # A dictionary of precomputer reductions from - # nonterminals to terminals - - NTReductions = { } # A dictionary of precomputed reductions from - # nonterminals to nonterminals - - GotoSetNum = { } # A dictionary that remembers goto sets based on - # the state number and symbol - - Canonical = { } # A list of LR item sets. 
A LR item set is a list of LR - # items that represent the state of the parser - # File objects used when creating the parser.out debugging file global _vf, _vfc _vf = cStringIO.StringIO() @@ -635,7 +618,20 @@ def add_production(f,file,line,prodname,syms): sys.stderr.write("%s:%d: Illegal rule name '%s'\n" % (file,line,prodname)) return -1 - for s in syms: + for x in range(len(syms)): + s = syms[x] + if s[0] in "'\"": + try: + c = eval(s) + if (len(c) > 1): + sys.stderr.write("%s:%d: Literal token %s in rule '%s' may only be a single character\n" % (file,line,s, prodname)) + return -1 + if not Terminals.has_key(c): + Terminals[c] = [] + syms[x] = c + continue + except SyntaxError: + pass if not is_identifier(s) and s != '%prec': sys.stderr.write("%s:%d: Illegal name '%s' in rule '%s'\n" % (file,line,s, prodname)) return -1 @@ -766,8 +762,12 @@ def add_function(f): if assign != ':' and assign != '::=': sys.stderr.write("%s:%d: Syntax error. Expected ':'\n" % (file,dline)) return -1 + + e = add_production(f,file,dline,prodname,syms) error += e + + except StandardError: sys.stderr.write("%s:%d: Syntax error in rule '%s'\n" % (file,dline,ps)) error -= 1 @@ -823,7 +823,7 @@ def compute_terminates(): for t in Terminals.keys(): Terminates[t] = 1 - Terminates['$'] = 1 + Terminates['$end'] = 1 # Nonterminals: @@ -1064,14 +1064,14 @@ def first(beta): # that might follow it. Dragon book, p. 189. def compute_follow(start=None): - # Add '$' to the follow list of the start symbol + # Add '$end' to the follow list of the start symbol for k in Nonterminals.keys(): Follow[k] = [ ] if not start: start = Productions[1].name - Follow[start] = [ '$' ] + Follow[start] = [ '$end' ] while 1: didadd = 0 @@ -1113,7 +1113,7 @@ def compute_first1(): for t in Terminals.keys(): First[t] = [t] - First['$'] = ['$'] + First['$end'] = ['$end'] First['#'] = ['#'] # what's this for? # Nonterminals: @@ -1215,55 +1215,16 @@ def lr0_goto(I,x): s[id(n)] = s1 gs.append(n) s = s1 - g = s.get('$',None) + g = s.get('$end',None) if not g: if gs: g = lr0_closure(gs) - s['$'] = g + s['$end'] = g else: - s['$'] = gs + s['$end'] = gs _lr_goto_cache[(id(I),x)] = g return g -# Added for LALR(1) - -# Given a setnumber of an lr0 state and a symbol return the setnumber of the goto state -def lr0_goto_setnumber(I_setnumber, x): - global Canonical - global GotoSetNum - - if GotoSetNum.has_key((I_setnumber, x)): - setnumber = GotoSetNum[(I_setnumber, x)] - else: - gset = lr0_goto(Canonical[I_setnumber], x) - if not gset: - return -1 - else: - gsetlen = len(gset) - for i in xrange(len(gset[0].setnumbers)): - inall = 1 - for item in gset: - if not item.setnumbers[i]: - inall = 0 - break - if inall and len(Canonical[i]) == gsetlen: - setnumber = i - break # Note: DB. I added this to improve performance. - # Not sure if this breaks the algorithm (it doesn't appear to). 
- - GotoSetNum[(I_setnumber, x)] = setnumber - - return setnumber - -# Compute the kernel of a set of LR(0) items -def lr0_kernel(I): - KI = [ ] - for p in I: - if p.name == "S'" or p.lr_index > 0 or p.len == 0: - KI.append(p) - - return KI - _lr0_cidhash = { } # Compute the LR(0) sets of item function @@ -1297,468 +1258,382 @@ def lr0_items(): return C # ----------------------------------------------------------------------------- -# slr_parse_table() +# ==== LALR(1) Parsing ==== +# +# LALR(1) parsing is almost exactly the same as SLR except that instead of +# relying upon Follow() sets when performing reductions, a more selective +# lookahead set that incorporates the state of the LR(0) machine is utilized. +# Thus, we mainly just have to focus on calculating the lookahead sets. +# +# The method used here is due to DeRemer and Pennelo (1982). +# +# DeRemer, F. L., and T. J. Pennelo: "Efficient Computation of LALR(1) +# Lookahead Sets", ACM Transactions on Programming Languages and Systems, +# Vol. 4, No. 4, Oct. 1982, pp. 615-649 # -# This function constructs an SLR table. +# Further details can also be found in: +# +# J. Tremblay and P. Sorenson, "The Theory and Practice of Compiler Writing", +# McGraw-Hill Book Company, (1985). +# +# Note: This implementation is a complete replacement of the LALR(1) +# implementation in PLY-1.x releases. That version was based on +# a less efficient algorithm and it had bugs in its implementation. # ----------------------------------------------------------------------------- -def slr_parse_table(): - global _lr_method - goto = _lr_goto # Goto array - action = _lr_action # Action array - actionp = { } # Action production array (temporary) - - _lr_method = "SLR" - - n_srconflict = 0 - n_rrconflict = 0 - if yaccdebug: - sys.stderr.write("yacc: Generating SLR parsing table...\n") - _vf.write("\n\nParsing method: SLR\n\n") - - # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items - # This determines the number of states - - C = lr0_items() +# ----------------------------------------------------------------------------- +# compute_nullable_nonterminals() +# +# Creates a dictionary containing all of the non-terminals that might produce +# an empty production. +# ----------------------------------------------------------------------------- - # Build the parser table, state by state - st = 0 - for I in C: - # Loop over each production in I - actlist = [ ] # List of actions - - if yaccdebug: - _vf.write("\nstate %d\n\n" % st) - for p in I: - _vf.write(" (%d) %s\n" % (p.number, str(p))) - _vf.write("\n") +def compute_nullable_nonterminals(): + nullable = {} + num_nullable = 0 + while 1: + for p in Productions[1:]: + if p.len == 0: + nullable[p.name] = 1 + continue + for t in p.prod: + if not nullable.has_key(t): break + else: + nullable[p.name] = 1 + if len(nullable) == num_nullable: break + num_nullable = len(nullable) + return nullable - for p in I: - try: - if p.prod[-1] == ".": - if p.name == "S'": - # Start symbol. Accept! - action[st,"$"] = 0 - actionp[st,"$"] = p - else: - # We are at the end of a production. Reduce! - for a in Follow[p.name]: - actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) - r = action.get((st,a),None) - if r is not None: - # Whoa. Have a shift/reduce or reduce/reduce conflict - if r > 0: - # Need to decide on shift or reduce here - # By default we favor shifting. Need to add - # some precedence rules here. 
- sprec,slevel = Productions[actionp[st,a].number].prec - rprec,rlevel = Precedence.get(a,('right',0)) - if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): - # We really need to reduce here. - action[st,a] = -p.number - actionp[st,a] = p - if not slevel and not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) - n_srconflict += 1 - elif (slevel == rlevel) and (rprec == 'nonassoc'): - action[st,a] = None - else: - # Hmmm. Guess we'll keep the shift - if not slevel and not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - n_srconflict +=1 - elif r < 0: - # Reduce/reduce conflict. In this case, we favor the rule - # that was defined first in the grammar file - oldp = Productions[-r] - pp = Productions[p.number] - if oldp.line > pp.line: - action[st,a] = -p.number - actionp[st,a] = p - # sys.stderr.write("Reduce/reduce conflict in state %d\n" % st) - n_rrconflict += 1 - _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, actionp[st,a].number, actionp[st,a])) - _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,actionp[st,a].number, actionp[st,a])) - else: - sys.stderr.write("Unknown conflict in state %d\n" % st) - else: - action[st,a] = -p.number - actionp[st,a] = p - else: - i = p.lr_index - a = p.prod[i+1] # Get symbol right after the "." - if Terminals.has_key(a): - g = lr0_goto(I,a) - j = _lr0_cidhash.get(id(g),-1) - if j >= 0: - # We are in a shift state - actlist.append((a,p,"shift and go to state %d" % j)) - r = action.get((st,a),None) - if r is not None: - # Whoa have a shift/reduce or shift/shift conflict - if r > 0: - if r != j: - sys.stderr.write("Shift/shift conflict in state %d\n" % st) - elif r < 0: - # Do a precedence check. - # - if precedence of reduce rule is higher, we reduce. - # - if precedence of reduce is same and left assoc, we reduce. - # - otherwise we shift - rprec,rlevel = Productions[actionp[st,a].number].prec - sprec,slevel = Precedence.get(a,('right',0)) - if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): - # We decide to shift here... highest precedence to shift - action[st,a] = j - actionp[st,a] = p - if not slevel and not rlevel: - n_srconflict += 1 - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - elif (slevel == rlevel) and (rprec == 'nonassoc'): - action[st,a] = None - else: - # Hmmm. Guess we'll keep the reduce - if not slevel and not rlevel: - n_srconflict +=1 - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) - - else: - sys.stderr.write("Unknown conflict in state %d\n" % st) - else: - action[st,a] = j - actionp[st,a] = p - - except StandardError,e: - raise YaccError, "Hosed in slr_parse_table", e +# ----------------------------------------------------------------------------- +# find_nonterminal_trans(C) +# +# Given a set of LR(0) items, this functions finds all of the non-terminal +# transitions. These are transitions in which a dot appears immediately before +# a non-terminal. Returns a list of tuples of the form (state,N) where state +# is the state number and N is the nonterminal symbol. +# +# The input C is the set of LR(0) items. 
+# ----------------------------------------------------------------------------- - # Print the actions associated with each terminal - if yaccdebug: - _actprint = { } - for a,p,m in actlist: - if action.has_key((st,a)): - if p is actionp[st,a]: - _vf.write(" %-15s %s\n" % (a,m)) - _actprint[(a,m)] = 1 - _vf.write("\n") - for a,p,m in actlist: - if action.has_key((st,a)): - if p is not actionp[st,a]: - if not _actprint.has_key((a,m)): - _vf.write(" ! %-15s [ %s ]\n" % (a,m)) - _actprint[(a,m)] = 1 - - # Construct the goto table for this state - if yaccdebug: - _vf.write("\n") - nkeys = { } - for ii in I: - for s in ii.usyms: - if Nonterminals.has_key(s): - nkeys[s] = None - for n in nkeys.keys(): - g = lr0_goto(I,n) - j = _lr0_cidhash.get(id(g),-1) - if j >= 0: - goto[st,n] = j - if yaccdebug: - _vf.write(" %-30s shift and go to state %d\n" % (n,j)) +def find_nonterminal_transitions(C): + trans = [] + for state in range(len(C)): + for p in C[state]: + if p.lr_index < p.len - 1: + t = (state,p.prod[p.lr_index+1]) + if Nonterminals.has_key(t[1]): + if t not in trans: trans.append(t) + state = state + 1 + return trans - st += 1 +# ----------------------------------------------------------------------------- +# dr_relation() +# +# Computes the DR(p,A) relationships for non-terminal transitions. The input +# is a tuple (state,N) where state is a number and N is a nonterminal symbol. +# +# Returns a list of terminals. +# ----------------------------------------------------------------------------- - if yaccdebug: - if n_srconflict == 1: - sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict) - if n_srconflict > 1: - sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict) - if n_rrconflict == 1: - sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict) - if n_rrconflict > 1: - sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict) +def dr_relation(C,trans,nullable): + dr_set = { } + state,N = trans + terms = [] + g = lr0_goto(C[state],N) + for p in g: + if p.lr_index < p.len - 1: + a = p.prod[p.lr_index+1] + if Terminals.has_key(a): + if a not in terms: terms.append(a) + # This extra bit is to handle the start state + if state == 0 and N == Productions[0].prod[0]: + terms.append('$end') + + return terms # ----------------------------------------------------------------------------- -# ==== LALR(1) Parsing ==== -# FINISHED! 5/20/2003 by Elias Ioup +# reads_relation() +# +# Computes the READS() relation (p,A) READS (t,C). # ----------------------------------------------------------------------------- +def reads_relation(C, trans, empty): + # Look for empty transitions + rel = [] + state, N = trans -# Compute the lr1_closure of a set I. I is a list of productions and setnumber -# is the state that you want the lr items that are made from the to come from. + g = lr0_goto(C[state],N) + j = _lr0_cidhash.get(id(g),-1) + for p in g: + if p.lr_index < p.len - 1: + a = p.prod[p.lr_index + 1] + if empty.has_key(a): + rel.append((j,a)) -_lr1_add_count = 0 + return rel -def lr1_closure(I, setnumber = 0): - global _add_count - global Nonterminals +# ----------------------------------------------------------------------------- +# compute_lookback_includes() +# +# Determines the lookback and includes relations +# +# LOOKBACK: +# +# This relation is determined by running the LR(0) state machine forward. +# For example, starting with a production "N : . A B C", we run it forward +# to obtain "N : A B C ." 
We then build a relationship between this final +# state and the starting state. These relationships are stored in a dictionary +# lookdict. +# +# INCLUDES: +# +# Computes the INCLUDE() relation (p,A) INCLUDES (p',B). +# +# This relation is used to determine non-terminal transitions that occur +# inside of other non-terminal transition states. (p,A) INCLUDES (p', B) +# if the following holds: +# +# B -> LAT, where T -> epsilon and p' -L-> p +# +# L is essentially a prefix (which may be empty), T is a suffix that must be +# able to derive an empty string. State p' must lead to state p with the string L. +# +# ----------------------------------------------------------------------------- - _add_count += 1 - prodlist = Productions +def compute_lookback_includes(C,trans,nullable): + + lookdict = {} # Dictionary of lookback relations + includedict = {} # Dictionary of include relations - # Add everything in I to J - J = I[:] - Jhash = { } - for j in J: - Jhash[id(j)] = 1 + # Make a dictionary of non-terminal transitions + dtrans = {} + for t in trans: + dtrans[t] = 1 + + # Loop over all transitions and compute lookbacks and includes + for state,N in trans: + lookb = [] + includes = [] + for p in C[state]: + if p.name != N: continue - didadd = 1 - while didadd: - didadd = 0 - for j in J: - jprod = j.prod - jlr_index = j.lr_index - jprodslice = jprod[jlr_index+2:] - - if jlr_index < len(jprod) - 1 and Nonterminals.has_key(jprod[jlr_index+1]): - first_syms = [] - - if j.lk_added.setdefault(setnumber, 0) < len(j.lookaheads[setnumber]): - for a in j.lookaheads[setnumber][j.lk_added[setnumber]:]: - # find b in FIRST(Xa) if j = [A->a.BX,a] - temp_first_syms = first(jprodslice + (a,)) - for x in temp_first_syms: - if x not in first_syms: - first_syms.append(x) - - j.lk_added[setnumber] = len(j.lookaheads[setnumber]) - - for x in j.lrafter: - - # Add B --> .G to J - if x.lr_next.lookaheads.has_key(setnumber): - _xlook = x.lr_next.lookaheads[setnumber] - for s in first_syms: - if s not in _xlook: - _xlook.append(s) - didadd = 1 - else: - x.lr_next.lookaheads[setnumber] = first_syms - didadd = 1 - - nid = id(x.lr_next) - if not Jhash.has_key(nid): - J.append(x.lr_next) - Jhash[nid] = 1 - - return J + # Okay, we have a name match. We now follow the production all the way + # through the state machine until we get the . on the right hand side + + lr_index = p.lr_index + j = state + while lr_index < p.len - 1: + lr_index = lr_index + 1 + t = p.prod[lr_index] + + # Check to see if this symbol and state are a non-terminal transition + if dtrans.has_key((j,t)): + # Yes. Okay, there is some chance that this is an includes relation + # the only way to know for certain is whether the rest of the + # production derives empty + + li = lr_index + 1 + while li < p.len: + if Terminals.has_key(p.prod[li]): break # No forget it + if not nullable.has_key(p.prod[li]): break + li = li + 1 + else: + # Appears to be a relation between (j,t) and (state,N) + includes.append((j,t)) + + g = lr0_goto(C[j],t) # Go to next set + j = _lr0_cidhash.get(id(g),-1) # Go to next state + + # When we get here, j is the final state, now we have to locate the production + for r in C[j]: + if r.name != p.name: continue + if r.len != p.len: continue + i = 0 + # This look is comparing a production ". A B C" with "A B C ." 
+ while i < r.lr_index: + if r.prod[i] != p.prod[i+1]: break + i = i + 1 + else: + lookb.append((j,r)) + for i in includes: + if not includedict.has_key(i): includedict[i] = [] + includedict[i].append((state,N)) + lookdict[(state,N)] = lookb + + return lookdict,includedict -def add_lookaheads(K): - spontaneous = [] - propogate = [] - - for setnumber in range(len(K)): - for kitem in K[setnumber]: - kitem.lookaheads[setnumber] = ['#'] - J = lr1_closure([kitem], setnumber) - - # find the lookaheads that are spontaneously created from closures - # and the propogations of lookaheads between lr items - for item in J: - if item.lr_index < len(item.prod)-1: - for lookahead in item.lookaheads[setnumber]: - goto_setnumber = lr0_goto_setnumber(setnumber, item.prod[item.lr_index+1]) - next = None - if lookahead != '#': - if item.lr_next in K[goto_setnumber]: - next = item.lr_next - if next: - spontaneous.append((next, (lookahead, goto_setnumber))) - else: - if goto_setnumber > -1: - if item.lr_next in K[goto_setnumber]: - next = item.lr_next - - if next: - propogate.append(((kitem, setnumber), (next, goto_setnumber))) +# ----------------------------------------------------------------------------- +# digraph() +# traverse() +# +# The following two functions are used to compute set valued functions +# of the form: +# +# F(x) = F'(x) U U{F(y) | x R y} +# +# This is used to compute the values of Read() sets as well as FOLLOW sets +# in LALR(1) generation. +# +# Inputs: X - An input set +# R - A relation +# FP - Set-valued function +# ------------------------------------------------------------------------------ + +def digraph(X,R,FP): + N = { } + for x in X: + N[x] = 0 + stack = [] + F = { } + for x in X: + if N[x] == 0: traverse(x,N,stack,F,X,R,FP) + return F + +def traverse(x,N,stack,F,X,R,FP): + stack.append(x) + d = len(stack) + N[x] = d + F[x] = FP(x) # F(X) <- F'(x) + + rel = R(x) # Get y's related to x + for y in rel: + if N[y] == 0: + traverse(y,N,stack,F,X,R,FP) + N[x] = min(N[x],N[y]) + for a in F.get(y,[]): + if a not in F[x]: F[x].append(a) + if N[x] == d: + N[stack[-1]] = sys.maxint + F[stack[-1]] = F[x] + element = stack.pop() + while element != x: + N[stack[-1]] = sys.maxint + F[stack[-1]] = F[x] + element = stack.pop() - +# ----------------------------------------------------------------------------- +# compute_read_sets() +# +# Given a set of LR(0) items, this function computes the read sets. 
+# +# Inputs: C = Set of LR(0) items +# ntrans = Set of nonterminal transitions +# nullable = Set of empty transitions +# +# Returns a set containing the read sets +# ----------------------------------------------------------------------------- - for x in K[setnumber]: - x.lookaheads[setnumber] = [] +def compute_read_sets(C, ntrans, nullable): + FP = lambda x: dr_relation(C,x,nullable) + R = lambda x: reads_relation(C,x,nullable) + F = digraph(ntrans,R,FP) + return F - for x in spontaneous: - if x[1][0] not in x[0].lookaheads[x[1][1]]: - x[0].lookaheads[x[1][1]].append(x[1][0]) +# ----------------------------------------------------------------------------- +# compute_follow_sets() +# +# Given a set of LR(0) items, a set of non-terminal transitions, a readset, +# and an include set, this function computes the follow sets +# +# Follow(p,A) = Read(p,A) U U {Follow(p',B) | (p,A) INCLUDES (p',B)} +# +# Inputs: +# ntrans = Set of nonterminal transitions +# readsets = Readset (previously computed) +# inclsets = Include sets (previously computed) +# +# Returns a set containing the follow sets +# ----------------------------------------------------------------------------- - K[0][0].lookaheads[0] = ['$'] +def compute_follow_sets(ntrans,readsets,inclsets): + FP = lambda x: readsets[x] + R = lambda x: inclsets.get(x,[]) + F = digraph(ntrans,R,FP) + return F - pitems = {} - for x in propogate: - if pitems.has_key(x[0]): - pitems[x[0]].append(x[1]) - else: - pitems[x[0]] = [] - pitems[x[0]].append(x[1]) - - # propogate the lookaheads that were spontaneously generated - # based on the propogations produced above - stop = 0 - - while not stop: - stop = 1 - kindex = 0 - for set in K: - for item in set: - pkey = (item, kindex) - if pitems.has_key(pkey): - for propogation in pitems[pkey]: - gitem = propogation[0] - gsetnumber = propogation[1] - glookaheads = gitem.lookaheads[gsetnumber] - for lookahead in item.lookaheads[kindex]: - if lookahead not in glookaheads: - glookaheads.append(lookahead) - stop = 0 - kindex += 1 - -def ReduceNonterminals(): - global Nonterminals - - global TReductions - global NTReductions - - for nt in Nonterminals.keys(): - TReductions[nt] = [] - NTReductions[nt] = [] - - for nt in Nonterminals.keys(): - terms = ReduceToTerminals(nt) - TReductions[nt].extend(terms) - if not NTReductions.has_key(nt): - ReduceToNonterminals(nt) - +# ----------------------------------------------------------------------------- +# add_lookaheads() +# +# Attaches the lookahead symbols to grammar rules. 
+# +# Inputs: lookbacks - Set of lookback relations +# followset - Computed follow set +# +# This function directly attaches the lookaheads to productions contained +# in the lookbacks set +# ----------------------------------------------------------------------------- +def add_lookaheads(lookbacks,followset): + for trans,lb in lookbacks.items(): + # Loop over productions in lookback + for state,p in lb: + if not p.lookaheads.has_key(state): + p.lookaheads[state] = [] + f = followset.get(trans,[]) + for a in f: + if a not in p.lookaheads[state]: p.lookaheads[state].append(a) + +# ----------------------------------------------------------------------------- +# add_lalr_lookaheads() +# +# This function does all of the work of adding lookahead information for use +# with LALR parsing +# ----------------------------------------------------------------------------- -def ReduceToTerminals(nt,cyclemap=None): - global Prodnames - global Terminals - reducedterminals = [] - if not cyclemap: cyclemap = {} +def add_lalr_lookaheads(C): + # Determine all of the nullable nonterminals + nullable = compute_nullable_nonterminals() - if cyclemap.has_key(nt): return [] - cyclemap[nt] = 1 + # Find all non-terminal transitions + trans = find_nonterminal_transitions(C) - for p in Prodnames[nt]: - if len(p.prod) > 0: - if Terminals.has_key(p.prod[0]): - if p.prod[0] not in reducedterminals: - reducedterminals.append(p.prod[0]) - else: - if p.prod[0] != nt: - terms = ReduceToTerminals(p.prod[0],cyclemap) - for t in terms: - if t not in reducedterminals: - reducedterminals.append(t) - del cyclemap[nt] - return reducedterminals + # Compute read sets + readsets = compute_read_sets(C,trans,nullable) - -def ReduceToNonterminals(nt): - global Prodnames - global Nonterminals - global NTReductions - reducednonterminals = [] - - for p in Prodnames[nt]: - if len(p.prod) > 0: - if Nonterminals.has_key(p.prod[0]): - if p.prod[0] not in reducednonterminals: - reducednonterminals.append(p.prod[0]) - if p.prod[0] != nt: - if not NTReductions.has_key(p.prod[0]): - ReduceToNonterminals(p.prod[0]) - - nterms = NTReductions[p.prod[0]] - for nt in nterms: - if nt not in reducednonterminals: - reducednonterminals.append(nt) - + # Compute lookback/includes relations + lookd, included = compute_lookback_includes(C,trans,nullable) - NTReductions[nt] = reducednonterminals + # Compute LALR FOLLOW sets + followsets = compute_follow_sets(trans,readsets,included) + + # Add all of the lookaheads + add_lookaheads(lookd,followsets) # ----------------------------------------------------------------------------- -# lalr_parse_table() +# lr_parse_table() # -# This function constructs an LALR table. +# This function constructs the parse tables for SLR or LALR # ----------------------------------------------------------------------------- -def lalr_parse_table(): +def lr_parse_table(method): global _lr_method goto = _lr_goto # Goto array action = _lr_action # Action array actionp = { } # Action production array (temporary) - goto_cache = _lr_goto_cache - cid_hash = _lr0_cidhash - - _lr_method = "LALR" + + _lr_method = method n_srconflict = 0 n_rrconflict = 0 if yaccdebug: - sys.stderr.write("yacc: Generating LALR(1) parsing table...\n") - _vf.write("\n\nParsing method: LALR(1)\n\n") + sys.stderr.write("yacc: Generating %s parsing table...\n" % method) + _vf.write("\n\nParsing method: %s\n\n" % method) # Step 1: Construct C = { I0, I1, ... 
IN}, collection of LR(0) items # This determines the number of states - + C = lr0_items() - global Canonical - Canonical = C - - ### - # Create the kernel states. - ### - K = [] - setC = [0]*len(C) - for x in C: - K.append(lr0_kernel(x)) - for y in x: - y.setnumbers = setC[:] - - _cindex = 0 - for x in C: - for y in x: - y.lookaheads[_cindex] = [] - y.setnumbers[_cindex] = 1 - _cindex = _cindex + 1 - - ### - # Add lookaheads to the lr items - ### - - add_lookaheads(K) - - ### - # Do the reductions for parsing first and keep them in globals - ### - - ReduceNonterminals() - - global TReductions - global NTReductions - global Prodempty - - EmptyAncestors = {} - for y in Prodempty.keys(): - EmptyAncestors[y] = [] - for x in NTReductions.items(): - for y in x[1]: - if Prodempty.has_key(y): - EmptyAncestors[y].append(x[0]) - + if method == 'LALR': + add_lalr_lookaheads(C) # Build the parser table, state by state st = 0 for I in C: # Loop over each production in I actlist = [ ] # List of actions - acthash = { } - - idI = id(I) if yaccdebug: _vf.write("\nstate %d\n\n" % st) @@ -1766,86 +1641,20 @@ def lalr_parse_table(): _vf.write(" (%d) %s\n" % (p.number, str(p))) _vf.write("\n") - global First for p in I: try: if p.prod[-1] == ".": if p.name == "S'": # Start symbol. Accept! - action[st,"$"] = 0 - actionp[st,"$"] = p - elif len(p.prod) == 0: - ancestors = EmptyAncestors[p.name] - for i in ancestors: - for s in K: - if i in s: - input_list = [] - plist = Productions[i.name] - for x in plist: - if len(x.prod) > 0 and x.prod[0] == p.name: - n = p.prod[1:] - d = x.prod[lr_index+2:] - for l in x.lookaheads.items(): - flist = First[tuple(n+d+[l])] - for f in flist: - if f not in input_list and f in p.lookaheads[st]: - input_list.append(f) - - # We are at the end of a production. Reduce! - #print "input_list: %s" % input_list - #print "Follow[p.name]: %s" % Follow[p.name] - for a in input_list: - actlist.append((a,p,"reduce using rule %d (%s) " % (p.number,p))) - r = action.get((st,a),None) - if r is not None: - # Whoa. Have a shift/reduce or reduce/reduce conflict - if r > 0: - # Need to decide on shift or reduce here - # By default we favor shifting. Need to add - # some precedence rules here. - sprec,slevel = Productions[actionp[st,a].number].prec - rprec,rlevel = Precedence.get(a,('right',0)) - if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): - # We really need to reduce here. - action[st,a] = -p.number - actionp[st,a] = p - if not slevel and not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) - n_srconflict += 1 - elif (slevel == rlevel) and (rprec == 'nonassoc'): - action[st,a] = None - else: - # Hmmm. Guess we'll keep the shift - if not slevel and not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - n_srconflict +=1 - elif r < 0: - # Reduce/reduce conflict. In this case, we favor the rule - # that was defined first in the grammar file - oldp = Productions[-r] - pp = Productions[p.number] - if oldp.line > pp.line: - action[st,a] = -p.number - actionp[st,a] = p - # print "Reduce/reduce conflict in state %d" % st - n_rrconflict += 1 - _vfc.write("reduce/reduce conflict in state %d resolved using rule %d.\n" % (st, actionp[st,a].number)) - _vf.write(" ! 
reduce/reduce conflict for %s resolved using rule %d.\n" % (a,actionp[st,a].number)) - else: - sys.stderr.write("Unknown conflict in state %d\n" % st) - else: - action[st,a] = -p.number - actionp[st,a] = p - - break # break out of the for s in K loop because we only want to make - # sure that a production is in the Kernel - + action[st,"$end"] = 0 + actionp[st,"$end"] = p else: # We are at the end of a production. Reduce! - - for a in p.lookaheads[st]: + if method == 'LALR': + laheads = p.lookaheads[st] + else: + laheads = Follow[p.name] + for a in laheads: actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) r = action.get((st,a),None) if r is not None: @@ -1855,7 +1664,7 @@ def lalr_parse_table(): # By default we favor shifting. Need to add # some precedence rules here. sprec,slevel = Productions[actionp[st,a].number].prec - rprec,rlevel = Precedence.get(a,('right',0)) + rprec,rlevel = Precedence.get(a,('right',0)) if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): # We really need to reduce here. action[st,a] = -p.number @@ -1880,12 +1689,12 @@ def lalr_parse_table(): if oldp.line > pp.line: action[st,a] = -p.number actionp[st,a] = p - # print "Reduce/reduce conflict in state %d" % st + # sys.stderr.write("Reduce/reduce conflict in state %d\n" % st) n_rrconflict += 1 - _vfc.write("reduce/reduce conflict in state %d resolved using rule %d.\n" % (st, actionp[st,a].number)) - _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d.\n" % (a,actionp[st,a].number)) + _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, actionp[st,a].number, actionp[st,a])) + _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,actionp[st,a].number, actionp[st,a])) else: - print "Unknown conflict in state %d" % st + sys.stderr.write("Unknown conflict in state %d\n" % st) else: action[st,a] = -p.number actionp[st,a] = p @@ -1893,14 +1702,11 @@ def lalr_parse_table(): i = p.lr_index a = p.prod[i+1] # Get symbol right after the "." if Terminals.has_key(a): - g = goto_cache[(idI,a)] - j = cid_hash.get(id(g),-1) + g = lr0_goto(I,a) + j = _lr0_cidhash.get(id(g),-1) if j >= 0: # We are in a shift state - _k = (a,j) - if not acthash.has_key(_k): - actlist.append((a,p,"shift and go to state %d" % j)) - acthash[_k] = 1 + actlist.append((a,p,"shift and go to state %d" % j)) r = action.get((st,a),None) if r is not None: # Whoa have a shift/reduce or shift/shift conflict @@ -1936,102 +1742,54 @@ def lalr_parse_table(): else: action[st,a] = j actionp[st,a] = p - else: - nonterminal = a - term_list = TReductions[nonterminal] - # DB: This loop gets executed a lot. Try to optimize - for a in term_list: - g = goto_cache[(idI,a)] - j = cid_hash[id(g)] - if j >= 0: - # We are in a shift state - # Don't put repeated shift actions on action list (performance hack) - _k = (a,j) - if not acthash.has_key(_k): - actlist.append((a,p,"shift and go to state "+str(j))) - acthash[_k] = 1 - - r = action.get((st,a),None) - if r is not None: - # Whoa have a shift/reduce or shift/shift conflict - if r > 0: - if r != j: - sys.stderr.write("Shift/shift conflict in state %d\n" % st) - continue - elif r < 0: - # Do a precedence check. - # - if precedence of reduce rule is higher, we reduce. - # - if precedence of reduce is same and left assoc, we reduce. 
- # - otherwise we shift - rprec,rlevel = Productions[actionp[st,a].number].prec - sprec,slevel = Precedence.get(a,('right',0)) - if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): - # We decide to shift here... highest precedence to shift - action[st,a] = j - actionp[st,a] = p - if not slevel and not rlevel: - n_srconflict += 1 - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - elif (slevel == rlevel) and (rprec == 'nonassoc'): - action[st,a] = None - else: - # Hmmm. Guess we'll keep the reduce - if not slevel and not rlevel: - n_srconflict +=1 - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) - - else: - sys.stderr.write("Unknown conflict in state %d\n" % st) - else: - action[st,a] = j - actionp[st,a] = p - + except StandardError,e: - raise YaccError, "Hosed in lalr_parse_table", e + raise YaccError, "Hosed in lr_parse_table", e # Print the actions associated with each terminal if yaccdebug: + _actprint = { } for a,p,m in actlist: if action.has_key((st,a)): if p is actionp[st,a]: _vf.write(" %-15s %s\n" % (a,m)) + _actprint[(a,m)] = 1 _vf.write("\n") - for a,p,m in actlist: if action.has_key((st,a)): if p is not actionp[st,a]: - _vf.write(" ! %-15s [ %s ]\n" % (a,m)) + if not _actprint.has_key((a,m)): + _vf.write(" ! %-15s [ %s ]\n" % (a,m)) + _actprint[(a,m)] = 1 # Construct the goto table for this state + if yaccdebug: + _vf.write("\n") nkeys = { } for ii in I: for s in ii.usyms: if Nonterminals.has_key(s): nkeys[s] = None - - # Construct the goto table for this state for n in nkeys.keys(): g = lr0_goto(I,n) - j = cid_hash.get(id(g),-1) + j = _lr0_cidhash.get(id(g),-1) if j >= 0: goto[st,n] = j if yaccdebug: _vf.write(" %-30s shift and go to state %d\n" % (n,j)) st += 1 + if yaccdebug: if n_srconflict == 1: sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict) if n_srconflict > 1: - sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict) + sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict) if n_rrconflict == 1: sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict) if n_rrconflict > 1: sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict) - # ----------------------------------------------------------------------------- # ==== LR Utility functions ==== # ----------------------------------------------------------------------------- @@ -2145,6 +1903,7 @@ del _lr_goto_items else: f.write(" None,\n") f.write("]\n") + f.close() except IOError,e: @@ -2193,9 +1952,6 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, files = { } error = 0 - # Add starting symbol to signature - if start: - Signature.update(start) # Add parsing method to signature Signature.update(method) @@ -2227,6 +1983,12 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, f = f.f_back # Walk out to our calling function ldict = f.f_globals # Grab its globals dictionary + # Add starting symbol to signature + if not start: + start = ldict.get("start",None) + if start: + Signature.update(start) + # If running in optimized mode. 
We're going to if (optimize and lr_read_tables(tabmodule,1)): @@ -2374,10 +2136,8 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, compute_first1() compute_follow(start) - if method == 'SLR': - slr_parse_table() - elif method == 'LALR': - lalr_parse_table() + if method in ['SLR','LALR']: + lr_parse_table(method) else: raise YaccError, "Unknown parsing method '%s'" % method @@ -2408,6 +2168,9 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, global parse parse = p.parse + global parser + parser = p + # Clean up all of the globals we created if (not optimize): yacc_cleanup() @@ -2423,12 +2186,10 @@ def yacc_cleanup(): global Productions, Prodnames, Prodmap, Terminals global Nonterminals, First, Follow, Precedence, LRitems global Errorfunc, Signature, Requires - global Prodempty, TReductions, NTReductions, GotoSetNum, Canonical del Productions, Prodnames, Prodmap, Terminals del Nonterminals, First, Follow, Precedence, LRitems del Errorfunc, Signature, Requires - del Prodempty, TReductions, NTReductions, GotoSetNum, Canonical global _vf, _vfc del _vf, _vfc |
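On the yacc.py side, the visible interface shifts are that the default table method is now LALR (built with the DeRemer–Pennello lookahead algorithm in place of the old Ioup implementation), the end-of-input marker is spelled `$end` rather than `$`, quoted literal characters may appear directly in grammar rules, and the built parser is also published as the module-level `parser`. The following is a self-contained sketch under those assumptions; the grammar itself is invented for illustration.

```python
# Hypothetical grammar sketch against the updated tools/yacc.py; the
# NUMBER token and the addition rules are illustrative only.
import lex, yacc

tokens = ('NUMBER',)
literals = ['+']
t_ignore = ' '

def t_NUMBER(t):
    r'\d+'
    t.value = int(t.value)
    return t

def t_error(t):
    print "Illegal character '%s'" % t.value[0]
    t.skip(1)

lex.lex()                 # 2.1 also stores this lexer in the module-level lex.lexer

def p_expr_plus(p):
    "expr : expr '+' NUMBER"      # '+' is a literal character token, new in 2.1
    p[0] = p[1] + p[3]

def p_expr_number(p):
    "expr : NUMBER"
    p[0] = p[1]

def p_error(t):
    print "Syntax error at '%s'" % getattr(t, 'value', 'EOF')

parser = yacc.yacc()              # method defaults to 'LALR' after this commit
print parser.parse("1 + 2 + 3")   # prints 6
```

Not shown here is the negative-index support added to `YaccProduction`: `p[-1]` now reads the value stack below the current rule, a convenience for embedded actions.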