author Tomas Kukosa <tomas.kukosa@siemens.com> 2006-10-09 06:24:03 +0000
committer Tomas Kukosa <tomas.kukosa@siemens.com> 2006-10-09 06:24:03 +0000
commit 410830c4e3e47fb506c2d792ab10f9ddd38e747d (patch)
tree 2f401b4d2d304d85bf6ed7dd8a28df2fa5a0ed7e
parent cf201496162307772c31c19bba651bcbbf6e5be4 (diff)
Ply updated to version 2.1
svn path=/trunk/; revision=19459
-rwxr-xr-x  tools/lex.py    398
-rwxr-xr-x  tools/yacc.py  1057
2 files changed, 653 insertions, 802 deletions
diff --git a/tools/lex.py b/tools/lex.py
index 15af21357b..beaace02df 100755
--- a/tools/lex.py
+++ b/tools/lex.py
@@ -5,8 +5,6 @@
#
# Copyright (C) 2001-2006, David M. Beazley
#
-# $Header: /cvs/projects/PLY/lex.py,v 1.1.1.1 2004/05/21 15:34:10 beazley Exp $
-#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
@@ -41,20 +39,6 @@
# - The interface is geared towards LALR(1) and LR(1) parser
# generators. That is, tokens are generated one at a time
# rather than being generated in advance all in one step.
-#
-# There are a few limitations of this module
-#
-# - The module interface makes it somewhat awkward to support more
-# than one lexer at a time. Although somewhat inelegant from a
-# design perspective, this is rarely a practical concern for
-# most compiler projects.
-#
-# - The lexer requires that the entire input text be read into
-# a string before scanning. I suppose that most machines have
-# enough memory to make this a minor issues, but it makes
-# the lexer somewhat difficult to use in interactive sessions
-# or with streaming data.
-#
#-----------------------------------------------------------------------------
r"""
@@ -186,18 +170,19 @@ scanner you have defined.
# -----------------------------------------------------------------------------
-__version__ = "1.8"
+__version__ = "2.1"
import re, types, sys, copy
# Available instance types. This is used when lexers are defined by a class.
-# it's a little funky because I want to preserve backwards compatibility
+# It's a little funky because I want to preserve backwards compatibility
# with Python 2.0 where types.ObjectType is undefined.
try:
_INSTANCETYPE = (types.InstanceType, types.ObjectType)
except AttributeError:
_INSTANCETYPE = types.InstanceType
+ class object: pass # Note: needed if no new-style classes present
# Exception thrown when invalid token encountered and no default
class LexError(Exception):
@@ -206,9 +191,9 @@ class LexError(Exception):
self.text = s
# Token class
-class LexToken:
+class LexToken(object):
def __str__(self):
- return "LexToken(%s,%r,%d)" % (self.type,self.value,self.lineno)
+ return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos)
def __repr__(self):
return str(self)
def skip(self,n):
@@ -220,42 +205,66 @@ class LexToken:
# -----------------------------------------------------------------------------
# Lexer class
#
+# This class encapsulates all of the methods and data associated with a lexer.
+#
# input() - Store a new string in the lexer
# token() - Get the next token
# -----------------------------------------------------------------------------
class Lexer:
def __init__(self):
- self.lexre = None # Master regular expression
- self.lexreflags = 0 # Option re compile flags
- self.lexdata = None # Actual input data (as a string)
- self.lexpos = 0 # Current position in input text
- self.lexlen = 0 # Length of the input text
- self.lexindexfunc = [ ] # Reverse mapping of groups to functions and types
- self.lexerrorf = None # Error rule (if any)
- self.lextokens = None # List of valid tokens
- self.lexignore = None # Ignored characters
- self.lineno = 1 # Current line number
- self.debug = 0 # Debugging mode
- self.optimize = 0 # Optimized mode
- self.token = self.errtoken
-
- def __copy__(self):
+ self.lexre = None # Master regular expression. This is a list of
+ # tuples (re,findex) where re is a compiled
+ # regular expression and findex is a list
+ # mapping regex group numbers to rules
+
+ self.lexreflags = 0 # Optional re compile flags
+ self.lexdata = None # Actual input data (as a string)
+ self.lexpos = 0 # Current position in input text
+ self.lexlen = 0 # Length of the input text
+ self.lexerrorf = None # Error rule (if any)
+ self.lextokens = None # List of valid tokens
+ self.lexignore = None # Ignored characters
+ self.lexliterals = "" # Literal characters that can be passed through
+ self.lexmodule = None # Module
+ self.lineno = 1 # Current line number
+ self.lexdebug = 0 # Debugging mode
+ self.lexoptimize = 0 # Optimized mode
+
+ def clone(self,object=None):
c = Lexer()
c.lexre = self.lexre
- c.lexreflags = self.lexreflags
+ c.lexreflags = self.lexreflags
c.lexdata = self.lexdata
c.lexpos = self.lexpos
c.lexlen = self.lexlen
- c.lexindexfunc = self.lexindexfunc
c.lexerrorf = self.lexerrorf
c.lextokens = self.lextokens
c.lexignore = self.lexignore
- c.debug = self.debug
+ c.lexdebug = self.lexdebug
c.lineno = self.lineno
- c.optimize = self.optimize
- c.token = c.realtoken
- return c
+ c.lexoptimize = self.lexoptimize
+ c.lexliterals = self.lexliterals
+ c.lexmodule = self.lexmodule
+
+ # If the object parameter has been supplied, it means we are attaching the
+ # lexer to a new object. In this case, we have to rebind the methods
+
+ if object:
+ newre = []
+ for cre, findex in c.lexre:
+ # Loop over findex and adjust methods
+ newfindex = []
+ for f in findex:
+ if not f or not f[0]:
+ newfindex.append(f)
+ continue
+ newfindex.append((getattr(object,f[0].__name__),f[1]))
+ newre.append((cre,newfindex))
+ c.lexre = newre
+ c.lexmodule = object
+
+ return c
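(Illustration only, not part of the patch: clone() produces an independent lexer that shares the compiled rule tables, which is useful for scanning a second string without disturbing the main lexer. The variable names below are hypothetical.)

    # Hypothetical sketch, assuming 'lexer' was returned by lex.lex():
    sublexer = lexer.clone()          # shares lexre/lextokens, copies position state
    sublexer.input("some other text")
    tok = sublexer.token()            # does not advance the original lexer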
# ------------------------------------------------------------
# input() - Push a new string into the lexer
@@ -266,33 +275,21 @@ class Lexer:
self.lexdata = s
self.lexpos = 0
self.lexlen = len(s)
- self.token = self.realtoken
-
- # Change the token routine to point to realtoken()
- global token
- if token == self.errtoken:
- token = self.token
# ------------------------------------------------------------
- # errtoken() - Return error if token is called with no data
- # ------------------------------------------------------------
- def errtoken(self):
- raise RuntimeError, "No input string given with input()"
-
- # ------------------------------------------------------------
# token() - Return the next token from the Lexer
#
# Note: This function has been carefully implemented to be as fast
# as possible. Don't make changes unless you really know what
# you are doing
# ------------------------------------------------------------
- def realtoken(self):
+ def token(self):
# Make local copies of frequently referenced attributes
lexpos = self.lexpos
lexlen = self.lexlen
lexignore = self.lexignore
lexdata = self.lexdata
-
+
while lexpos < lexlen:
# This code provides some short-circuit code for whitespace, tabs, and other ignored characters
if lexdata[lexpos] in lexignore:
@@ -300,61 +297,78 @@ class Lexer:
continue
# Look for a regular expression match
- m = self.lexre.match(lexdata,lexpos)
- if m:
- i = m.lastindex
- lexpos = m.end()
+ for lexre,lexindexfunc in self.lexre:
+ m = lexre.match(lexdata,lexpos)
+ if not m: continue
+
tok = LexToken()
tok.value = m.group()
tok.lineno = self.lineno
+ tok.lexpos = lexpos
tok.lexer = self
- func,tok.type = self.lexindexfunc[i]
+
+ lexpos = m.end()
+ i = m.lastindex
+ func,tok.type = lexindexfunc[i]
+ self.lexpos = lexpos
+
if not func:
- self.lexpos = lexpos
return tok
# If token is processed by a function, call it
- self.lexpos = lexpos
newtok = func(tok)
- self.lineno = tok.lineno # Update line number
# Every function must return a token, if nothing, we just move to next token
- if not newtok: continue
+ if not newtok:
+ lexpos = self.lexpos # This is here in case user has updated lexpos.
+ break
# Verify type of the token. If not in the token map, raise an error
- if not self.optimize:
+ if not self.lexoptimize:
if not self.lextokens.has_key(newtok.type):
raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
func.func_code.co_filename, func.func_code.co_firstlineno,
func.__name__, newtok.type),lexdata[lexpos:])
return newtok
-
- # No match. Call t_error() if defined.
- if self.lexerrorf:
- tok = LexToken()
- tok.value = self.lexdata[lexpos:]
- tok.lineno = self.lineno
- tok.type = "error"
- tok.lexer = self
- oldpos = lexpos
- newtok = self.lexerrorf(tok)
- lexpos += getattr(tok,"_skipn",0)
- if oldpos == lexpos:
- # Error method didn't change text position at all. This is an error.
+ else:
+ # No match, see if in literals
+ if lexdata[lexpos] in self.lexliterals:
+ tok = LexToken()
+ tok.value = lexdata[lexpos]
+ tok.lineno = self.lineno
+ tok.lexer = self
+ tok.type = tok.value
+ tok.lexpos = lexpos
+ self.lexpos = lexpos + 1
+ return tok
+
+ # No match. Call t_error() if defined.
+ if self.lexerrorf:
+ tok = LexToken()
+ tok.value = self.lexdata[lexpos:]
+ tok.lineno = self.lineno
+ tok.type = "error"
+ tok.lexer = self
+ tok.lexpos = lexpos
+ oldpos = lexpos
+ newtok = self.lexerrorf(tok)
+ lexpos += getattr(tok,"_skipn",0)
+ if oldpos == lexpos:
+ # Error method didn't change text position at all. This is an error.
+ self.lexpos = lexpos
+ raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
+ if not newtok: continue
self.lexpos = lexpos
- raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
- if not newtok: continue
- self.lexpos = lexpos
- return newtok
+ return newtok
- self.lexpos = lexpos
- raise LexError, ("No match found", lexdata[lexpos:])
+ self.lexpos = lexpos
+ raise LexError, ("No match found", lexdata[lexpos:])
- # No more input data
self.lexpos = lexpos + 1
+ if self.lexdata is None:
+ raise RuntimeError, "No input string given with input()"
return None
-
# -----------------------------------------------------------------------------
# validate_file()
@@ -404,34 +418,74 @@ def validate_file(filename):
def _read_lextab(lexer, fdict, module):
exec "import %s as lextab" % module
- lexer.lexre = re.compile(lextab._lexre, re.VERBOSE | lextab._lexreflags)
+ lexre = []
+ for regex,ltab in lextab._lexre:
+ ftab = []
+ for t in ltab:
+ if t and t[0]:
+ ftab.append((fdict[t[0]], t[1]))
+ else:
+ ftab.append(t)
+ lexre.append((re.compile(regex, re.VERBOSE | lextab._lexreflags), ftab))
+ lexer.lexre = lexre
lexer.lexreflags = lextab._lexreflags
- lexer.lexindexfunc = lextab._lextab
- for i in range(len(lextab._lextab)):
- t = lexer.lexindexfunc[i]
- if t:
- if t[0]:
- lexer.lexindexfunc[i] = (fdict[t[0]],t[1])
lexer.lextokens = lextab._lextokens
lexer.lexignore = lextab._lexignore
if lextab._lexerrorf:
lexer.lexerrorf = fdict[lextab._lexerrorf]
-
+ lexer.lexliterals = lextab._lexliterals
+
+# -----------------------------------------------------------------------------
+# _form_master_re()
+#
+# This function takes a list of all of the regex components and attempts to
+# form the master regular expression. Given limitations in the Python re
+# module, it may be necessary to break the master regex into separate expressions.
+# -----------------------------------------------------------------------------
+
+def _form_master_re(relist,reflags,ldict):
+ if not relist: return []
+ regex = "|".join(relist)
+ try:
+ lexre = re.compile(regex,re.VERBOSE | reflags)
+
+ # Build the index to function map for the matching engine
+ lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1)
+ for f,i in lexre.groupindex.items():
+ handle = ldict.get(f,None)
+ if type(handle) in (types.FunctionType, types.MethodType):
+ lexindexfunc[i] = (handle,handle.__name__[2:])
+ elif handle is not None:
+ # If rule was specified as a string, we build an anonymous
+ # callback function to carry out the action
+ lexindexfunc[i] = (None,f[2:])
+
+ return [(lexre,lexindexfunc)],[regex]
+ except Exception,e:
+ m = int(len(relist)/2)
+ if m == 0: m = 1
+ llist, lre = _form_master_re(relist[:m],reflags,ldict)
+ rlist, rre = _form_master_re(relist[m:],reflags,ldict)
+ return llist+rlist, lre+rre
+
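(Illustration only: the usual reason the master pattern must be split is that older versions of Python's re module cap the number of groups allowed in a single compiled pattern, so a grammar with enough rules makes the combined regex fail to compile and it is recursively halved. A hypothetical call might look like this; the rule names are made up.)

    # Hypothetical sketch of combining three string rules into master patterns.
    ldict  = { "t_PLUS": r"\+", "t_MINUS": r"-", "t_NUMBER": r"\d+" }
    relist = [ "(?P<%s>%s)" % (name, pat) for name, pat in ldict.items() ]
    masters, patterns = _form_master_re(relist, 0, ldict)
    # masters is a list of (compiled regex, group-index table) pairs and
    # patterns the corresponding source strings; normally just one of each.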
# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
-def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0):
+def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0):
+ global lexer
ldict = None
- regex = ""
+ regex_list = []
error = 0
files = { }
- lexer = Lexer()
- lexer.debug = debug
- lexer.optimize = optimize
+ lexobj = Lexer()
+ lexobj.lexdebug = debug
+ lexobj.lexoptimize = optimize
global token,input
+ if object: module = object
+
if module:
# User supplied a module object.
if isinstance(module, types.ModuleType):
@@ -443,6 +497,7 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0):
ldict[i] = v
else:
raise ValueError,"Expected a module or instance"
+ lexobj.lexmodule = module
else:
# No module given. We might be able to get information from the caller.
@@ -456,11 +511,12 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0):
if optimize and lextab:
try:
- _read_lextab(lexer,ldict, lextab)
- if not lexer.lexignore: lexer.lexignore = ""
- token = lexer.token
- input = lexer.input
- return lexer
+ _read_lextab(lexobj,ldict, lextab)
+ if not lexobj.lexignore: lexobj.lexignore = ""
+ token = lexobj.token
+ input = lexobj.input
+ lexer = lexobj
+ return lexobj
except ImportError:
pass
@@ -480,7 +536,7 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0):
raise SyntaxError,"lex: tokens must be a list or tuple."
# Build a dictionary of valid token names
- lexer.lextokens = { }
+ lexobj.lextokens = { }
if not optimize:
# Utility function for verifying tokens
@@ -493,15 +549,26 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0):
if not is_identifier(n):
print "lex: Bad token name '%s'" % n
error = 1
- if lexer.lextokens.has_key(n):
+ if lexobj.lextokens.has_key(n):
print "lex: Warning. Token '%s' multiply defined." % n
- lexer.lextokens[n] = None
+ lexobj.lextokens[n] = None
else:
- for n in tokens: lexer.lextokens[n] = None
+ for n in tokens: lexobj.lextokens[n] = None
if debug:
- print "lex: tokens = '%s'" % lexer.lextokens.keys()
+ print "lex: tokens = '%s'" % lexobj.lextokens.keys()
+
+ # Get literals
+ if (module and isinstance(module,_INSTANCETYPE)):
+ literals = getattr(module,"literals","")
+ else:
+ try:
+ literals = ldict["literals"]
+ except KeyError:
+ literals = ""
+
+ lexobj.lexliterals = literals
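(Illustration only, hypothetical token module, not from the patch: the new literals support lets single characters be declared once and returned as tokens whose type is the character itself whenever no t_ rule matches.)

    # Hypothetical token module fragment using the new 'literals' feature.
    tokens   = ('NUMBER',)
    literals = "+-*/()"          # each character becomes its own token type
    t_NUMBER = r'\d+'
    t_ignore = ' \t'
    def t_error(t):
        t.skip(1)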
# Get a list of symbols with the t_ prefix
tsymbols = [f for f in ldict.keys() if f[:2] == 't_']
@@ -559,15 +626,21 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0):
continue
if f.__name__ == 't_error':
- lexer.lexerrorf = f
+ lexobj.lexerrorf = f
continue
if f.__doc__:
if not optimize:
try:
- c = re.compile(f.__doc__, re.VERBOSE | reflags)
+ c = re.compile("(?P<%s>%s)" % (f.__name__,f.__doc__), re.VERBOSE | reflags)
+ if c.match(""):
+ print "%s:%d: Regular expression for rule '%s' matches empty string." % (file,line,f.__name__)
+ error = 1
+ continue
except re.error,e:
print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e)
+ if '#' in f.__doc__:
+ print "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'." % (file,line, f.__name__)
error = 1
continue
@@ -577,8 +650,7 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0):
# Okay. The regular expression seemed okay. Let's append it to the master regular
# expression we're building
- if (regex): regex += "|"
- regex += "(?P<%s>%s)" % (f.__name__,f.__doc__)
+ regex_list.append("(?P<%s>%s)" % (f.__name__,f.__doc__))
else:
print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__)
@@ -586,7 +658,7 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0):
for name,r in ssymbols:
if name == 't_ignore':
- lexer.lexignore = r
+ lexobj.lexignore = r
continue
if not optimize:
@@ -595,41 +667,38 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0):
error = 1
continue
- if not lexer.lextokens.has_key(name[2:]):
+ if not lexobj.lextokens.has_key(name[2:]):
print "lex: Rule '%s' defined for an unspecified token %s." % (name,name[2:])
error = 1
continue
try:
- c = re.compile(r,re.VERBOSE)
+ c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | reflags)
+ if (c.match("")):
+ print "lex: Regular expression for rule '%s' matches empty string." % name
+ error = 1
+ continue
except re.error,e:
print "lex: Invalid regular expression for rule '%s'. %s" % (name,e)
+ if '#' in r:
+ print "lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name
+
error = 1
continue
if debug:
print "lex: Adding rule %s -> '%s'" % (name,r)
- if regex: regex += "|"
- regex += "(?P<%s>%s)" % (name,r)
+ regex_list.append("(?P<%s>%s)" % (name,r))
if not optimize:
for f in files.keys():
if not validate_file(f):
error = 1
try:
- if debug:
- print "lex: regex = '%s'" % regex
- lexer.lexre = re.compile(regex, re.VERBOSE | reflags)
- # Build the index to function map for the matching engine
- lexer.lexindexfunc = [ None ] * (max(lexer.lexre.groupindex.values())+1)
- for f,i in lexer.lexre.groupindex.items():
- handle = ldict[f]
- if type(handle) in (types.FunctionType, types.MethodType):
- lexer.lexindexfunc[i] = (handle,handle.__name__[2:])
- else:
- # If rule was specified as a string, we build an anonymous
- # callback function to carry out the action
- lexer.lexindexfunc[i] = (None,f[2:])
+ lexobj.lexre, re_groups = _form_master_re(regex_list,reflags,ldict)
+ if debug:
+ for i in range(len(re_groups)):
+ print "lex: regex[%d] = '%s'" % (i, re_groups[i])
# If a lextab was specified, we create a file containing the precomputed
# regular expression and index table
@@ -637,26 +706,32 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0):
if lextab and optimize:
lt = open(lextab+".py","w")
lt.write("# %s.py. This file automatically created by PLY. Don't edit.\n" % lextab)
- lt.write("_lexre = %s\n" % repr(regex))
+
+ tabre = []
+ for i in range(len(re_groups)):
+ indexf = []
+ for t in lexobj.lexre[i][1]:
+ if t:
+ if t[0]:
+ indexf.append((t[0].__name__,t[1]))
+ else:
+ indexf.append((None,t[1]))
+ else:
+ indexf.append(None)
+ tabre.append((re_groups[i],indexf))
+
+ lt.write("_lexre = %s\n" % repr(tabre))
+
+ # Create an alternative lexre representation
+
lt.write("_lexreflags = %d\n" % reflags)
- lt.write("_lextab = [\n");
- for i in range(0,len(lexer.lexindexfunc)):
- t = lexer.lexindexfunc[i]
- if t:
- if t[0]:
- lt.write(" ('%s',%s),\n"% (t[0].__name__, repr(t[1])))
- else:
- lt.write(" (None,%s),\n" % repr(t[1]))
- else:
- lt.write(" None,\n")
-
- lt.write("]\n");
- lt.write("_lextokens = %s\n" % repr(lexer.lextokens))
- lt.write("_lexignore = %s\n" % repr(lexer.lexignore))
- if (lexer.lexerrorf):
- lt.write("_lexerrorf = %s\n" % repr(lexer.lexerrorf.__name__))
+ lt.write("_lextokens = %s\n" % repr(lexobj.lextokens))
+ lt.write("_lexignore = %s\n" % repr(lexobj.lexignore))
+ if (lexobj.lexerrorf):
+ lt.write("_lexerrorf = %s\n" % repr(lexobj.lexerrorf.__name__))
else:
lt.write("_lexerrorf = None\n")
+ lt.write("_lexliterals = %s\n" % repr(lexobj.lexliterals))
lt.close()
except re.error,e:
@@ -664,16 +739,17 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab",reflags=0):
error = 1
if error:
raise SyntaxError,"lex: Unable to build lexer."
- if not lexer.lexerrorf:
+ if not lexobj.lexerrorf:
print "lex: Warning. no t_error rule is defined."
- if not lexer.lexignore: lexer.lexignore = ""
+ if not lexobj.lexignore: lexobj.lexignore = ""
# Create global versions of the token() and input() functions
- token = lexer.token
- input = lexer.input
+ token = lexobj.token
+ input = lexobj.input
+ lexer = lexobj
- return lexer
+ return lexobj
# -----------------------------------------------------------------------------
# run()
@@ -705,8 +781,22 @@ def runmain(lexer=None,data=None):
while 1:
tok = _token()
if not tok: break
- print "(%s,%r,%d)" % (tok.type, tok.value, tok.lineno)
+ print "(%s,%r,%d,%d)" % (tok.type, tok.value, tok.lineno,tok.lexpos)
+
+# -----------------------------------------------------------------------------
+# @TOKEN(regex)
+#
+# This decorator function can be used to set the regular expression of a rule
+# function when its docstring needs to be constructed in some other way
+# -----------------------------------------------------------------------------
+
+def TOKEN(r):
+ def set_doc(f):
+ f.__doc__ = r
+ return f
+ return set_doc
+
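(Illustration only: with the decorator above, a rule's pattern can be computed at runtime instead of being written out as a docstring literal. The names digit and t_NUMBER below are hypothetical.)

    digit = r'[0-9]'

    @TOKEN(digit + r'+')
    def t_NUMBER(t):
        # TOKEN stores the pattern in t_NUMBER.__doc__, which is where
        # lex() looks for the rule's regular expression.
        t.value = int(t.value)
        return t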
diff --git a/tools/yacc.py b/tools/yacc.py
index cc5840c139..bd63d8d949 100755
--- a/tools/yacc.py
+++ b/tools/yacc.py
@@ -5,8 +5,6 @@
#
# Copyright (C) 2001-2006, David M. Beazley
#
-# $Header: /cvs/projects/PLY/yacc.py,v 1.6 2004/05/26 20:51:34 beazley Exp $
-#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
@@ -25,8 +23,10 @@
#
#
# This implements an LR parser that is constructed from grammar rules defined
-# as Python functions. Roughly speaking, this module is a cross between
-# John Aycock's Spark system and the GNU bison utility.
+# as Python functions. The grammar is specified by supplying the BNF inside
+# Python documentation strings. The inspiration for this technique was borrowed
+# from John Aycock's Spark parsing system. PLY might be viewed as a cross between
+# Spark and the GNU bison utility.
#
# The current implementation is only somewhat object-oriented. The
# LR parser itself is defined in terms of an object (which allows multiple
@@ -36,8 +36,10 @@
# time using threads (in which case they should have their head examined).
#
# This implementation supports both SLR and LALR(1) parsing. LALR(1)
-# support was implemented by Elias Ioup (ezioup@alumni.uchicago.edu)
-# and hacked abit by Dave to run faster.
+# support was originally implemented by Elias Ioup (ezioup@alumni.uchicago.edu),
+# using the algorithm found in Aho, Sethi, and Ullman "Compilers: Principles,
+# Techniques, and Tools" (The Dragon Book). LALR(1) has since been replaced
+# by the more efficient DeRemer and Pennello algorithm.
#
# :::::::: WARNING :::::::
#
@@ -48,7 +50,7 @@
# own risk!
# ----------------------------------------------------------------------------
-__version__ = "1.8"
+__version__ = "2.1"
import types
@@ -63,7 +65,7 @@ yaccdebug = 1 # Debugging mode. If set, yacc generates a
debug_file = 'parser.out' # Default name of the debugging file
tab_module = 'parsetab' # Default name of the table module
-default_lr = 'SLR' # Default LR table generation method
+default_lr = 'LALR' # Default LR table generation method
error_count = 3 # Number of symbols that must be shifted to leave recovery mode
@@ -100,13 +102,15 @@ class YaccSymbol:
# for a symbol.
class YaccProduction:
- def __init__(self,s):
+ def __init__(self,s,stack=None):
self.slice = s
self.pbstack = []
+ self.stack = stack
def __getitem__(self,n):
if type(n) == types.IntType:
- return self.slice[n].value
+ if n >= 0: return self.slice[n].value
+ else: return self.stack[n].value
else:
return [s.value for s in self.slice[n.start:n.stop:n.step]]
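(Illustration only: the stack reference added above is what makes negative indices work; p[-1] is the value of the symbol sitting just below the current rule's right-hand side on the parser stack. The rule names are hypothetical.)

    def p_seen_A(p):
        'seen_A :'
        # Empty marker rule embedded in e.g. "statement : A seen_A B";
        # when it reduces, p[-1] is the value of the A already on the stack.
        print "saw A with value", p[-1]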
@@ -161,7 +165,7 @@ class Parser:
del self.statestack[:]
del self.symstack[:]
sym = YaccSymbol()
- sym.type = '$'
+ sym.type = '$end'
self.symstack.append(sym)
self.statestack.append(0)
@@ -177,7 +181,8 @@ class Parser:
# If no lexer was given, we will try to use the lex module
if not lexer:
- import lex as lexer
+ import lex
+ lexer = lex.lexer
pslice.lexer = lexer
@@ -193,12 +198,13 @@ class Parser:
symstack = [ ] # Stack of grammar symbols
self.symstack = symstack
+ pslice.stack = symstack # Put in the production
errtoken = None # Err token
- # The start state is assumed to be (0,$)
+ # The start state is assumed to be (0,$end)
statestack.append(0)
sym = YaccSymbol()
- sym.type = '$'
+ sym.type = '$end'
symstack.append(sym)
while 1:
@@ -214,7 +220,7 @@ class Parser:
lookahead = lookaheadstack.pop()
if not lookahead:
lookahead = YaccSymbol()
- lookahead.type = '$'
+ lookahead.type = '$end'
if debug:
errorlead = ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()
@@ -225,14 +231,10 @@ class Parser:
if debug > 1:
print 'action', t
- if t is None:
- t = actions.get((s,'#'),None)
- if debug > 1:
- print 'default action', t
if t is not None:
if t > 0:
# shift a symbol on the stack
- if ltype == '$':
+ if ltype == '$end':
# Error, end of input
sys.stderr.write("yacc: Parse error. EOF\n")
return
@@ -311,7 +313,7 @@ class Parser:
if not self.errorcount:
self.errorcount = error_count
errtoken = lookahead
- if errtoken.type == '$':
+ if errtoken.type == '$end':
errtoken = None # End of file!
if self.errorfunc:
global errok,token,restart
@@ -347,7 +349,7 @@ class Parser:
# entire parse has been rolled back and we're completely hosed. The token is
# discarded and we just keep going.
- if len(statestack) <= 1 and lookahead.type != '$':
+ if len(statestack) <= 1 and lookahead.type != '$end':
lookahead = None
errtoken = None
# Nuke the pushback stack
@@ -358,7 +360,7 @@ class Parser:
# at the end of the file. nuke the top entry and generate an error token
# Start nuking entries on the stack
- if lookahead.type == '$':
+ if lookahead.type == '$end':
# Whoa. We're really hosed here. Bail out
return
@@ -465,9 +467,6 @@ def initialize_vars():
global Nonterminals, First, Follow, Precedence, LRitems
global Errorfunc, Signature, Requires
- # LALR(1) globals
- global Prodempty, TReductions, NTReductions, GotoSetNum, Canonical
-
Productions = [None] # A list of all of the productions. The first
# entry is always reserved for the purpose of
# building an augmented grammar
@@ -502,22 +501,6 @@ def initialize_vars():
Requires = { } # Requires list
- # LALR(1) Initialization
- Prodempty = { } # A dictionary of all productions that have an empty rule
- # of the form P : <empty>
-
- TReductions = { } # A dictionary of precomputer reductions from
- # nonterminals to terminals
-
- NTReductions = { } # A dictionary of precomputed reductions from
- # nonterminals to nonterminals
-
- GotoSetNum = { } # A dictionary that remembers goto sets based on
- # the state number and symbol
-
- Canonical = { } # A list of LR item sets. A LR item set is a list of LR
- # items that represent the state of the parser
-
# File objects used when creating the parser.out debugging file
global _vf, _vfc
_vf = cStringIO.StringIO()
@@ -635,7 +618,20 @@ def add_production(f,file,line,prodname,syms):
sys.stderr.write("%s:%d: Illegal rule name '%s'\n" % (file,line,prodname))
return -1
- for s in syms:
+ for x in range(len(syms)):
+ s = syms[x]
+ if s[0] in "'\"":
+ try:
+ c = eval(s)
+ if (len(c) > 1):
+ sys.stderr.write("%s:%d: Literal token %s in rule '%s' may only be a single character\n" % (file,line,s, prodname))
+ return -1
+ if not Terminals.has_key(c):
+ Terminals[c] = []
+ syms[x] = c
+ continue
+ except SyntaxError:
+ pass
if not is_identifier(s) and s != '%prec':
sys.stderr.write("%s:%d: Illegal name '%s' in rule '%s'\n" % (file,line,s, prodname))
return -1
@@ -766,8 +762,12 @@ def add_function(f):
if assign != ':' and assign != '::=':
sys.stderr.write("%s:%d: Syntax error. Expected ':'\n" % (file,dline))
return -1
+
+
e = add_production(f,file,dline,prodname,syms)
error += e
+
+
except StandardError:
sys.stderr.write("%s:%d: Syntax error in rule '%s'\n" % (file,dline,ps))
error -= 1
@@ -823,7 +823,7 @@ def compute_terminates():
for t in Terminals.keys():
Terminates[t] = 1
- Terminates['$'] = 1
+ Terminates['$end'] = 1
# Nonterminals:
@@ -1064,14 +1064,14 @@ def first(beta):
# that might follow it. Dragon book, p. 189.
def compute_follow(start=None):
- # Add '$' to the follow list of the start symbol
+ # Add '$end' to the follow list of the start symbol
for k in Nonterminals.keys():
Follow[k] = [ ]
if not start:
start = Productions[1].name
- Follow[start] = [ '$' ]
+ Follow[start] = [ '$end' ]
while 1:
didadd = 0
@@ -1113,7 +1113,7 @@ def compute_first1():
for t in Terminals.keys():
First[t] = [t]
- First['$'] = ['$']
+ First['$end'] = ['$end']
First['#'] = ['#'] # what's this for?
# Nonterminals:
@@ -1215,55 +1215,16 @@ def lr0_goto(I,x):
s[id(n)] = s1
gs.append(n)
s = s1
- g = s.get('$',None)
+ g = s.get('$end',None)
if not g:
if gs:
g = lr0_closure(gs)
- s['$'] = g
+ s['$end'] = g
else:
- s['$'] = gs
+ s['$end'] = gs
_lr_goto_cache[(id(I),x)] = g
return g
-# Added for LALR(1)
-
-# Given a setnumber of an lr0 state and a symbol return the setnumber of the goto state
-def lr0_goto_setnumber(I_setnumber, x):
- global Canonical
- global GotoSetNum
-
- if GotoSetNum.has_key((I_setnumber, x)):
- setnumber = GotoSetNum[(I_setnumber, x)]
- else:
- gset = lr0_goto(Canonical[I_setnumber], x)
- if not gset:
- return -1
- else:
- gsetlen = len(gset)
- for i in xrange(len(gset[0].setnumbers)):
- inall = 1
- for item in gset:
- if not item.setnumbers[i]:
- inall = 0
- break
- if inall and len(Canonical[i]) == gsetlen:
- setnumber = i
- break # Note: DB. I added this to improve performance.
- # Not sure if this breaks the algorithm (it doesn't appear to).
-
- GotoSetNum[(I_setnumber, x)] = setnumber
-
- return setnumber
-
-# Compute the kernel of a set of LR(0) items
-def lr0_kernel(I):
- KI = [ ]
- for p in I:
- if p.name == "S'" or p.lr_index > 0 or p.len == 0:
- KI.append(p)
-
- return KI
-
_lr0_cidhash = { }
# Compute the LR(0) sets of item function
@@ -1297,468 +1258,382 @@ def lr0_items():
return C
# -----------------------------------------------------------------------------
-# slr_parse_table()
+# ==== LALR(1) Parsing ====
+#
+# LALR(1) parsing is almost exactly the same as SLR except that instead of
+# relying upon Follow() sets when performing reductions, a more selective
+# lookahead set that incorporates the state of the LR(0) machine is utilized.
+# Thus, we mainly just have to focus on calculating the lookahead sets.
+#
+# The method used here is due to DeRemer and Pennello (1982).
+#
+# DeRemer, F. L., and T. J. Pennello: "Efficient Computation of LALR(1)
+# Lookahead Sets", ACM Transactions on Programming Languages and Systems,
+# Vol. 4, No. 4, Oct. 1982, pp. 615-649
#
-# This function constructs an SLR table.
+# Further details can also be found in:
+#
+# J. Tremblay and P. Sorenson, "The Theory and Practice of Compiler Writing",
+# McGraw-Hill Book Company, (1985).
+#
+# Note: This implementation is a complete replacement of the LALR(1)
+# implementation in PLY-1.x releases. That version was based on
+# a less efficient algorithm and it had bugs in its implementation.
# -----------------------------------------------------------------------------
-def slr_parse_table():
- global _lr_method
- goto = _lr_goto # Goto array
- action = _lr_action # Action array
- actionp = { } # Action production array (temporary)
-
- _lr_method = "SLR"
-
- n_srconflict = 0
- n_rrconflict = 0
- if yaccdebug:
- sys.stderr.write("yacc: Generating SLR parsing table...\n")
- _vf.write("\n\nParsing method: SLR\n\n")
-
- # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items
- # This determines the number of states
-
- C = lr0_items()
+# -----------------------------------------------------------------------------
+# compute_nullable_nonterminals()
+#
+# Creates a dictionary containing all of the non-terminals that might produce
+# an empty production.
+# -----------------------------------------------------------------------------
- # Build the parser table, state by state
- st = 0
- for I in C:
- # Loop over each production in I
- actlist = [ ] # List of actions
-
- if yaccdebug:
- _vf.write("\nstate %d\n\n" % st)
- for p in I:
- _vf.write(" (%d) %s\n" % (p.number, str(p)))
- _vf.write("\n")
+def compute_nullable_nonterminals():
+ nullable = {}
+ num_nullable = 0
+ while 1:
+ for p in Productions[1:]:
+ if p.len == 0:
+ nullable[p.name] = 1
+ continue
+ for t in p.prod:
+ if not nullable.has_key(t): break
+ else:
+ nullable[p.name] = 1
+ if len(nullable) == num_nullable: break
+ num_nullable = len(nullable)
+ return nullable
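(Illustration only, hypothetical grammar: the fixed point above marks a nonterminal nullable if it has an empty production or a production whose symbols are all already nullable.)

    # empty    :                     -> nullable (zero-length production)
    # opt_sign : '+'
    # opt_sign : empty               -> nullable (every RHS symbol is nullable)
    # number   : opt_sign DIGITS     -> not nullable (DIGITS is a terminal)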
- for p in I:
- try:
- if p.prod[-1] == ".":
- if p.name == "S'":
- # Start symbol. Accept!
- action[st,"$"] = 0
- actionp[st,"$"] = p
- else:
- # We are at the end of a production. Reduce!
- for a in Follow[p.name]:
- actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p)))
- r = action.get((st,a),None)
- if r is not None:
- # Whoa. Have a shift/reduce or reduce/reduce conflict
- if r > 0:
- # Need to decide on shift or reduce here
- # By default we favor shifting. Need to add
- # some precedence rules here.
- sprec,slevel = Productions[actionp[st,a].number].prec
- rprec,rlevel = Precedence.get(a,('right',0))
- if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')):
- # We really need to reduce here.
- action[st,a] = -p.number
- actionp[st,a] = p
- if not slevel and not rlevel:
- _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st)
- _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a)
- n_srconflict += 1
- elif (slevel == rlevel) and (rprec == 'nonassoc'):
- action[st,a] = None
- else:
- # Hmmm. Guess we'll keep the shift
- if not slevel and not rlevel:
- _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st)
- _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a)
- n_srconflict +=1
- elif r < 0:
- # Reduce/reduce conflict. In this case, we favor the rule
- # that was defined first in the grammar file
- oldp = Productions[-r]
- pp = Productions[p.number]
- if oldp.line > pp.line:
- action[st,a] = -p.number
- actionp[st,a] = p
- # sys.stderr.write("Reduce/reduce conflict in state %d\n" % st)
- n_rrconflict += 1
- _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, actionp[st,a].number, actionp[st,a]))
- _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,actionp[st,a].number, actionp[st,a]))
- else:
- sys.stderr.write("Unknown conflict in state %d\n" % st)
- else:
- action[st,a] = -p.number
- actionp[st,a] = p
- else:
- i = p.lr_index
- a = p.prod[i+1] # Get symbol right after the "."
- if Terminals.has_key(a):
- g = lr0_goto(I,a)
- j = _lr0_cidhash.get(id(g),-1)
- if j >= 0:
- # We are in a shift state
- actlist.append((a,p,"shift and go to state %d" % j))
- r = action.get((st,a),None)
- if r is not None:
- # Whoa have a shift/reduce or shift/shift conflict
- if r > 0:
- if r != j:
- sys.stderr.write("Shift/shift conflict in state %d\n" % st)
- elif r < 0:
- # Do a precedence check.
- # - if precedence of reduce rule is higher, we reduce.
- # - if precedence of reduce is same and left assoc, we reduce.
- # - otherwise we shift
- rprec,rlevel = Productions[actionp[st,a].number].prec
- sprec,slevel = Precedence.get(a,('right',0))
- if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')):
- # We decide to shift here... highest precedence to shift
- action[st,a] = j
- actionp[st,a] = p
- if not slevel and not rlevel:
- n_srconflict += 1
- _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st)
- _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a)
- elif (slevel == rlevel) and (rprec == 'nonassoc'):
- action[st,a] = None
- else:
- # Hmmm. Guess we'll keep the reduce
- if not slevel and not rlevel:
- n_srconflict +=1
- _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st)
- _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a)
-
- else:
- sys.stderr.write("Unknown conflict in state %d\n" % st)
- else:
- action[st,a] = j
- actionp[st,a] = p
-
- except StandardError,e:
- raise YaccError, "Hosed in slr_parse_table", e
+# -----------------------------------------------------------------------------
+# find_nonterminal_trans(C)
+#
+# Given a set of LR(0) items, this function finds all of the non-terminal
+# transitions. These are transitions in which a dot appears immediately before
+# a non-terminal. Returns a list of tuples of the form (state,N) where state
+# is the state number and N is the nonterminal symbol.
+#
+# The input C is the set of LR(0) items.
+# -----------------------------------------------------------------------------
- # Print the actions associated with each terminal
- if yaccdebug:
- _actprint = { }
- for a,p,m in actlist:
- if action.has_key((st,a)):
- if p is actionp[st,a]:
- _vf.write(" %-15s %s\n" % (a,m))
- _actprint[(a,m)] = 1
- _vf.write("\n")
- for a,p,m in actlist:
- if action.has_key((st,a)):
- if p is not actionp[st,a]:
- if not _actprint.has_key((a,m)):
- _vf.write(" ! %-15s [ %s ]\n" % (a,m))
- _actprint[(a,m)] = 1
-
- # Construct the goto table for this state
- if yaccdebug:
- _vf.write("\n")
- nkeys = { }
- for ii in I:
- for s in ii.usyms:
- if Nonterminals.has_key(s):
- nkeys[s] = None
- for n in nkeys.keys():
- g = lr0_goto(I,n)
- j = _lr0_cidhash.get(id(g),-1)
- if j >= 0:
- goto[st,n] = j
- if yaccdebug:
- _vf.write(" %-30s shift and go to state %d\n" % (n,j))
+def find_nonterminal_transitions(C):
+ trans = []
+ for state in range(len(C)):
+ for p in C[state]:
+ if p.lr_index < p.len - 1:
+ t = (state,p.prod[p.lr_index+1])
+ if Nonterminals.has_key(t[1]):
+ if t not in trans: trans.append(t)
+ state = state + 1
+ return trans
- st += 1
+# -----------------------------------------------------------------------------
+# dr_relation()
+#
+# Computes the DR(p,A) relationships for non-terminal transitions. The input
+# is a tuple (state,N) where state is a number and N is a nonterminal symbol.
+#
+# Returns a list of terminals.
+# -----------------------------------------------------------------------------
- if yaccdebug:
- if n_srconflict == 1:
- sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict)
- if n_srconflict > 1:
- sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict)
- if n_rrconflict == 1:
- sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict)
- if n_rrconflict > 1:
- sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict)
+def dr_relation(C,trans,nullable):
+ dr_set = { }
+ state,N = trans
+ terms = []
+ g = lr0_goto(C[state],N)
+ for p in g:
+ if p.lr_index < p.len - 1:
+ a = p.prod[p.lr_index+1]
+ if Terminals.has_key(a):
+ if a not in terms: terms.append(a)
+ # This extra bit is to handle the start state
+ if state == 0 and N == Productions[0].prod[0]:
+ terms.append('$end')
+
+ return terms
# -----------------------------------------------------------------------------
-# ==== LALR(1) Parsing ====
-# FINISHED! 5/20/2003 by Elias Ioup
+# reads_relation()
+#
+# Computes the READS() relation (p,A) READS (t,C).
# -----------------------------------------------------------------------------
+def reads_relation(C, trans, empty):
+ # Look for empty transitions
+ rel = []
+ state, N = trans
-# Compute the lr1_closure of a set I. I is a list of productions and setnumber
-# is the state that you want the lr items that are made from the to come from.
+ g = lr0_goto(C[state],N)
+ j = _lr0_cidhash.get(id(g),-1)
+ for p in g:
+ if p.lr_index < p.len - 1:
+ a = p.prod[p.lr_index + 1]
+ if empty.has_key(a):
+ rel.append((j,a))
-_lr1_add_count = 0
+ return rel
-def lr1_closure(I, setnumber = 0):
- global _add_count
- global Nonterminals
+# -----------------------------------------------------------------------------
+# compute_lookback_includes()
+#
+# Determines the lookback and includes relations
+#
+# LOOKBACK:
+#
+# This relation is determined by running the LR(0) state machine forward.
+# For example, starting with a production "N : . A B C", we run it forward
+# to obtain "N : A B C ." We then build a relationship between this final
+# state and the starting state. These relationships are stored in a dictionary
+# lookdict.
+#
+# INCLUDES:
+#
+# Computes the INCLUDE() relation (p,A) INCLUDES (p',B).
+#
+# This relation is used to determine non-terminal transitions that occur
+# inside of other non-terminal transition states. (p,A) INCLUDES (p', B)
+# if the following holds:
+#
+# B -> LAT, where T -> epsilon and p' -L-> p
+#
+# L is essentially a prefix (which may be empty), T is a suffix that must be
+# able to derive an empty string. State p' must lead to state p with the string L.
+#
+# -----------------------------------------------------------------------------
- _add_count += 1
- prodlist = Productions
+def compute_lookback_includes(C,trans,nullable):
+
+ lookdict = {} # Dictionary of lookback relations
+ includedict = {} # Dictionary of include relations
- # Add everything in I to J
- J = I[:]
- Jhash = { }
- for j in J:
- Jhash[id(j)] = 1
+ # Make a dictionary of non-terminal transitions
+ dtrans = {}
+ for t in trans:
+ dtrans[t] = 1
+
+ # Loop over all transitions and compute lookbacks and includes
+ for state,N in trans:
+ lookb = []
+ includes = []
+ for p in C[state]:
+ if p.name != N: continue
- didadd = 1
- while didadd:
- didadd = 0
- for j in J:
- jprod = j.prod
- jlr_index = j.lr_index
- jprodslice = jprod[jlr_index+2:]
-
- if jlr_index < len(jprod) - 1 and Nonterminals.has_key(jprod[jlr_index+1]):
- first_syms = []
-
- if j.lk_added.setdefault(setnumber, 0) < len(j.lookaheads[setnumber]):
- for a in j.lookaheads[setnumber][j.lk_added[setnumber]:]:
- # find b in FIRST(Xa) if j = [A->a.BX,a]
- temp_first_syms = first(jprodslice + (a,))
- for x in temp_first_syms:
- if x not in first_syms:
- first_syms.append(x)
-
- j.lk_added[setnumber] = len(j.lookaheads[setnumber])
-
- for x in j.lrafter:
-
- # Add B --> .G to J
- if x.lr_next.lookaheads.has_key(setnumber):
- _xlook = x.lr_next.lookaheads[setnumber]
- for s in first_syms:
- if s not in _xlook:
- _xlook.append(s)
- didadd = 1
- else:
- x.lr_next.lookaheads[setnumber] = first_syms
- didadd = 1
-
- nid = id(x.lr_next)
- if not Jhash.has_key(nid):
- J.append(x.lr_next)
- Jhash[nid] = 1
-
- return J
+ # Okay, we have a name match. We now follow the production all the way
+ # through the state machine until we get the . on the right hand side
+
+ lr_index = p.lr_index
+ j = state
+ while lr_index < p.len - 1:
+ lr_index = lr_index + 1
+ t = p.prod[lr_index]
+
+ # Check to see if this symbol and state are a non-terminal transition
+ if dtrans.has_key((j,t)):
+ # Yes. Okay, there is some chance that this is an includes relation
+ # the only way to know for certain is whether the rest of the
+ # production derives empty
+
+ li = lr_index + 1
+ while li < p.len:
+ if Terminals.has_key(p.prod[li]): break # No forget it
+ if not nullable.has_key(p.prod[li]): break
+ li = li + 1
+ else:
+ # Appears to be a relation between (j,t) and (state,N)
+ includes.append((j,t))
+
+ g = lr0_goto(C[j],t) # Go to next set
+ j = _lr0_cidhash.get(id(g),-1) # Go to next state
+
+ # When we get here, j is the final state, now we have to locate the production
+ for r in C[j]:
+ if r.name != p.name: continue
+ if r.len != p.len: continue
+ i = 0
+ # This loop is comparing a production ". A B C" with "A B C ."
+ while i < r.lr_index:
+ if r.prod[i] != p.prod[i+1]: break
+ i = i + 1
+ else:
+ lookb.append((j,r))
+ for i in includes:
+ if not includedict.has_key(i): includedict[i] = []
+ includedict[i].append((state,N))
+ lookdict[(state,N)] = lookb
+
+ return lookdict,includedict
-def add_lookaheads(K):
- spontaneous = []
- propogate = []
-
- for setnumber in range(len(K)):
- for kitem in K[setnumber]:
- kitem.lookaheads[setnumber] = ['#']
- J = lr1_closure([kitem], setnumber)
-
- # find the lookaheads that are spontaneously created from closures
- # and the propogations of lookaheads between lr items
- for item in J:
- if item.lr_index < len(item.prod)-1:
- for lookahead in item.lookaheads[setnumber]:
- goto_setnumber = lr0_goto_setnumber(setnumber, item.prod[item.lr_index+1])
- next = None
- if lookahead != '#':
- if item.lr_next in K[goto_setnumber]:
- next = item.lr_next
- if next:
- spontaneous.append((next, (lookahead, goto_setnumber)))
- else:
- if goto_setnumber > -1:
- if item.lr_next in K[goto_setnumber]:
- next = item.lr_next
-
- if next:
- propogate.append(((kitem, setnumber), (next, goto_setnumber)))
+# -----------------------------------------------------------------------------
+# digraph()
+# traverse()
+#
+# The following two functions are used to compute set valued functions
+# of the form:
+#
+# F(x) = F'(x) U U{F(y) | x R y}
+#
+# This is used to compute the values of Read() sets as well as FOLLOW sets
+# in LALR(1) generation.
+#
+# Inputs: X - An input set
+# R - A relation
+# FP - Set-valued function
+# ------------------------------------------------------------------------------
+
+def digraph(X,R,FP):
+ N = { }
+ for x in X:
+ N[x] = 0
+ stack = []
+ F = { }
+ for x in X:
+ if N[x] == 0: traverse(x,N,stack,F,X,R,FP)
+ return F
+
+def traverse(x,N,stack,F,X,R,FP):
+ stack.append(x)
+ d = len(stack)
+ N[x] = d
+ F[x] = FP(x) # F(X) <- F'(x)
+
+ rel = R(x) # Get y's related to x
+ for y in rel:
+ if N[y] == 0:
+ traverse(y,N,stack,F,X,R,FP)
+ N[x] = min(N[x],N[y])
+ for a in F.get(y,[]):
+ if a not in F[x]: F[x].append(a)
+ if N[x] == d:
+ N[stack[-1]] = sys.maxint
+ F[stack[-1]] = F[x]
+ element = stack.pop()
+ while element != x:
+ N[stack[-1]] = sys.maxint
+ F[stack[-1]] = F[x]
+ element = stack.pop()
-
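(Illustration only, toy inputs: digraph() computes F(x) = F'(x) U U{ F(y) | x R y } by a depth-first traversal that also merges values around cycles; the Read and Follow computations below call it with the relations defined above.)

    # Hypothetical toy run: 'a' relates to 'b', 'b' relates to 'c'.
    X  = ['a', 'b', 'c']
    R  = lambda x: {'a': ['b'], 'b': ['c'], 'c': []}[x]
    FP = lambda x: {'a': [1], 'b': [2], 'c': [3]}[x]
    F  = digraph(X, R, FP)
    # F == {'a': [1, 2, 3], 'b': [2, 3], 'c': [3]}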
+# -----------------------------------------------------------------------------
+# compute_read_sets()
+#
+# Given a set of LR(0) items, this function computes the read sets.
+#
+# Inputs: C = Set of LR(0) items
+# ntrans = Set of nonterminal transitions
+# nullable = Set of empty transitions
+#
+# Returns a set containing the read sets
+# -----------------------------------------------------------------------------
- for x in K[setnumber]:
- x.lookaheads[setnumber] = []
+def compute_read_sets(C, ntrans, nullable):
+ FP = lambda x: dr_relation(C,x,nullable)
+ R = lambda x: reads_relation(C,x,nullable)
+ F = digraph(ntrans,R,FP)
+ return F
- for x in spontaneous:
- if x[1][0] not in x[0].lookaheads[x[1][1]]:
- x[0].lookaheads[x[1][1]].append(x[1][0])
+# -----------------------------------------------------------------------------
+# compute_follow_sets()
+#
+# Given a set of LR(0) items, a set of non-terminal transitions, a readset,
+# and an include set, this function computes the follow sets
+#
+# Follow(p,A) = Read(p,A) U U {Follow(p',B) | (p,A) INCLUDES (p',B)}
+#
+# Inputs:
+# ntrans = Set of nonterminal transitions
+# readsets = Readset (previously computed)
+# inclsets = Include sets (previously computed)
+#
+# Returns a set containing the follow sets
+# -----------------------------------------------------------------------------
- K[0][0].lookaheads[0] = ['$']
+def compute_follow_sets(ntrans,readsets,inclsets):
+ FP = lambda x: readsets[x]
+ R = lambda x: inclsets.get(x,[])
+ F = digraph(ntrans,R,FP)
+ return F
- pitems = {}
- for x in propogate:
- if pitems.has_key(x[0]):
- pitems[x[0]].append(x[1])
- else:
- pitems[x[0]] = []
- pitems[x[0]].append(x[1])
-
- # propogate the lookaheads that were spontaneously generated
- # based on the propogations produced above
- stop = 0
-
- while not stop:
- stop = 1
- kindex = 0
- for set in K:
- for item in set:
- pkey = (item, kindex)
- if pitems.has_key(pkey):
- for propogation in pitems[pkey]:
- gitem = propogation[0]
- gsetnumber = propogation[1]
- glookaheads = gitem.lookaheads[gsetnumber]
- for lookahead in item.lookaheads[kindex]:
- if lookahead not in glookaheads:
- glookaheads.append(lookahead)
- stop = 0
- kindex += 1
-
-def ReduceNonterminals():
- global Nonterminals
-
- global TReductions
- global NTReductions
-
- for nt in Nonterminals.keys():
- TReductions[nt] = []
- NTReductions[nt] = []
-
- for nt in Nonterminals.keys():
- terms = ReduceToTerminals(nt)
- TReductions[nt].extend(terms)
- if not NTReductions.has_key(nt):
- ReduceToNonterminals(nt)
-
+# -----------------------------------------------------------------------------
+# add_lookaheads()
+#
+# Attaches the lookahead symbols to grammar rules.
+#
+# Inputs: lookbacks - Set of lookback relations
+# followset - Computed follow set
+#
+# This function directly attaches the lookaheads to productions contained
+# in the lookbacks set
+# -----------------------------------------------------------------------------
+def add_lookaheads(lookbacks,followset):
+ for trans,lb in lookbacks.items():
+ # Loop over productions in lookback
+ for state,p in lb:
+ if not p.lookaheads.has_key(state):
+ p.lookaheads[state] = []
+ f = followset.get(trans,[])
+ for a in f:
+ if a not in p.lookaheads[state]: p.lookaheads[state].append(a)
+
+# -----------------------------------------------------------------------------
+# add_lalr_lookaheads()
+#
+# This function does all of the work of adding lookahead information for use
+# with LALR parsing
+# -----------------------------------------------------------------------------
-def ReduceToTerminals(nt,cyclemap=None):
- global Prodnames
- global Terminals
- reducedterminals = []
- if not cyclemap: cyclemap = {}
+def add_lalr_lookaheads(C):
+ # Determine all of the nullable nonterminals
+ nullable = compute_nullable_nonterminals()
- if cyclemap.has_key(nt): return []
- cyclemap[nt] = 1
+ # Find all non-terminal transitions
+ trans = find_nonterminal_transitions(C)
- for p in Prodnames[nt]:
- if len(p.prod) > 0:
- if Terminals.has_key(p.prod[0]):
- if p.prod[0] not in reducedterminals:
- reducedterminals.append(p.prod[0])
- else:
- if p.prod[0] != nt:
- terms = ReduceToTerminals(p.prod[0],cyclemap)
- for t in terms:
- if t not in reducedterminals:
- reducedterminals.append(t)
- del cyclemap[nt]
- return reducedterminals
+ # Compute read sets
+ readsets = compute_read_sets(C,trans,nullable)
-
-def ReduceToNonterminals(nt):
- global Prodnames
- global Nonterminals
- global NTReductions
- reducednonterminals = []
-
- for p in Prodnames[nt]:
- if len(p.prod) > 0:
- if Nonterminals.has_key(p.prod[0]):
- if p.prod[0] not in reducednonterminals:
- reducednonterminals.append(p.prod[0])
- if p.prod[0] != nt:
- if not NTReductions.has_key(p.prod[0]):
- ReduceToNonterminals(p.prod[0])
-
- nterms = NTReductions[p.prod[0]]
- for nt in nterms:
- if nt not in reducednonterminals:
- reducednonterminals.append(nt)
-
+ # Compute lookback/includes relations
+ lookd, included = compute_lookback_includes(C,trans,nullable)
- NTReductions[nt] = reducednonterminals
+ # Compute LALR FOLLOW sets
+ followsets = compute_follow_sets(trans,readsets,included)
+
+ # Add all of the lookaheads
+ add_lookaheads(lookd,followsets)
# -----------------------------------------------------------------------------
-# lalr_parse_table()
+# lr_parse_table()
#
-# This function constructs an LALR table.
+# This function constructs the parse tables for SLR or LALR
# -----------------------------------------------------------------------------
-def lalr_parse_table():
+def lr_parse_table(method):
global _lr_method
goto = _lr_goto # Goto array
action = _lr_action # Action array
actionp = { } # Action production array (temporary)
- goto_cache = _lr_goto_cache
- cid_hash = _lr0_cidhash
-
- _lr_method = "LALR"
+
+ _lr_method = method
n_srconflict = 0
n_rrconflict = 0
if yaccdebug:
- sys.stderr.write("yacc: Generating LALR(1) parsing table...\n")
- _vf.write("\n\nParsing method: LALR(1)\n\n")
+ sys.stderr.write("yacc: Generating %s parsing table...\n" % method)
+ _vf.write("\n\nParsing method: %s\n\n" % method)
# Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items
# This determines the number of states
-
+
C = lr0_items()
- global Canonical
- Canonical = C
-
- ###
- # Create the kernel states.
- ###
- K = []
- setC = [0]*len(C)
- for x in C:
- K.append(lr0_kernel(x))
- for y in x:
- y.setnumbers = setC[:]
-
- _cindex = 0
- for x in C:
- for y in x:
- y.lookaheads[_cindex] = []
- y.setnumbers[_cindex] = 1
- _cindex = _cindex + 1
-
- ###
- # Add lookaheads to the lr items
- ###
-
- add_lookaheads(K)
-
- ###
- # Do the reductions for parsing first and keep them in globals
- ###
-
- ReduceNonterminals()
-
- global TReductions
- global NTReductions
- global Prodempty
-
- EmptyAncestors = {}
- for y in Prodempty.keys():
- EmptyAncestors[y] = []
- for x in NTReductions.items():
- for y in x[1]:
- if Prodempty.has_key(y):
- EmptyAncestors[y].append(x[0])
-
+ if method == 'LALR':
+ add_lalr_lookaheads(C)
# Build the parser table, state by state
st = 0
for I in C:
# Loop over each production in I
actlist = [ ] # List of actions
- acthash = { }
-
- idI = id(I)
if yaccdebug:
_vf.write("\nstate %d\n\n" % st)
@@ -1766,86 +1641,20 @@ def lalr_parse_table():
_vf.write(" (%d) %s\n" % (p.number, str(p)))
_vf.write("\n")
- global First
for p in I:
try:
if p.prod[-1] == ".":
if p.name == "S'":
# Start symbol. Accept!
- action[st,"$"] = 0
- actionp[st,"$"] = p
- elif len(p.prod) == 0:
- ancestors = EmptyAncestors[p.name]
- for i in ancestors:
- for s in K:
- if i in s:
- input_list = []
- plist = Productions[i.name]
- for x in plist:
- if len(x.prod) > 0 and x.prod[0] == p.name:
- n = p.prod[1:]
- d = x.prod[lr_index+2:]
- for l in x.lookaheads.items():
- flist = First[tuple(n+d+[l])]
- for f in flist:
- if f not in input_list and f in p.lookaheads[st]:
- input_list.append(f)
-
- # We are at the end of a production. Reduce!
- #print "input_list: %s" % input_list
- #print "Follow[p.name]: %s" % Follow[p.name]
- for a in input_list:
- actlist.append((a,p,"reduce using rule %d (%s) " % (p.number,p)))
- r = action.get((st,a),None)
- if r is not None:
- # Whoa. Have a shift/reduce or reduce/reduce conflict
- if r > 0:
- # Need to decide on shift or reduce here
- # By default we favor shifting. Need to add
- # some precedence rules here.
- sprec,slevel = Productions[actionp[st,a].number].prec
- rprec,rlevel = Precedence.get(a,('right',0))
- if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')):
- # We really need to reduce here.
- action[st,a] = -p.number
- actionp[st,a] = p
- if not slevel and not rlevel:
- _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st)
- _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a)
- n_srconflict += 1
- elif (slevel == rlevel) and (rprec == 'nonassoc'):
- action[st,a] = None
- else:
- # Hmmm. Guess we'll keep the shift
- if not slevel and not rlevel:
- _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st)
- _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a)
- n_srconflict +=1
- elif r < 0:
- # Reduce/reduce conflict. In this case, we favor the rule
- # that was defined first in the grammar file
- oldp = Productions[-r]
- pp = Productions[p.number]
- if oldp.line > pp.line:
- action[st,a] = -p.number
- actionp[st,a] = p
- # print "Reduce/reduce conflict in state %d" % st
- n_rrconflict += 1
- _vfc.write("reduce/reduce conflict in state %d resolved using rule %d.\n" % (st, actionp[st,a].number))
- _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d.\n" % (a,actionp[st,a].number))
- else:
- sys.stderr.write("Unknown conflict in state %d\n" % st)
- else:
- action[st,a] = -p.number
- actionp[st,a] = p
-
- break # break out of the for s in K loop because we only want to make
- # sure that a production is in the Kernel
-
+ action[st,"$end"] = 0
+ actionp[st,"$end"] = p
else:
# We are at the end of a production. Reduce!
-
- for a in p.lookaheads[st]:
+ if method == 'LALR':
+ laheads = p.lookaheads[st]
+ else:
+ laheads = Follow[p.name]
+ for a in laheads:
actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p)))
r = action.get((st,a),None)
if r is not None:
@@ -1855,7 +1664,7 @@ def lalr_parse_table():
# By default we favor shifting. Need to add
# some precedence rules here.
sprec,slevel = Productions[actionp[st,a].number].prec
- rprec,rlevel = Precedence.get(a,('right',0))
+ rprec,rlevel = Precedence.get(a,('right',0))
if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')):
# We really need to reduce here.
action[st,a] = -p.number
@@ -1880,12 +1689,12 @@ def lalr_parse_table():
if oldp.line > pp.line:
action[st,a] = -p.number
actionp[st,a] = p
- # print "Reduce/reduce conflict in state %d" % st
+ # sys.stderr.write("Reduce/reduce conflict in state %d\n" % st)
n_rrconflict += 1
- _vfc.write("reduce/reduce conflict in state %d resolved using rule %d.\n" % (st, actionp[st,a].number))
- _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d.\n" % (a,actionp[st,a].number))
+ _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, actionp[st,a].number, actionp[st,a]))
+ _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,actionp[st,a].number, actionp[st,a]))
else:
- print "Unknown conflict in state %d" % st
+ sys.stderr.write("Unknown conflict in state %d\n" % st)
else:
action[st,a] = -p.number
actionp[st,a] = p
@@ -1893,14 +1702,11 @@ def lalr_parse_table():
i = p.lr_index
a = p.prod[i+1] # Get symbol right after the "."
if Terminals.has_key(a):
- g = goto_cache[(idI,a)]
- j = cid_hash.get(id(g),-1)
+ g = lr0_goto(I,a)
+ j = _lr0_cidhash.get(id(g),-1)
if j >= 0:
# We are in a shift state
- _k = (a,j)
- if not acthash.has_key(_k):
- actlist.append((a,p,"shift and go to state %d" % j))
- acthash[_k] = 1
+ actlist.append((a,p,"shift and go to state %d" % j))
r = action.get((st,a),None)
if r is not None:
# Whoa have a shift/reduce or shift/shift conflict
@@ -1936,102 +1742,54 @@ def lalr_parse_table():
else:
action[st,a] = j
actionp[st,a] = p
- else:
- nonterminal = a
- term_list = TReductions[nonterminal]
- # DB: This loop gets executed a lot. Try to optimize
- for a in term_list:
- g = goto_cache[(idI,a)]
- j = cid_hash[id(g)]
- if j >= 0:
- # We are in a shift state
- # Don't put repeated shift actions on action list (performance hack)
- _k = (a,j)
- if not acthash.has_key(_k):
- actlist.append((a,p,"shift and go to state "+str(j)))
- acthash[_k] = 1
-
- r = action.get((st,a),None)
- if r is not None:
- # Whoa have a shift/reduce or shift/shift conflict
- if r > 0:
- if r != j:
- sys.stderr.write("Shift/shift conflict in state %d\n" % st)
- continue
- elif r < 0:
- # Do a precedence check.
- # - if precedence of reduce rule is higher, we reduce.
- # - if precedence of reduce is same and left assoc, we reduce.
- # - otherwise we shift
- rprec,rlevel = Productions[actionp[st,a].number].prec
- sprec,slevel = Precedence.get(a,('right',0))
- if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')):
- # We decide to shift here... highest precedence to shift
- action[st,a] = j
- actionp[st,a] = p
- if not slevel and not rlevel:
- n_srconflict += 1
- _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st)
- _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a)
- elif (slevel == rlevel) and (rprec == 'nonassoc'):
- action[st,a] = None
- else:
- # Hmmm. Guess we'll keep the reduce
- if not slevel and not rlevel:
- n_srconflict +=1
- _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st)
- _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a)
-
- else:
- sys.stderr.write("Unknown conflict in state %d\n" % st)
- else:
- action[st,a] = j
- actionp[st,a] = p
-
+
except StandardError,e:
- raise YaccError, "Hosed in lalr_parse_table", e
+ raise YaccError, "Hosed in lr_parse_table", e
# Print the actions associated with each terminal
if yaccdebug:
+ _actprint = { }
for a,p,m in actlist:
if action.has_key((st,a)):
if p is actionp[st,a]:
_vf.write(" %-15s %s\n" % (a,m))
+ _actprint[(a,m)] = 1
_vf.write("\n")
-
for a,p,m in actlist:
if action.has_key((st,a)):
if p is not actionp[st,a]:
- _vf.write(" ! %-15s [ %s ]\n" % (a,m))
+ if not _actprint.has_key((a,m)):
+ _vf.write(" ! %-15s [ %s ]\n" % (a,m))
+ _actprint[(a,m)] = 1
# Construct the goto table for this state
+ if yaccdebug:
+ _vf.write("\n")
nkeys = { }
for ii in I:
for s in ii.usyms:
if Nonterminals.has_key(s):
nkeys[s] = None
-
- # Construct the goto table for this state
for n in nkeys.keys():
g = lr0_goto(I,n)
- j = cid_hash.get(id(g),-1)
+ j = _lr0_cidhash.get(id(g),-1)
if j >= 0:
goto[st,n] = j
if yaccdebug:
_vf.write(" %-30s shift and go to state %d\n" % (n,j))
st += 1
+
if yaccdebug:
if n_srconflict == 1:
sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict)
if n_srconflict > 1:
- sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict)
+ sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict)
if n_rrconflict == 1:
sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict)
if n_rrconflict > 1:
sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict)
-
# -----------------------------------------------------------------------------
# ==== LR Utility functions ====
# -----------------------------------------------------------------------------
@@ -2145,6 +1903,7 @@ del _lr_goto_items
else:
f.write(" None,\n")
f.write("]\n")
+
f.close()
except IOError,e:
@@ -2193,9 +1952,6 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module,
files = { }
error = 0
- # Add starting symbol to signature
- if start:
- Signature.update(start)
# Add parsing method to signature
Signature.update(method)
@@ -2227,6 +1983,12 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module,
f = f.f_back # Walk out to our calling function
ldict = f.f_globals # Grab its globals dictionary
+ # Add starting symbol to signature
+ if not start:
+ start = ldict.get("start",None)
+ if start:
+ Signature.update(start)
+
# If running in optimized mode. We're going to
if (optimize and lr_read_tables(tabmodule,1)):
@@ -2374,10 +2136,8 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module,
compute_first1()
compute_follow(start)
- if method == 'SLR':
- slr_parse_table()
- elif method == 'LALR':
- lalr_parse_table()
+ if method in ['SLR','LALR']:
+ lr_parse_table(method)
else:
raise YaccError, "Unknown parsing method '%s'" % method
@@ -2408,6 +2168,9 @@ def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module,
global parse
parse = p.parse
+ global parser
+ parser = p
+
# Clean up all of the globals we created
if (not optimize):
yacc_cleanup()
@@ -2423,12 +2186,10 @@ def yacc_cleanup():
global Productions, Prodnames, Prodmap, Terminals
global Nonterminals, First, Follow, Precedence, LRitems
global Errorfunc, Signature, Requires
- global Prodempty, TReductions, NTReductions, GotoSetNum, Canonical
del Productions, Prodnames, Prodmap, Terminals
del Nonterminals, First, Follow, Precedence, LRitems
del Errorfunc, Signature, Requires
- del Prodempty, TReductions, NTReductions, GotoSetNum, Canonical
global _vf, _vfc
del _vf, _vfc