| author | Tomas Kukosa <tomas.kukosa@siemens.com> | 2007-01-16 07:31:09 +0000 |
|---|---|---|
| committer | Tomas Kukosa <tomas.kukosa@siemens.com> | 2007-01-16 07:31:09 +0000 |
| commit | 842bc977fcc4626f4a4ce018cdb6f9f736a31734 (patch) | |
| tree | 2fc9470d29034359cbed1d14a23c8505557ebd25 /tools/lex.py | |
| parent | 74ed71d648386ba9fced3846095100a52c2fae51 (diff) | |
update Ply to version 2.2
svn path=/trunk/; revision=20448
Diffstat (limited to 'tools/lex.py')
| -rwxr-xr-x | tools/lex.py | 814 |
1 file changed, 439 insertions, 375 deletions
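The headline user-visible change in this PLY 2.2 update is support for multiple lexer states: the diff below adds the `lexstatere`/`lexstateignore`/`lexstateerrorf` tables and the `begin()`, `push_state()`, `pop_state()`, and `skip()` methods. The following is a minimal sketch of how a rule file might use that feature; the state name `quoted`, the token names, and the sample input are illustrative only and do not appear in this commit, and only the `lex()`, `begin()`, and `skip()` calls come from the code being updated.

```python
# Minimal sketch of the "states" feature introduced by this update.
# The state 'quoted' and all token names are made up for illustration.
import lex                          # tools/lex.py from this tree

tokens = ('WORD', 'STRING')

# One exclusive state in addition to the implicit 'INITIAL' state.
states = (('quoted', 'exclusive'),)

def t_begin_quoted(t):
    r'"'
    t.lexer.begin('quoted')         # enter the 'quoted' state

def t_quoted_STRING(t):
    r'[^"]+'
    return t

def t_quoted_end(t):
    r'"'
    t.lexer.begin('INITIAL')        # return to the default state

def t_WORD(t):
    r'[A-Za-z_]+'
    return t

t_ignore = ' \t'                    # ignored characters for 'INITIAL'

def t_error(t):
    print "Illegal character '%s'" % t.value[0]
    t.lexer.skip(1)

def t_quoted_error(t):              # exclusive states need their own error rule
    t.lexer.skip(1)

lexer = lex.lex()
lexer.input('abc "hello world" def')
while 1:
    tok = lexer.token()
    if not tok: break
    print tok
```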
diff --git a/tools/lex.py b/tools/lex.py index beaace02df..c1493665c3 100755 --- a/tools/lex.py +++ b/tools/lex.py @@ -20,159 +20,14 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # See the file COPYING for a complete copy of the LGPL. -# -# -# This module automatically constructs a lexical analysis module from regular -# expression rules defined in a user-defined module. The idea is essentially the same -# as that used in John Aycock's Spark framework, but the implementation works -# at the module level rather than requiring the use of classes. -# -# This module tries to provide an interface that is closely modeled after -# the traditional lex interface in Unix. It also differs from Spark -# in that: -# -# - It provides more extensive error checking and reporting if -# the user supplies a set of regular expressions that can't -# be compiled or if there is any other kind of a problem in -# the specification. -# -# - The interface is geared towards LALR(1) and LR(1) parser -# generators. That is tokens are generated one at a time -# rather than being generated in advanced all in one step. #----------------------------------------------------------------------------- -r""" -lex.py - -This module builds lex-like scanners based on regular expression rules. -To use the module, simply write a collection of regular expression rules -and actions like this: - -# lexer.py -import lex - -# Define a list of valid tokens -tokens = ( - 'IDENTIFIER', 'NUMBER', 'PLUS', 'MINUS' - ) - -# Define tokens as functions -def t_IDENTIFIER(t): - r' ([a-zA-Z_](\w|_)* ' - return t - -def t_NUMBER(t): - r' \d+ ' - return t - -# Some simple tokens with no actions -t_PLUS = r'\+' -t_MINUS = r'-' - -# Initialize the lexer -lex.lex() - -The tokens list is required and contains a complete list of all valid -token types that the lexer is allowed to produce. Token types are -restricted to be valid identifiers. This means that 'MINUS' is a valid -token type whereas '-' is not. - -Rules are defined by writing a function with a name of the form -t_rulename. Each rule must accept a single argument which is -a token object generated by the lexer. This token has the following -attributes: - - t.type = type string of the token. This is initially set to the - name of the rule without the leading t_ - t.value = The value of the lexeme. - t.lineno = The value of the line number where the token was encountered - -For example, the t_NUMBER() rule above might be called with the following: - - t.type = 'NUMBER' - t.value = '42' - t.lineno = 3 - -Each rule returns the token object it would like to supply to the -parser. In most cases, the token t is returned with few, if any -modifications. To discard a token for things like whitespace or -comments, simply return nothing. For instance: - -def t_whitespace(t): - r' \s+ ' - pass - -For faster lexing, you can also define this in terms of the ignore set like this: - -t_ignore = ' \t' - -The characters in this string are ignored by the lexer. Use of this feature can speed -up parsing significantly since scanning will immediately proceed to the next token. +__version__ = "2.2" -lex requires that the token returned by each rule has an attribute -t.type. Other than this, rules are free to return any kind of token -object that they wish and may construct a new type of token object -from the attributes of t (provided the new object has the required -type attribute). 
+import re, sys, types -If illegal characters are encountered, the scanner executes the -function t_error(t) where t is a token representing the rest of the -string that hasn't been matched. If this function isn't defined, a -LexError exception is raised. The .text attribute of this exception -object contains the part of the string that wasn't matched. - -The t.skip(n) method can be used to skip ahead n characters in the -input stream. This is usually only used in the error handling rule. -For instance, the following rule would print an error message and -continue: - -def t_error(t): - print "Illegal character in input %s" % t.value[0] - t.skip(1) - -Of course, a nice scanner might wish to skip more than one character -if the input looks very corrupted. - -The lex module defines a t.lineno attribute on each token that can be used -to track the current line number in the input. The value of this -variable is not modified by lex so it is up to your lexer module -to correctly update its value depending on the lexical properties -of the input language. To do this, you might write rules such as -the following: - -def t_newline(t): - r' \n+ ' - t.lineno += t.value.count("\n") - -To initialize your lexer so that it can be used, simply call the lex.lex() -function in your rule file. If there are any errors in your -specification, warning messages or an exception will be generated to -alert you to the problem. - -To use the newly constructed lexer from another module, simply do -this: - - import lex - import lexer - lex.input("position = initial + rate*60") - - while 1: - token = lex.token() # Get a token - if not token: break # No more tokens - ... do whatever ... - -Assuming that the module 'lexer' has initialized lex as shown -above, parsing modules can safely import 'lex' without having -to import the rule file or any additional imformation about the -scanner you have defined. -""" - -# ----------------------------------------------------------------------------- - - -__version__ = "2.1" - -import re, types, sys, copy +# Regular expression used to match valid token names +_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$') # Available instance types. This is used when lexers are defined by a class. # It's a little funky because I want to preserve backwards compatibility @@ -184,7 +39,8 @@ except AttributeError: _INSTANCETYPE = types.InstanceType class object: pass # Note: needed if no new-style classes present -# Exception thrown when invalid token encountered and no default +# Exception thrown when invalid token encountered and no default error +# handler is defined. 
class LexError(Exception): def __init__(self,message,s): self.args = (message,) @@ -197,10 +53,7 @@ class LexToken(object): def __repr__(self): return str(self) def skip(self,n): - try: - self._skipn += n - except AttributeError: - self._skipn = n + self.lexer.skip(n) # ----------------------------------------------------------------------------- # Lexer class @@ -217,14 +70,21 @@ class Lexer: # tuples (re,findex) where re is a compiled # regular expression and findex is a list # mapping regex group numbers to rules - + self.lexretext = None # Current regular expression strings + self.lexstatere = {} # Dictionary mapping lexer states to master regexs + self.lexstateretext = {} # Dictionary mapping lexer states to regex strings + self.lexstate = "INITIAL" # Current lexer state + self.lexstatestack = [] # Stack of lexer states + self.lexstateinfo = None # State information + self.lexstateignore = {} # Dictionary of ignored characters for each state + self.lexstateerrorf = {} # Dictionary of error functions for each state self.lexreflags = 0 # Optional re compile flags self.lexdata = None # Actual input data (as a string) self.lexpos = 0 # Current position in input text self.lexlen = 0 # Length of the input text self.lexerrorf = None # Error rule (if any) self.lextokens = None # List of valid tokens - self.lexignore = None # Ignored characters + self.lexignore = "" # Ignored characters self.lexliterals = "" # Literal characters that can be passed through self.lexmodule = None # Module self.lineno = 1 # Current line number @@ -233,14 +93,18 @@ class Lexer: def clone(self,object=None): c = Lexer() - c.lexre = self.lexre + c.lexstatere = self.lexstatere + c.lexstateinfo = self.lexstateinfo + c.lexstateretext = self.lexstateretext + c.lexstate = self.lexstate + c.lexstatestack = self.lexstatestack + c.lexstateignore = self.lexstateignore + c.lexstateerrorf = self.lexstateerrorf c.lexreflags = self.lexreflags c.lexdata = self.lexdata c.lexpos = self.lexpos c.lexlen = self.lexlen - c.lexerrorf = self.lexerrorf c.lextokens = self.lextokens - c.lexignore = self.lexignore c.lexdebug = self.lexdebug c.lineno = self.lineno c.lexoptimize = self.lexoptimize @@ -248,25 +112,88 @@ class Lexer: c.lexmodule = self.lexmodule # If the object parameter has been supplied, it means we are attaching the - # lexer to a new object. In this case, we have to rebind the methods + # lexer to a new object. In this case, we have to rebind all methods in + # the lexstatere and lexstateerrorf tables. 
if object: - newre = [] - for cre, findex in c.lexre: - # Loop over findex and adjust methods - newfindex = [] - for f in findex: - if not f or not f[0]: - newfindex.append(f) - continue - newfindex.append((getattr(object,f[0].__name__),f[1])) - newre.append((cre,newfindex)) - c.lexre = newre + newtab = { } + for key, ritem in self.lexstatere.items(): + newre = [] + for cre, findex in ritem: + newfindex = [] + for f in findex: + if not f or not f[0]: + newfindex.append(f) + continue + newfindex.append((getattr(object,f[0].__name__),f[1])) + newre.append((cre,newfindex)) + newtab[key] = newre + c.lexstatere = newtab + c.lexstateerrorf = { } + for key, ef in self.lexstateerrorf.items(): + c.lexstateerrorf[key] = getattr(object,ef.__name__) c.lexmodule = object + # Set up other attributes + c.begin(c.lexstate) return c # ------------------------------------------------------------ + # writetab() - Write lexer information to a table file + # ------------------------------------------------------------ + def writetab(self,tabfile): + tf = open(tabfile+".py","w") + tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__)) + tf.write("_lextokens = %s\n" % repr(self.lextokens)) + tf.write("_lexreflags = %s\n" % repr(self.lexreflags)) + tf.write("_lexliterals = %s\n" % repr(self.lexliterals)) + tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo)) + + tabre = { } + for key, lre in self.lexstatere.items(): + titem = [] + for i in range(len(lre)): + titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1]))) + tabre[key] = titem + + tf.write("_lexstatere = %s\n" % repr(tabre)) + tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore)) + + taberr = { } + for key, ef in self.lexstateerrorf.items(): + if ef: + taberr[key] = ef.__name__ + else: + taberr[key] = None + tf.write("_lexstateerrorf = %s\n" % repr(taberr)) + tf.close() + + # ------------------------------------------------------------ + # readtab() - Read lexer information from a tab file + # ------------------------------------------------------------ + def readtab(self,tabfile,fdict): + exec "import %s as lextab" % tabfile + self.lextokens = lextab._lextokens + self.lexreflags = lextab._lexreflags + self.lexliterals = lextab._lexliterals + self.lexstateinfo = lextab._lexstateinfo + self.lexstateignore = lextab._lexstateignore + self.lexstatere = { } + self.lexstateretext = { } + for key,lre in lextab._lexstatere.items(): + titem = [] + txtitem = [] + for i in range(len(lre)): + titem.append((re.compile(lre[i][0],lextab._lexreflags),_names_to_funcs(lre[i][1],fdict))) + txtitem.append(lre[i][0]) + self.lexstatere[key] = titem + self.lexstateretext[key] = txtitem + self.lexstateerrorf = { } + for key,ef in lextab._lexstateerrorf.items(): + self.lexstateerrorf[key] = fdict[ef] + self.begin('INITIAL') + + # ------------------------------------------------------------ # input() - Push a new string into the lexer # ------------------------------------------------------------ def input(self,s): @@ -277,6 +204,43 @@ class Lexer: self.lexlen = len(s) # ------------------------------------------------------------ + # begin() - Changes the lexing state + # ------------------------------------------------------------ + def begin(self,state): + if not self.lexstatere.has_key(state): + raise ValueError, "Undefined state" + self.lexre = self.lexstatere[state] + self.lexretext = self.lexstateretext[state] + self.lexignore = self.lexstateignore.get(state,"") + self.lexerrorf = 
self.lexstateerrorf.get(state,None) + self.lexstate = state + + # ------------------------------------------------------------ + # push_state() - Changes the lexing state and saves old on stack + # ------------------------------------------------------------ + def push_state(self,state): + self.lexstatestack.append(self.lexstate) + self.begin(state) + + # ------------------------------------------------------------ + # pop_state() - Restores the previous state + # ------------------------------------------------------------ + def pop_state(self): + self.begin(self.lexstatestack.pop()) + + # ------------------------------------------------------------ + # current_state() - Returns the current lexing state + # ------------------------------------------------------------ + def current_state(self): + return self.lexstate + + # ------------------------------------------------------------ + # skip() - Skip ahead n characters + # ------------------------------------------------------------ + def skip(self,n): + self.lexpos += n + + # ------------------------------------------------------------ # token() - Return the next token from the Lexer # # Note: This function has been carefully implemented to be as fast @@ -301,6 +265,10 @@ class Lexer: m = lexre.match(lexdata,lexpos) if not m: continue + # Set last match in lexer so that rules can access it if they want + self.lexmatch = m + + # Create a token for return tok = LexToken() tok.value = m.group() tok.lineno = self.lineno @@ -313,8 +281,14 @@ class Lexer: self.lexpos = lexpos if not func: - return tok - + # If no token type was set, it's an ignored token + if tok.type: return tok + break + + # if func not callable, it means it's an ignored token + if not callable(func): + break + # If token is processed by a function, call it newtok = func(tok) @@ -351,19 +325,17 @@ class Lexer: tok.type = "error" tok.lexer = self tok.lexpos = lexpos - oldpos = lexpos + self.lexpos = lexpos newtok = self.lexerrorf(tok) - lexpos += getattr(tok,"_skipn",0) - if oldpos == lexpos: + if lexpos == self.lexpos: # Error method didn't change text position at all. This is an error. - self.lexpos = lexpos raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) + lexpos = self.lexpos if not newtok: continue - self.lexpos = lexpos return newtok self.lexpos = lexpos - raise LexError, ("No match found", lexdata[lexpos:]) + raise LexError, ("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:]) self.lexpos = lexpos + 1 if self.lexdata is None: @@ -371,14 +343,14 @@ class Lexer: return None # ----------------------------------------------------------------------------- -# validate_file() +# _validate_file() # # This checks to see if there are duplicated t_rulename() functions or strings # in the parser input file. This is done using a simple regular expression # match on each line in the filename. # ----------------------------------------------------------------------------- -def validate_file(filename): +def _validate_file(filename): import os.path base,ext = os.path.splitext(filename) if ext != '.py': return 1 # No idea what the file is. Return OK @@ -411,29 +383,36 @@ def validate_file(filename): return noerror # ----------------------------------------------------------------------------- -# _read_lextab(module) +# _funcs_to_names() # -# Reads lexer table from a lextab file instead of using introspection. 
+# Given a list of regular expression functions, this converts it to a list +# suitable for output to a table file # ----------------------------------------------------------------------------- -def _read_lextab(lexer, fdict, module): - exec "import %s as lextab" % module - lexre = [] - for regex,ltab in lextab._lexre: - ftab = [] - for t in ltab: - if t and t[0]: - ftab.append((fdict[t[0]], t[1])) - else: - ftab.append(t) - lexre.append((re.compile(regex, re.VERBOSE | lextab._lexreflags), ftab)) - lexer.lexre = lexre - lexer.lexreflags = lextab._lexreflags - lexer.lextokens = lextab._lextokens - lexer.lexignore = lextab._lexignore - if lextab._lexerrorf: - lexer.lexerrorf = fdict[lextab._lexerrorf] - lexer.lexliterals = lextab._lexliterals +def _funcs_to_names(funclist): + result = [] + for f in funclist: + if f and f[0]: + result.append((f[0].__name__,f[1])) + else: + result.append(f) + return result + +# ----------------------------------------------------------------------------- +# _names_to_funcs() +# +# Given a list of regular expression function names, this converts it back to +# functions. +# ----------------------------------------------------------------------------- + +def _names_to_funcs(namelist,fdict): + result = [] + for n in namelist: + if n and n[0]: + result.append((fdict[n[0]],n[1])) + else: + result.append(n) + return result # ----------------------------------------------------------------------------- # _form_master_re() @@ -458,7 +437,11 @@ def _form_master_re(relist,reflags,ldict): elif handle is not None: # If rule was specified as a string, we build an anonymous # callback function to carry out the action - lexindexfunc[i] = (None,f[2:]) + if f.find("ignore_") > 0: + lexindexfunc[i] = (None,None) + print "IGNORE", f + else: + lexindexfunc[i] = (None, f[2:]) return [(lexre,lexindexfunc)],[regex] except Exception,e: @@ -467,22 +450,50 @@ def _form_master_re(relist,reflags,ldict): llist, lre = _form_master_re(relist[:m],reflags,ldict) rlist, rre = _form_master_re(relist[m:],reflags,ldict) return llist+rlist, lre+rre - + +# ----------------------------------------------------------------------------- +# def _statetoken(s,names) +# +# Given a declaration name s of the form "t_" and a dictionary whose keys are +# state names, this function returns a tuple (states,tokenname) where states +# is a tuple of state names and tokenname is the name of the token. 
For example, +# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM') +# ----------------------------------------------------------------------------- + +def _statetoken(s,names): + nonstate = 1 + parts = s.split("_") + for i in range(1,len(parts)): + if not names.has_key(parts[i]) and parts[i] != 'ANY': break + if i > 1: + states = tuple(parts[1:i]) + else: + states = ('INITIAL',) + + if 'ANY' in states: + states = tuple(names.keys()) + + tokenname = "_".join(parts[i:]) + return (states,tokenname) + # ----------------------------------------------------------------------------- # lex(module) # # Build all of the regular expression rules from definitions in the supplied module # ----------------------------------------------------------------------------- -def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0): +def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0): global lexer ldict = None - regex_list = [] + stateinfo = { 'INITIAL' : 'inclusive'} error = 0 files = { } lexobj = Lexer() lexobj.lexdebug = debug lexobj.lexoptimize = optimize global token,input + + if nowarn: warn = 0 + else: warn = 1 if object: module = object @@ -511,8 +522,7 @@ def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0): if optimize and lextab: try: - _read_lextab(lexobj,ldict, lextab) - if not lexobj.lexignore: lexobj.lexignore = "" + lexobj.readtab(lextab,ldict) token = lexobj.token input = lexobj.input lexer = lexobj @@ -521,14 +531,15 @@ def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0): except ImportError: pass - # Get the tokens map + # Get the tokens, states, and literals variables (if any) if (module and isinstance(module,_INSTANCETYPE)): - tokens = getattr(module,"tokens",None) + tokens = getattr(module,"tokens",None) + states = getattr(module,"states",None) + literals = getattr(module,"literals","") else: - try: - tokens = ldict["tokens"] - except KeyError: - tokens = None + tokens = ldict.get("tokens",None) + states = ldict.get("states",None) + literals = ldict.get("literals","") if not tokens: raise SyntaxError,"lex: module does not define 'tokens'" @@ -538,221 +549,274 @@ def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0): # Build a dictionary of valid token names lexobj.lextokens = { } if not optimize: - - # Utility function for verifying tokens - def is_identifier(s): - for c in s: - if not (c.isalnum() or c == '_'): return 0 - return 1 - for n in tokens: - if not is_identifier(n): + if not _is_identifier.match(n): print "lex: Bad token name '%s'" % n error = 1 - if lexobj.lextokens.has_key(n): + if warn and lexobj.lextokens.has_key(n): print "lex: Warning. Token '%s' multiply defined." % n lexobj.lextokens[n] = None else: for n in tokens: lexobj.lextokens[n] = None - if debug: print "lex: tokens = '%s'" % lexobj.lextokens.keys() - # Get literals - if (module and isinstance(module,_INSTANCETYPE)): - literals = getattr(module,"literals","") - else: - try: - literals = ldict["literals"] - except KeyError: - literals = "" + try: + for c in literals: + if not (isinstance(c,types.StringType) or isinstance(c,types.UnicodeType)) or len(c) > 1: + print "lex: Invalid literal %s. Must be a single character" % repr(c) + error = 1 + continue + + except TypeError: + print "lex: Invalid literals specification. literals must be a sequence of characters." 
+ error = 1 lexobj.lexliterals = literals - # Get a list of symbols with the t_ prefix - tsymbols = [f for f in ldict.keys() if f[:2] == 't_'] - + # Build statemap + if states: + if not (isinstance(states,types.TupleType) or isinstance(states,types.ListType)): + print "lex: states must be defined as a tuple or list." + error = 1 + else: + for s in states: + if not isinstance(s,types.TupleType) or len(s) != 2: + print "lex: invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')" % repr(s) + error = 1 + continue + name, statetype = s + if not isinstance(name,types.StringType): + print "lex: state name %s must be a string" % repr(name) + error = 1 + continue + if not (statetype == 'inclusive' or statetype == 'exclusive'): + print "lex: state type for state %s must be 'inclusive' or 'exclusive'" % name + error = 1 + continue + if stateinfo.has_key(name): + print "lex: state '%s' already defined." % name + error = 1 + continue + stateinfo[name] = statetype + + # Get a list of symbols with the t_ or s_ prefix + tsymbols = [f for f in ldict.keys() if f[:2] == 't_' ] + # Now build up a list of functions and a list of strings - fsymbols = [ ] - ssymbols = [ ] + + funcsym = { } # Symbols defined as functions + strsym = { } # Symbols defined as strings + toknames = { } # Mapping of symbols to token names + + for s in stateinfo.keys(): + funcsym[s] = [] + strsym[s] = [] + + ignore = { } # Ignore strings by state + errorf = { } # Error functions by state + + if len(tsymbols) == 0: + raise SyntaxError,"lex: no rules of the form t_rulename are defined." + for f in tsymbols: - if callable(ldict[f]): - fsymbols.append(ldict[f]) - elif (isinstance(ldict[f], types.StringType) or isinstance(ldict[f],types.UnicodeType)): - ssymbols.append((f,ldict[f])) + t = ldict[f] + states, tokname = _statetoken(f,stateinfo) + toknames[f] = tokname + + if callable(t): + for s in states: funcsym[s].append((f,t)) + elif (isinstance(t, types.StringType) or isinstance(t,types.UnicodeType)): + for s in states: strsym[s].append((f,t)) else: print "lex: %s not defined as a function or string" % f error = 1 - + # Sort the functions by line number - fsymbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno)) + for f in funcsym.values(): + f.sort(lambda x,y: cmp(x[1].func_code.co_firstlineno,y[1].func_code.co_firstlineno)) # Sort the strings by regular expression length - ssymbols.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1]))) - - # Check for non-empty symbols - if len(fsymbols) == 0 and len(ssymbols) == 0: - raise SyntaxError,"lex: no rules of the form t_rulename are defined." + for s in strsym.values(): + s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1]))) - # Add all of the rules defined with actions first - for f in fsymbols: - - line = f.func_code.co_firstlineno - file = f.func_code.co_filename - files[file] = None + regexs = { } - ismethod = isinstance(f, types.MethodType) + # Build the master regular expressions + for state in stateinfo.keys(): + regex_list = [] - if not optimize: - nargs = f.func_code.co_argcount - if ismethod: - reqargs = 2 - else: - reqargs = 1 - if nargs > reqargs: - print "%s:%d: Rule '%s' has too many arguments." 
% (file,line,f.__name__) - error = 1 - continue + # Add rules defined by functions first + for fname, f in funcsym[state]: + line = f.func_code.co_firstlineno + file = f.func_code.co_filename + files[file] = None + tokname = toknames[fname] - if nargs < reqargs: - print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__) - error = 1 - continue + ismethod = isinstance(f, types.MethodType) - if f.__name__ == 't_ignore': - print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__) - error = 1 - continue + if not optimize: + nargs = f.func_code.co_argcount + if ismethod: + reqargs = 2 + else: + reqargs = 1 + if nargs > reqargs: + print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__) + error = 1 + continue + + if nargs < reqargs: + print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__) + error = 1 + continue + + if tokname == 'ignore': + print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__) + error = 1 + continue - if f.__name__ == 't_error': - lexobj.lexerrorf = f - continue + if tokname == 'error': + errorf[state] = f + continue + + if f.__doc__: + if not optimize: + try: + c = re.compile("(?P<%s>%s)" % (f.__name__,f.__doc__), re.VERBOSE | reflags) + if c.match(""): + print "%s:%d: Regular expression for rule '%s' matches empty string." % (file,line,f.__name__) + error = 1 + continue + except re.error,e: + print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e) + if '#' in f.__doc__: + print "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'." % (file,line, f.__name__) + error = 1 + continue + + if debug: + print "lex: Adding rule %s -> '%s' (state '%s')" % (f.__name__,f.__doc__, state) + + # Okay. The regular expression seemed okay. Let's append it to the master regular + # expression we're building + + regex_list.append("(?P<%s>%s)" % (f.__name__,f.__doc__)) + else: + print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__) + + # Now add all of the simple rules + for name,r in strsym[state]: + tokname = toknames[name] + + if tokname == 'ignore': + ignore[state] = r + continue - if f.__doc__: if not optimize: + if tokname == 'error': + raise SyntaxError,"lex: Rule '%s' must be defined as a function" % name + error = 1 + continue + + if not lexobj.lextokens.has_key(tokname) and tokname.find("ignore_") < 0: + print "lex: Rule '%s' defined for an unspecified token %s." % (name,tokname) + error = 1 + continue try: - c = re.compile("(?P<%s>%s)" % (f.__name__,f.__doc__), re.VERBOSE | reflags) - if c.match(""): - print "%s:%d: Regular expression for rule '%s' matches empty string." % (file,line,f.__name__) + c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | reflags) + if (c.match("")): + print "lex: Regular expression for rule '%s' matches empty string." % name error = 1 continue except re.error,e: - print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e) - if '#' in f.__doc__: - print "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'." % (file,line, f.__name__) + print "lex: Invalid regular expression for rule '%s'. %s" % (name,e) + if '#' in r: + print "lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name + error = 1 continue - if debug: - print "lex: Adding rule %s -> '%s'" % (f.__name__,f.__doc__) - - # Okay. The regular expression seemed okay. 
Let's append it to the master regular - # expression we're building - - regex_list.append("(?P<%s>%s)" % (f.__name__,f.__doc__)) - else: - print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__) + print "lex: Adding rule %s -> '%s' (state '%s')" % (name,r,state) + + regex_list.append("(?P<%s>%s)" % (name,r)) - # Now add all of the simple rules - for name,r in ssymbols: + if not regex_list: + print "lex: No rules defined for state '%s'" % state + error = 1 - if name == 't_ignore': - lexobj.lexignore = r - continue - - if not optimize: - if name == 't_error': - raise SyntaxError,"lex: Rule 't_error' must be defined as a function" - error = 1 - continue - - if not lexobj.lextokens.has_key(name[2:]): - print "lex: Rule '%s' defined for an unspecified token %s." % (name,name[2:]) - error = 1 - continue - try: - c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | reflags) - if (c.match("")): - print "lex: Regular expression for rule '%s' matches empty string." % name - error = 1 - continue - except re.error,e: - print "lex: Invalid regular expression for rule '%s'. %s" % (name,e) - if '#' in r: - print "lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name + regexs[state] = regex_list - error = 1 - continue - if debug: - print "lex: Adding rule %s -> '%s'" % (name,r) - - regex_list.append("(?P<%s>%s)" % (name,r)) if not optimize: - for f in files.keys(): - if not validate_file(f): + for f in files.keys(): + if not _validate_file(f): error = 1 - try: - lexobj.lexre, re_groups = _form_master_re(regex_list,reflags,ldict) - if debug: - for i in range(len(re_groups)): - print "lex: regex[%d] = '%s'" % (i, re_groups[i]) - - # If a lextab was specified, we create a file containing the precomputed - # regular expression and index table - - if lextab and optimize: - lt = open(lextab+".py","w") - lt.write("# %s.py. This file automatically created by PLY. Don't edit.\n" % lextab) - - tabre = [] - for i in range(len(re_groups)): - indexf = [] - for t in lexobj.lexre[i][1]: - if t: - if t[0]: - indexf.append((t[0].__name__,t[1])) - else: - indexf.append((None,t[1])) - else: - indexf.append(None) - tabre.append((re_groups[i],indexf)) - - lt.write("_lexre = %s\n" % repr(tabre)) - - # Create an alternative lexre representation - - lt.write("_lexreflags = %d\n" % reflags) - lt.write("_lextokens = %s\n" % repr(lexobj.lextokens)) - lt.write("_lexignore = %s\n" % repr(lexobj.lexignore)) - if (lexobj.lexerrorf): - lt.write("_lexerrorf = %s\n" % repr(lexobj.lexerrorf.__name__)) - else: - lt.write("_lexerrorf = None\n") - lt.write("_lexliterals = %s\n" % repr(lexobj.lexliterals)) - lt.close() - - except re.error,e: - print "lex: Fatal error. Unable to compile regular expression rules. %s" % e - error = 1 if error: raise SyntaxError,"lex: Unable to build lexer." - if not lexobj.lexerrorf: + + # From this point forward, we're reasonably confident that we can build the lexer. + # No more errors will be generated, but there might be some warning messages. + + # Build the master regular expressions + + for state in regexs.keys(): + lexre, re_text = _form_master_re(regexs[state],reflags,ldict) + lexobj.lexstatere[state] = lexre + lexobj.lexstateretext[state] = re_text + if debug: + for i in range(len(re_text)): + print "lex: state '%s'. 
regex[%d] = '%s'" % (state, i, re_text[i]) + + # For inclusive states, we need to add the INITIAL state + for state,type in stateinfo.items(): + if state != "INITIAL" and type == 'inclusive': + lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL']) + lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL']) + + lexobj.lexstateinfo = stateinfo + lexobj.lexre = lexobj.lexstatere["INITIAL"] + lexobj.lexretext = lexobj.lexstateretext["INITIAL"] + + # Set up ignore variables + lexobj.lexstateignore = ignore + lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","") + + # Set up error functions + lexobj.lexstateerrorf = errorf + lexobj.lexerrorf = errorf.get("INITIAL",None) + if warn and not lexobj.lexerrorf: print "lex: Warning. no t_error rule is defined." - if not lexobj.lexignore: lexobj.lexignore = "" - + # Check state information for ignore and error rules + for s,stype in stateinfo.items(): + if stype == 'exclusive': + if warn and not errorf.has_key(s): + print "lex: Warning. no error rule is defined for exclusive state '%s'" % s + if warn and not ignore.has_key(s) and lexobj.lexignore: + print "lex: Warning. no ignore rule is defined for exclusive state '%s'" % s + elif stype == 'inclusive': + if not errorf.has_key(s): + errorf[s] = errorf.get("INITIAL",None) + if not ignore.has_key(s): + ignore[s] = ignore.get("INITIAL","") + + # Create global versions of the token() and input() functions token = lexobj.token input = lexobj.input lexer = lexobj - + + # If in optimize mode, we write the lextab + if lextab and optimize: + lexobj.writetab(lextab) + return lexobj # ----------------------------------------------------------------------------- -# run() +# runmain() # # This runs the lexer as a main program # ----------------------------------------------------------------------------- @@ -797,6 +861,6 @@ def TOKEN(r): return f return set_doc - - +# Alternative spelling of the TOKEN decorator +Token = TOKEN |