diff --git a/src/ccg2xml/lex.py b/src/ccg2xml/lex.py
index 94453d7..7c559fd 100644
--- a/src/ccg2xml/lex.py
+++ b/src/ccg2xml/lex.py
@@ -11,19 +11,19 @@
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.
-# 
+#
 # This library is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 # Lesser General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU Lesser General Public
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-# 
+#
 # See the file COPYING for a complete copy of the LGPL.
 #
-# 
+#
 # This module automatically constructs a lexical analysis module from regular
 # expression rules defined in a user-defined module. The idea is essentially the same
 # as that used in John Aycock's Spark framework, but the implementation works
@@ -102,9 +102,9 @@ def t_NUMBER(t):
                name of the rule without the leading t_
     t.value = The value of the lexeme.
     t.lineno = The value of the line number where the token was encountered
-    
+
 For example, the t_NUMBER() rule above might be called with the following:
-    
+
     t.type = 'NUMBER'
     t.value = '42'
     t.lineno = 3
@@ -182,7 +182,7 @@ def t_newline(t):
 above, parsing modules can safely import 'plex' without having to import
 the rule file or any additional imformation about the scanner you have
 defined.
-""" 
+"""
 
 # -----------------------------------------------------------------------------
 
@@ -241,11 +241,11 @@ def __copy__(self):
         c.lexerrorf = self.lexerrorf
         c.lextokens = self.lextokens
         c.lexignore = self.lexignore
-        c.debug = self.debug 
+        c.debug = self.debug
         c.lineno = self.lineno
         c.optimize = self.optimize
         c.token = c.realtoken
-        return c 
+        return c
 
     # ------------------------------------------------------------
     # input() - Push a new string into the lexer
@@ -257,7 +257,7 @@ def input(self,s):
         self.lexpos = 0
         self.lexlen = len(s)
         self.token = self.realtoken
-        
+
         # Change the token routine to point to realtoken()
         global token
         if token == self.errtoken:
@@ -268,7 +268,7 @@
     # ------------------------------------------------------------
     def errtoken(self):
         raise RuntimeError, "No input string given with input()"
-    
+
     # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
@@ -282,7 +282,7 @@ def realtoken(self):
         lexlen = self.lexlen
         lexignore = self.lexignore
         lexdata = self.lexdata
-        
+
         while lexpos < lexlen:
             # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
             if lexdata[lexpos] in lexignore:
@@ -302,15 +302,15 @@
                 if not func:
                     self.lexpos = lexpos
                     return tok
-                
+
                 # If token is processed by a function, call it
                 self.lexpos = lexpos
                 newtok = func(tok)
                 self.lineno = tok.lineno        # Update line number
-                
+
                 # Every function must return a token, if nothing, we just move to next token
                 if not newtok: continue
-                
+
                 # Verify type of the token. If not in the token map, raise an error
                 if not self.optimize:
                     if not self.lextokens.has_key(newtok.type):
@@ -345,7 +345,7 @@ def realtoken(self):
 
         self.lexpos = lexpos + 1
         return None
-        
+
 # -----------------------------------------------------------------------------
 # validate_file()
 #
@@ -405,7 +405,7 @@ def _read_lextab(lexer, fdict, module):
     lexer.lexignore = lextab._lexignore
     if lextab._lexerrorf:
         lexer.lexerrorf = fdict[lextab._lexerrorf]
-        
+
 # -----------------------------------------------------------------------------
 # lex(module)
 #
@@ -420,7 +420,7 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab"):
     lexer.debug = debug
     lexer.optimize = optimize
     global token,input
-    
+
     if module:
         # User supplied a module object.
         if isinstance(module, types.ModuleType):
@@ -432,7 +432,7 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab"):
                 ldict[i] = v
         else:
             raise ValueError,"Expected a module or instance"
-        
+
     else:
         # No module given. We might be able to get information from the caller.
         try:
@@ -446,14 +446,14 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab"):
     if optimize and lextab:
         try:
             _read_lextab(lexer,ldict, lextab)
-            if not lexer.lexignore: lexer.lexignore = "" 
+            if not lexer.lexignore: lexer.lexignore = ""
             token = lexer.token
             input = lexer.input
             return lexer
-        
+
         except ImportError:
             pass
-        
+
     # Get the tokens map
     if (module and isinstance(module,types.InstanceType)):
         tokens = getattr(module,"tokens",None)
@@ -462,7 +462,7 @@ def lex(module=None,debug=0,optimize=0,lextab="lextab"):
             tokens = ldict["tokens"]
         except KeyError:
             tokens = None
-        
+
     if not tokens:
         raise SyntaxError,"lex: module does not define 'tokens'"
     if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)):
@@ -477,7 +477,7 @@ def is_identifier(s):
             for c in s:
                 if not (c.isalnum() or c == '_'): return 0
             return 1
-        
+
         for n in tokens:
             if not is_identifier(n):
                 print "lex: Bad token name '%s'" % n
@@ -487,14 +487,14 @@ def is_identifier(s):
             lexer.lextokens[n] = None
     else:
         for n in tokens: lexer.lextokens[n] = None
-        
+
 
     if debug:
         print "lex: tokens = '%s'" % lexer.lextokens.keys()
 
     # Get a list of symbols with the t_ prefix
     tsymbols = [f for f in ldict.keys() if f[:2] == 't_']
-    
+
     # Now build up a list of functions and a list of strings
     fsymbols = [ ]
     ssymbols = [ ]
@@ -506,20 +506,20 @@ def is_identifier(s):
         else:
             print "lex: %s not defined as a function or string" % f
             error = 1
-            
+
     # Sort the functions by line number
     fsymbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno))
 
     # Sort the strings by regular expression length
     ssymbols.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
-    
+
     # Check for non-empty symbols
     if len(fsymbols) == 0 and len(ssymbols) == 0:
         raise SyntaxError,"lex: no rules of the form t_rulename are defined."
 
     # Add all of the rules defined with actions first
     for f in fsymbols:
-        
+
         line = f.func_code.co_firstlineno
         file = f.func_code.co_filename
         files[file] = None
@@ -546,7 +546,7 @@ def is_identifier(s):
             print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__)
             error = 1
             continue
-        
+
         if f.__name__ == 't_error':
             lexer.lexerrorf = f
             continue
@@ -565,7 +565,7 @@ def is_identifier(s):
 
             # Okay. The regular expression seemed okay. Let's append it to the master regular
             # expression we're building
-            
+
             if (regex): regex += "|"
             regex += "(?P<%s>%s)" % (f.__name__,f.__doc__)
         else:
@@ -577,13 +577,13 @@ def is_identifier(s):
         if name == 't_ignore':
             lexer.lexignore = r
             continue
-        
+
         if not optimize:
             if name == 't_error':
                 raise SyntaxError,"lex: Rule 't_error' must be defined as a function"
                 error = 1
                 continue
-        
+
             if not lexer.lextokens.has_key(name[2:]):
                 print "lex: Rule '%s' defined for an unspecified token %s." % (name,name[2:])
                 error = 1
@@ -596,7 +596,7 @@ def is_identifier(s):
                 continue
             if debug:
                 print "lex: Adding rule %s -> '%s'" % (name,r)
-            
+
         if regex: regex += "|"
         regex += "(?P<%s>%s)" % (name,r)
 
@@ -622,7 +622,7 @@ def is_identifier(s):
 
         # If a lextab was specified, we create a file containing the precomputed
         # regular expression and index table
-        
+
         if lextab and optimize:
             lt = open(lextab+".py","w")
             lt.write("# %s.py. This file automatically created by PLY. Don't edit.\n" % lextab)
@@ -637,7 +637,7 @@ def is_identifier(s):
                         lt.write(" (None,%s),\n" % repr(t[1]))
                 else:
                     lt.write(" None,\n")
-                    
+
             lt.write("]\n");
             lt.write("_lextokens = %s\n" % repr(lexer.lextokens))
             lt.write("_lexignore = %s\n" % repr(lexer.lexignore))
@@ -646,7 +646,7 @@ def is_identifier(s):
             else:
                 lt.write("_lexerrorf = None\n")
             lt.close()
-        
+
     except re.error,e:
         print "lex: Fatal error. Unable to compile regular expression rules. %s" % e
         error = 1
@@ -656,11 +656,11 @@ def is_identifier(s):
         print "lex: Warning. no t_error rule is defined."
 
     if not lexer.lexignore: lexer.lexignore = ""
-    
+
     # Create global versions of the token() and input() functions
    token = lexer.token
    input = lexer.input
-    
+
     return lexer
 
 # -----------------------------------------------------------------------------
@@ -689,12 +689,8 @@ def runmain(lexer=None,data=None):
         _token = lexer.token
     else:
         _token = token
-        
+
     while 1:
         tok = _token()
         if not tok: break
         print "(%s,'%s',%d)" % (tok.type, tok.value, tok.lineno)
-
-
-
-
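For reference, the docstring hunks above describe the rule-module convention that this lex.py consumes: a 'tokens' list, t_-prefixed string and function rules, t_ignore, an optional t_error handler, and a lexer built with lex.lex() and driven through lex.input()/lex.token(). Below is a minimal sketch of such a rule module written against that documented interface; the token names, the sample input, and the t.skip(1) recovery call are illustrative assumptions, not part of this patch.

    import lex

    # Required list of token names (checked by lex.lex()).
    tokens = ('NUMBER', 'PLUS')

    # Simple rules can be plain strings: t_TOKENNAME = regex.
    t_PLUS = r'\+'

    # Characters to skip entirely (spaces and tabs).
    t_ignore = ' \t'

    # Function rules receive a token with t.type, t.value, t.lineno set.
    def t_NUMBER(t):
        r'\d+'
        t.value = int(t.value)
        return t

    # Track line numbers; returning nothing discards the token.
    def t_newline(t):
        r'\n+'
        t.lineno += t.value.count("\n")

    def t_error(t):
        print "Illegal character '%s'" % t.value[0]
        t.skip(1)    # assumed recovery helper; the bundled version may differ

    # Build the lexer from this module's rules and tokenize a string,
    # mirroring the loop in runmain() at the end of the patched file.
    lex.lex()
    lex.input("3 + 4")
    while 1:
        tok = lex.token()
        if not tok: break
        print "(%s,'%s',%d)" % (tok.type, tok.value, tok.lineno)

With no argument, lex.lex() inspects the caller's globals for the tokens list and t_ rules, which is why the sketch can define everything in one module.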