#!/usr/bin/python

# Parsely - A cross-language tool for parsing and file manipulation.
#
# Copyright (C) 1999-2000 Nick Mathewson
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Library General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Library General Public License for more details.
#
# You should have received a copy of the GNU Library General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.

"""PCRE-based scanner.  Based loosely on code from John Aycock's SPARK
   package."""

import re, string
from types import StringType

import parsely # for ParselyException, use_pcre_hack
from earley_parser import _Token
from _util import fileContents

####
## Scanner:
####
class PCREScanner:
    """PCREScanner uses Python's PCRE package to tokenize an input string
       according to some file format.  Each state of the scanner is
       represented by a _ScannerState object.

       The efficiency is limited by that of Python's re module.

       This scanner should be thread-safe and reentrant; I haven't
       tested it, though."""

    ##Fields
    # - Scanner structure
    # DistinctLexemes:
    #   An alias to format.DistinctLexemes. Maps names to lexeme objects.
    # StartState:
    #   The name of the starting state.
    # StateNames:
    #   A list of all state names
    # States:
    #   A mapping from state names to _ScannerState objects.
    # debug:
    #   Are we debugging? (boolean)
    # ContextClass:
    #   The class for ScanContexts of this scanner.

    def __init__(self, format, debug=0):
        """Constructs a new scanner for the file format <format>.

           <format> must provide DistinctLexemes, StartState, States,
           lexemeList, Options and Actions.  <debug> is only stored."""
        self.DistinctLexemes = format.DistinctLexemes
        self.debug = debug
        self.StartState = format.StartState
        self.StateNames = [ "INITIAL" ]

        self.States = { "INITIAL" : _ScannerState("INITIAL", 0) }

        # Compile a list of state names, and record for each state
        # whether it is exclusive.  An "INITIAL" entry in the format
        # replaces the default one created above, but is not listed twice.
        for name, state in format.States.items():
            if name != "INITIAL":
                self.StateNames.append(name)
            self.States[name] = _ScannerState(name, state.exclusive)

        # Ensure a start state exists.
        if not self.StartState:
            self.StartState = "INITIAL"

        # For each lexeme, put the appropriate pattern into whatever
        # states are appropriate
        for t in format.lexemeList:
            for s in t.getStates():
                self.States[s].addLexeme(t)

        # Set the options for the regexen.
        reOptions = re.M
        if format.Options["dotall"]:
            reOptions = reOptions | re.DOTALL
        if format.Options["nocase"]:
            reOptions = reOptions | re.I

        # Compile regexen in all states.
        for s in self.States.values():
            s.compile(reOptions, format)

        self.ContextClass = self._makeContextClass(format)

    def _makeContextClass(self, format):
        """Builds and returns the ScanContext subclass used by tokenize().

           Every lexeme action, and every 'python' action of a kind
           listed in actLists, is attached to the new class as a method.
           'initScan' actions run when a context is constructed and
           'finishScan' actions run from _finish(); 'scanFn' actions are
           attached but not run automatically."""
        actLists = {'initScan': [], 'finishScan': [], 'scanFn': []}

        class ContextC(ScanContext):
            def __init__(self, scanner, tokenization, strng):
                ScanContext.__init__(self, scanner, tokenization, strng)
                for fnId in self.__class__._initActions:
                    self._callAction(fnId)

            def _finish(self):
                for fnId in self.__class__._finishActions:
                    self._callAction(fnId)

            def _callAction(self, actionName):
                # Actions are looked up by name and take only self.
                getattr(self, actionName)()

        for lex in self.DistinctLexemes.values():
            if lex.the_action:
                setattr(ContextC, lex.actionName, lex.the_action)

        for actName, actionMap in format.Actions.items():
            if actionMap.has_key('python'):
                action = actionMap['python']
                if actLists.has_key(action.kind):
                    setattr(ContextC, actName, action.compiled_fn)
                    actLists[action.kind].append(actName)

        ContextC._initActions = actLists['initScan']
        ContextC._finishActions = actLists['finishScan']

        return ContextC

    def tokenize(self, s, returnContext=0, useContext=None):
        """Tokenizes the string s.  It returns its results as a list of
           a single string (the initial space), and _Token objects for each
           token.

           If returnContext is true, a (token list, context) pair is
           returned instead of the bare list.

           useContext is currently ignored: a new context is created
           for every call.

           Unrecognized input is reported through the context's error()
           method.  ScanError is raised if the context has counted any
           errors once the input is exhausted."""

        #XXXX useContext disabled.

        # What line are we on?
        lineNumber = 1

        # Which _Tokens have we seen?  The first is used only for space.
        tokenization = [ _Token(None, "", "", 0) ]
        # How many characters of incomprehensible stuff have we seen?
        errLen = 0
        # On which line did the current run of bad input start?
        errStartsOnLineNumber = -1

        #if useContext:
        #    Context = useContext
        #else:

        Context = self.ContextClass(self, tokenization, s)

        State = self.States[self.StartState]

        pos = 0
        end = len(s)

        # So long as the string isn't done...
        while pos < end:
            # line at the start of the token
            lStart = lineNumber
            # Get the next token
            lexeme, val, posNext = State.scanOneLexeme(s, pos, end)

            # Handle an error: skip one character, keeping it as trailing
            # space of the previous token so that no input is lost.
            if lexeme is None:
                pos = posNext
                errLen = errLen + 1
                lastTokenVal = tokenization[-1].val
                lastTokenVal.trailingSpace = \
                    lastTokenVal.trailingSpace + s[pos]
                if errStartsOnLineNumber < 0:
                    errStartsOnLineNumber = lineNumber
                if s[pos] == '\n':
                    lineNumber = lineNumber + 1
                pos = pos + 1
                continue

            # If we had an error, but don't any more.  The line reported
            # is the one the bad input started on, matching the reported
            # start position (pos-errLen).
            if errLen:
                Context.error(
            "%s characters of unrecognized input on line %s [pos=%s state=%s]"
                  % (errLen, errStartsOnLineNumber, pos-errLen, State.name))
                errLen = 0
                errStartsOnLineNumber = -1

            # Reinterpret, if necessary: the first matching predicate
            # registered for this token type wins.
            if lexeme.isToken():
                typename = lexeme.shortName
                reinterp = Context._reinterprets.get(typename, None)
                if reinterp:
                    for m in reinterp:
                        if m[0](val):
                            lexeme = m[1]
                            break

            # Take the action, if necessary.  The action may change the
            # match, the lexeme and the state; read them all back.
            if lexeme.the_action:
                Context._setState(pos, len(val), lineNumber, lexeme, State)
                try:
                    Context._callAction(lexeme.actionName)
                except "ExplicitContinue":
                    pass
                pos, val, lexeme, appended, State, include = \
                    Context._getState()
            else:
                pos = posNext
                appended = 0
                include = None

            # Add the lexeme as needed
            if appended:
                pass
            elif lexeme and lexeme.isToken():
                tokenization.append( _Token(lexeme.type, val, "", lStart) )
            else: # None or space
                tokenization[-1].moreSpace(val)

            if include:
                includeFile, includeType, includeTokens = include
                tokenization[-1]._includeInfo = include
                Context._includeNodes[includeFile] = \
                    (tokenization[-1], includeType, includeTokens)

            # Enter a new state as appropriate
            if lexeme and lexeme.enterState:
                State = self.States[lexeme.enterState]

            # Increment the line count
            lineNumber = lineNumber + string.count(val, '\n')

        # After we're done, report any errors we found at the end of the
        # string
        if errLen:
            Context.error(
                "%s characters of unrecognized input at lineNumber %s [%s %s]"
                % (errLen, errStartsOnLineNumber, pos-errLen, State.name))

        # The placeholder token is replaced by the space it collected.
        tokenization[0] = tokenization[0].val.trailingSpace
        if Context._errorCount:
            raise ScanError()

        Context._finish()

        if returnContext:
            return tokenization, Context
        else:
            return tokenization

class _ScannerState:
    """A _ScannerState represents one state of a PCRE scanner.
       It contains all the logic necessary to match lexemes in that
       state.
       """

    ##Fields
    # RE:
    #   The RegexObject for this state.  It's an alternation of the
    #   form (?P<Name1>pat1)|(?P<Name2>pat2)..., where the names are
    #   distinct names of lexemes.
    # inclusive:
    #   Is this state inclusive? (boolean)
    # name:
    #   The name of this state
    # patterns:
    #   A list of all the (?P<NameX>patP) patterns in this.
    # added:
    #   Maps the distinct name of every lexeme added so far to 1; used
    #   to reject duplicates.  Not pickled.
    # lexemeAt:
    #   A map from group index to lexeme object.  The index is the
    #   regex group number when pcre_hack is used, and the group number
    #   minus one (the position in match.groups()) otherwise.
    # RE_CODE:
    #   Used by pcre_hack.
    # scanOneLexeme:
    #   Bound to scanOneLexeme_HACK or scanOneLexeme_NOHACK, depending
    #   on parsely.use_pcre_hack.

    def __init__(self, name, exclusive):
        """Creates a new scanner state.  The object will not be ready
           for use until all lexemes have been added with addLexeme()
           and the object has been compiled with compile()."""
        self.name = name
        self.inclusive = not exclusive
        self.patterns = []

        self.added = {}#XXXX

    def __getstate__(self):
        """Returns the picklable part of a compiled state.  The bound
           scan function, RE_CODE and the duplicate-tracking map are
           left out."""
        return (self.name, self.patterns, self.RE,
                self.inclusive, self.lexemeAt)

    def __setstate__(self, state):
        """Restores a state saved by __getstate__ and re-selects the
           scan function."""
        self.name, self.patterns, self.RE, self.inclusive, self.lexemeAt \
                   = state
        self._selectScanFn()

    def _selectScanFn(self):
        """Binds scanOneLexeme to the implementation chosen by
           parsely.use_pcre_hack.  Requires self.RE to be set."""
        if parsely.use_pcre_hack:
            self.RE_CODE = self.RE.code
            self.scanOneLexeme = self.scanOneLexeme_HACK
        else:
            self.scanOneLexeme = self.scanOneLexeme_NOHACK

    def addLexeme(self, lexeme):
        """Adds a new lexeme to this state.

           Raises ValueError if a lexeme with the same distinct name has
           already been added; in that case the state is not changed."""
        assert((self.inclusive and not lexeme.inStates) or
               ('*' in lexeme.inStates) or
               (self.name in lexeme.inStates))

        pat = lexeme.pattern.pat.write(noGrouping=1, forceI=1)
        pat = "(?P<%s>%s)" % (lexeme.distinctName, pat)
        # Two groups with one name could not be compiled into a single
        # regex, so refuse the duplicate here with a usable message.
        if self.added.get(lexeme.distinctName):
            raise ValueError("Lexeme %s added twice to scanner state %s"
                             % (lexeme.distinctName, self.name))
        self.added[lexeme.distinctName] = 1
        self.patterns.append(pat)

    def compile(self, reOptions, format):
        """Processes the lexemes in this state and prepares to scan.

           reOptions is the flag value handed to re.compile; format
           supplies DistinctLexemes, used to map group names back to
           lexeme objects."""
        p = string.join(self.patterns, '|')
        self.RE = re.compile(p, reOptions)
        self.lexemeAt = {}

        # The hack version indexes by group number (slot 0 is skipped
        # there); the portable version indexes match.groups(), which
        # starts at 0.
        if parsely.use_pcre_hack:
            offset = 0
        else:
            offset = 1
        for gname, gidx in self.RE.groupindex.items():
            self.lexemeAt[gidx - offset] = format.DistinctLexemes[gname]

        self._selectScanFn()

    def scanOneLexeme_NOHACK(self, s, pos, end):
        """Works the same as scanOneLexeme_HACK below, but should work
           for any version of the re package.

           Note that <end> is not used here, and that a group which
           matched the empty string is skipped just like a group which
           did not take part in the match."""
        match = self.RE.match(s, pos)
        if not match:
            return None, None, pos
        groups = match.groups()
        for i in range(len(groups)):
            if groups[i]:
                return self.lexemeAt[i], groups[i], match.end()

        # A match in which no group has a non-empty value.
        assert None

    def scanOneLexeme_HACK(self, s, pos, end):
        """Given a string s, tries to get a lexeme from the beginning of
           the string but before end.  If it finds one, returns
                    (lexemeObject, value, nextPos)
           otherwise, returns:
                    (None, None, pos).

           This version uses the undocumented re.code field, and is faster,
           but might break in the future.
           """

        regs = self.RE_CODE.match(s, pos, end, re.ANCHORED)
        if regs is None:
            return None, None, pos
        # Slot 0 is skipped; the first later slot with both offsets set
        # identifies the lexeme.
        for i in range(1, len(regs)):
            g = regs[i]
            if g[0] == -1 or g[1] == -1:
                continue
            else:
                return self.lexemeAt[i], s[g[0]:g[1]], g[1]

        assert None
        
    
class ScanError(parsely.ParselyException):
    """Signals that the scanner could not tokenize its input."""
    def __init__(self, *details):
        # The collected arguments are handed to the base class as one
        # tuple, not unpacked.
        parsely.ParselyException.__init__(self, details)

class ScanContext:
    """A ScanContext object implements the functionality of scanner actions."""

    ##Fields:
    # _appended:
    #   Have we called ACCUMULATE to append the most recent lexeme to the
    #   last? (boolean)
    # _lineNumber:
    #   Linenumber of the curent lexeme.
    # _s:
    #   The input string.
    # _scanner:
    #   reference to the scanner object.
    # _lex:
    #   the current Lexeme object, or None. (None=Space.)
    # _tokenization:
    #   reference to the token list.
    # _reinterprets:
    #   Map from type names to (predicate,type) tuples.  Used to handle
    #   'reinterpret' directives.
    #    When we see (REINTERPRET('Word', 'z+', 'Plus'), we add
    #    Word => ( lambda s, p=re.compile('z+'): p.match(s),
    #              getType('Plus')
    #            ).
    # match:
    #   the string value of the current match.
    # matchLen:
    #   the length of the current match.
    # type:
    #   the name of the current type, or None for space.
    # _errorCount:
    #   How many errors have we found so far?
    # _state:
    #   The current _ScannerState object.
    # _included:
    #   Either None,None or a filename,token-list pair.
    # _includeNodes
    #   Map from filename to node which includes that filename.

    def __init__(self,scanner,tokenization,strng):
        """Creates a new ScanContext associated with a given scanner and
           token list."""
        self._scanner = scanner
        self._tokenization = tokenization
        self._reinterprets = {}
        self._errorCount = 0
        self._includeNodes = {}
        self._s = strng

    def _setState(self, pos, matchLen, lineNumber, lexeme, State):
        """Sets the current state of the ScanContext.  Called before an
           action."""
        self._pos = pos
        self._lex = lexeme
        self.matchLen = matchLen
        self.match = self._s[pos:pos+matchLen]
        self._lineNumber = lineNumber
        self._state = State
        if lexeme.isToken():
	    self.type = lexeme.shortName
	else:
	    self.type = None
        self._appended = 0
        self._included = None

    def getLineNumber(self):
        return self._lineNumber

    def _getState(self):
        return (self._pos + len(self.match), self.match, self._lex,
                self._appended, self._state, self._included)

    def include(self, fname, type, path=None):
	assert parsely.INCLUDES_ENABLED

        contents = fileContents(fname, path)
        tokens = self._scanner.tokenize(contents)
        self._included = (fname, type, tokens)

    def accumulate(self):
        self._tokenization[-1].accumulate(self.match)
        self._appended = 1

    def less(self, l=1):
        self.matchLen = self.matchLen - l
        assert(self.matchLen >= 0)
        self.match = self._s[self._pos:self._pos+self.matchLen]

    def more(self, l=1):
        self.matchLen = self.matchLen + l
        self.match = self._s[self._pos:self._pos+self.matchLen]

    def rest(self):
        return self._s[self.matchLen+self._pos:]

    def setType(self, t):
        self.type = t
        self._lex = self._scanner.DistinctLexemes[t]

    def enterState(self, s):
        state = self._scanner.States.get(s,None)
        if state is None:
            raise ScanError("Scanner has no state", s)
        self._state = state

    def error(self, msg, fatality="FATAL"):
        """Reports an error during the scanning process.  Three levels are
           possible:
                FATAL: stop immediately.
                ERROR: continue, but do not go on to parsing.
                WARN: print an error, but do not prevent parsing."""
	if fatality == "FATAL":
	    raise ScanError(msg)
	elif fatality == "ERROR":
	    print "Scan error:", msg
	elif fatality == "WARN":
	    print "Warning:", msg
        else:
            assert 'line' is not 'reached'

    def becomeSpace(self):
	self._lex = None

    def reinterpret(self, tokenType, matching, as):
        """Adds a reinterpretation to this scanner.

           tokenType - The type to consider reinterpreting.
           matching - The pattern or predicate to match.
           as - The new type of all matching tokens."""


        # Compile the regex, if necessary
        if type(matching) is StringType:
            matching = lambda s, p=re.compile(matching): p.match(s)
        elif isinstance(matching, re.RegexObject):
            matching = lambda s, p=matching: p.match(s)
                
	as = self._scanner.DistinctLexemes[as]
	if self._reinterprets.has_key(tokenType):
	    self._reinterprets[tokenType].insert(0, (matching,as))
	else:
	    self._reinterprets[tokenType] = [(matching,as)]
