/*
 * A C PreProcessor
 *
 * Handles #define/#undef, #ifdef/#elsif/#else/#endif, and #include using only
 * an ANTLR lexer (actually a stack of them). This could be easily integrated
 * with an existing lexer to do preprocessing and tokenizing all at once.
 *
 * Author: Eric Mahurin - eric_mahurin at yahoo dot com
 * License: just give me credit
 *
 * BUG: missing some of the simpler directives
 * BUG: doesn't follow the cpp spec perfectly - haven't made any effort at
 *      this; not well tested
 *
 * Be aware that this is my first real attempt at both ANTLR and Java, so
 * things may not be done the best way. I welcome suggestions and fixes.
 *
 * 041124 - cpp.g translated and adapted as Python example by MK.
 */

header {
import sys
import StringIO
}

header "__main__" {
import traceback

class cpp:
    def __init__(self, *args):
        try:
            // will need a stack of lexers for #include and macro calls
            self.mainLexer = Lexer(sys.stdin)
            Lexer.selector.select(self.mainLexer)
            for token in Lexer.selector:
                sys.stdout.write(token.getText())
        except Exception, e:
            sys.stderr.write("exception: " + str(e) + '\n')
            traceback.print_exc()

Lexer.selector = antlr.TokenStreamSelector()
cpp(sys.argv[1:])
}

options {
    language = "Python";
}

class cppLexer extends Lexer;

options {
    testLiterals = false;
    k = 4;
}

tokens {
    ENDIF;
}

{
    selector = antlr.TokenStreamSelector()  // must be assigned externally
    ifState = 1      // -1: no-else false, 0: false, 1: true
    ifStates = []    // holds nested if conditions
    defines = {}     // holds the defines
    defineArgs = {}  // holds the args for a macro call

    def uponEOF(self):
        if Lexer.selector.getCurrentStream() != Lexer:
            try:
                Lexer.selector.pop()  // return to old lexer/stream
                Lexer.selector.retry()
            //except antlr.TokenStreamRetryException, tsre:
            //    raise tsre
            except IndexError:
                // return a real EOF if nothing in stack
                pass
}

DIRECTIVE
{
    args = []
    condition = True
}
    :   '#'
        (   "include" (WS)? includeFile:STRING
            {
                if Lexer.ifState == 1:
                    name = includeFile.getText()
                    name = name[1:-1]
                    try:
                        sublexer = Lexer(file(name))
                        // want defines to be persistent
                        sublexer.defines = Lexer.defines
                        sublexer.setFilename(name)
                        Lexer.selector.push(sublexer)
                        Lexer.selector.retry()
                    except IOError, e:
                        sys.stderr.write("cannot find file " + name + '\n')
            }
        |   "define" WS defineMacro:RAW_IDENTIFIER
            {
                args.append("")  // first element will hold the macro text
            }
            (   (   '('  // get arguments if you find them (no spaces before left paren)
                    (WS)? defineArg0:RAW_IDENTIFIER (WS)?
                    { args.append(defineArg0.getText()) }
                    (   COMMA (WS)? defineArg1:RAW_IDENTIFIER (WS)?
                        { args.append(defineArg1.getText()) }
                    )*
                    ')'
                |   ' ' | '\t' | '\f'
                )
                ( options { greedy=true; } : ' ' | '\t' | '\f' )*
                // store the text verbatim - tokenize when called
                defineText:MACRO_TEXT
                { args[0] = defineText.getText() }
            )?
            ('\n' | "\r\n" | '\r') { $newline }
            {
                if Lexer.ifState == 1:
                    Lexer.defines[defineMacro.getText()] = args
                $skip
            }
        |   "undef" WS undefMacro:RAW_IDENTIFIER
            {
                if Lexer.ifState == 1:
                    del Lexer.defines[undefMacro.getText()]
                $skip
            }
        |   (   "ifdef"
            |   "ifndef" { condition = False }
            )
            WS ifMacro:RAW_IDENTIFIER
            {
                Lexer.ifStates.append(Lexer.ifState)
                if Lexer.ifState == 1:
                    if Lexer.defines.has_key(ifMacro.getText()) == condition:
                        Lexer.ifState = 1
                    else:
                        Lexer.ifState = 0
                else:
                    Lexer.ifState = -1
                if Lexer.ifState == 1:
                    $skip
                else:
                    // gobble up tokens until ENDIF (could be caused by else)
                    while True:
                        try:
                            if Lexer.selector.nextToken().getType() == ENDIF:
                                break
                        except antlr.TokenStreamRetryException, r:
                            // just continue if someone tried retry
                            pass
                    // retry in case we switched lexers
                    Lexer.selector.retry()
            }
        |   (   "else"  // treat like elsif (true)
            |   "elsif" WS elsifMacro:RAW_IDENTIFIER
                { condition = Lexer.defines.has_key(elsifMacro.getText()) }
            )
            {
                if Lexer.ifState == 1:
                    // previous if/elsif was taken - discard rest
                    Lexer.ifState = -1
                    while True:
                        try:
                            if Lexer.selector.nextToken().getType() == ENDIF:
                                break
                        except antlr.TokenStreamRetryException, r:
                            // just continue if someone tried retry
                            pass
                    // retry in case we switched lexers
                    Lexer.selector.retry()
                elif Lexer.ifState == 0 and condition:
                    // "elsif" (true) or "else"
                    $setType(ENDIF)
                    Lexer.ifState = 1
            }
        |   "endif"
            {
                if Lexer.ifState == 1:
                    condition = True
                else:
                    condition = False
                try:
                    // return to previous if state
                    Lexer.ifState = Lexer.ifStates[-1]
                    del Lexer.ifStates[-1]
                    if condition:
                        $skip
                    else:
                        // tell if/else/elsif to stop discarding tokens
                        $setType(ENDIF)
                except IndexError, e:
                    // endif with no if
                    pass
            }
        )
    ;

IDENTIFIER
options {
    testLiterals = true;
}
{
    define = []
    args = []
}
    :   identifier:RAW_IDENTIFIER
        {
            // see if this is a macro argument
            define = Lexer.defineArgs.has_key(identifier.getText())
            if define:
                define = Lexer.defineArgs[identifier.getText()]
            elif _begin == 0 and not define:
                // see if this is a macro call
                define = Lexer.defines.has_key(identifier.getText())
                if define:
                    define = Lexer.defines[identifier.getText()]
        }
        (   { define and len(define) > 1 }?
            ( WS | COMMENT )?
            // take in arguments if macro call requires them
            '(' callArg0:EXPR { args.append(callArg0.getText()) }
            ( COMMA callArg1:EXPR { args.append(callArg1.getText()) } )*
            { len(args) == len(define) - 1 }?  // better have right amount
            ')'
        |   { not (define and len(define) > 1) }?
        )
        {
            if define:
                defineText = define[0]
                if _begin:
                    // just substitute text if called from EXPR - no token created
                    $setText(defineText)
                else:
                    // create a new lexer to handle the macro text
                    sublexer = Lexer(StringIO.StringIO(defineText))
                    for i in range(len(args)):
                        // treat macro arguments similar to local defines
                        arg = []
                        arg.append(args[i])
                        sublexer.defineArgs[define[1+i]] = arg
                    Lexer.selector.push(sublexer)
                    // retry in new lexer
                    Lexer.selector.retry()
        }
    ;

STRING
    :   '"' ( '\\' . | ~('\\'|'"') )* '"'         // double quoted string
    |   '\'' ( '\\' . | ~('\\'|'\'') )* '\''      // single quoted string
    ;

protected
MACRO_TEXT
    :   (   '\\'! NL { $newline }   // escaped newline
        |   ~('\n'|'\r')
        )*
    ;

protected
NL
options {
    generateAmbigWarnings = false;  // single '\r' is ambig with '\r' '\n'
}
    :   '\r'
    |   '\n'
    |   '\r' '\n'
    ;

WS  :   (   ' '
        |   '\t'
        |   '\f'
        |   NL { $newline }
        )
        { /* $skip */ }
    ;

COMMENT
    :   (   "//" (~('\n'|'\r'))* NL { $newline }    // single line comment
        |   "/*" ( options { greedy=false; } : NL { $newline } | ~('\n'|'\r') )* "*/"   // multi-line comment
        )
        { /* $skip */ }
    ;

protected
RAW_IDENTIFIER
    :   ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'_'|'0'..'9')*
    ;

NUMBER
    :   ('0'..'9') ('0'..'9'|'a'..'z'|'A'..'Z'|'_')*   // allow alpha suffixes on numbers (e.g. L for long)
    ;

// group symbols into categories to parse EXPR
LEFT    :   '(' | '[' | '{' ;
RIGHT   :   ')' | ']' | '}' ;
COMMA   :   ',' ;
OPERATOR
    :   '!' | '#' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/'
    |   ':' | ';' | '<' | '=' | '>' | '?' | '@' | '\\' | '^' | '`' | '|' | '~'
    ;

protected
EXPR    // allow just about anything without being ambiguous
    :   (WS)? (NUMBER|IDENTIFIER)?
        (   (   LEFT EXPR ( COMMA EXPR )* RIGHT
            |   STRING
            |   OPERATOR   // quotes, COMMA, LEFT, and RIGHT not in here
            )
            EXPR
        )?
    ;
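
/*
 * Usage sketch (illustrative only - the generated module name and the exact
 * command lines below are assumptions, not something this grammar defines):
 *
 *   java antlr.Tool cpp.g        # run ANTLR 2.7 to generate the Python lexer
 *   python cpp.py < input.c      # the "__main__" header reads stdin, writes stdout
 *
 * Given an input.c such as
 *
 *   #define SQUARE(x) ((x)*(x))
 *   #ifdef SQUARE
 *   int n = SQUARE(3);
 *   #endif
 *
 * the directive lines are consumed, the #ifdef branch is kept because SQUARE
 * is defined, and the macro call is expanded by pushing a sublexer over the
 * stored macro text, so the emitted text contains roughly
 *
 *   int n = ((3)*(3));
 *
 * Exact blank lines and spacing depend on the WS/NL rules above, since
 * whitespace tokens are passed through rather than skipped.
 */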