# See: Tieins.eyp
# See: http://www.gnu.org/software/bison/manual/html_mono/bison.html#Lexical-Tie_002dins
# for a more detailed explanation.
%strict

%token ID INT INTEGER
%syntactic token HEX

%{
# Symbol table: the names introduced by "int" declarations.
# It is filled by the decl action and consulted by the lexer.
my %st;
%}

%lexer {
    # Context-dependent lexer. $self is the parser object; the unbound
    # matches below scan $_ from the current pos (anchored with \G).
    # Returns a (token, attribute) pair, or ('', undef) at end of input.

    # Nonzero while the parser is inside a hex( ... ) construct
    my $hexflag = $self->{HEXFLAG};

    # Skip any run of white space and comments. A single optional
    # comment is not enough: the newline that ends it would be left
    # at the current position, nothing below matches a newline, and
    # the lexer would report end of input too early.
    m{\G(?:\s+|\#.*)*}gc;

    # Keywords (case insensitive); the token name is the upper-cased keyword
    m{\G(HEX\b|INT\b)}igc and return (uc($1), $1);

    if ($hexflag) {
      # A token starting with a digit can never be an identifier, so in
      # hex mode the A-F digits that follow belong to the same number
      m{\G(\d[A-F0-9]*)}gc and return ('INTEGER', hex($1));
    }
    else {
      m{\G(\d+)}gc and return ('INTEGER', $1);
    }

    m{\G([a-zA-Z_]\w*)}gc and do {
      my $match = $1;

      # In hex mode an undeclared name made only of hex digits
      # is a number; declared names always win
      $hexflag and !exists($st{$match}) and $match =~ m{\A[A-F0-9]+\z}
        and return ('INTEGER', hex($match));

      return ('ID', $match);
    };

    # Any other single character is its own token
    m{\G(.)}gc and return ($1, $1);

    # Nothing left: end of input
    return('',undef);
  }

%right '='
%left '+'

%tree bypass alias

%%
stmt:
    decl <* ';'> expr <%name EXPS + ';'>
      {
        # make a copy of the symbol table an attribute
        # of the root node and return that node
        $_[2]->{st} = { %st };
        $_[2];
      }
;

decl:
    INT ID <+ ','>
      {
        # insert identifiers in the symbol table
        $st{$_->{attr}} = 1 for $_[2]->children();
      }
;

expr:
    %name ID
    ID
  | %name NUM
    INTEGER
  | %name HEX
    HEX '('
      {
        # Lexical tie-in: from here on the lexer reads hexadecimal.
        # HEXFLAG is a nesting counter rather than a boolean, so that
        # closing an inner hex( ... ) does not switch hex mode off
        # for the rest of the enclosing one.
        ++$_[0]->{HEXFLAG};
      }
    $expr ')'
      {
        --$_[0]->{HEXFLAG};
        $expr;
      }
  | %name ASSIGN
    id '=' expr
  | %name PLUS
    expr '+' expr
;

id : ID
;

%%

# Context-dependent lexer

=head1 SYNOPSIS

Compile it with:

    $ eyapp -C SemanticInfoInTokens.eyp

Run it with:

    $ ./SemanticInfoInTokens.pm -t -i -f inputforsemanticinfo.txt

try also:

    ./SemanticInfoInTokens.pm -t -i -f inputforsemanticinfo2.txt

=head1 THE TYPENAME-IDENTIFIER PROBLEM WHEN PARSING THE C<C> LANGUAGE

The C language has a context dependency: the way an identifier is used depends
on what its current meaning is. For example, consider this:

    T(x);

This looks like a function call statement, but if C<T> is a typedef name, then
this is actually a declaration of C<x>. How can a parser for C decide how to
parse this input?

Here is another example:

    {
      T * x;
      ...
    }

What is this, a declaration of C<x> as a pointer to C<T>,
or a void multiplication of the variables C<T> and C<x>?

The usual method to solve this problem is to have two different token types, C<ID> and C<TYPENAME>.
When the lexer finds an identifier, it looks up the identifier's current
declaration in the symbol table in order to decide which token type to
return: C<TYPENAME> if the identifier is declared as a typedef, C<ID> otherwise.

=head1 THIS EXAMPLE

One way to handle context-dependency is the lexical tie-in: a flag which is set
by the semantic actions, whose purpose is to alter the way tokens are parsed.

In this "Calc"-like example we have a language with a special construct
C<hex (hex-expr)>. After the keyword C<hex> comes an C<expression> in
parentheses in which all integers are hexadecimal. In particular, strings
in C</[A-F0-9]+/> like C<A1B> must be treated as a hex integer unless
they were previously declared.

Here the lexer looks at the value of the hexflag attribute; when it is nonzero,
all integers are parsed in hexadecimal, and tokens starting with letters are
parsed as integers if possible.

=head1 SEE ALSO

=over 2

=item * File: Tieins.eyp

=item * L<http://www.gnu.org/software/bison/manual/html_mono/bison.html#Lexical-Tie_002dins>

=item * L<http://en.wikipedia.org/wiki/The_lexer_hack>

=item * L<http://eli.thegreenplace.net/2007/11/24/the-context-sensitivity-of-cs-grammar/>

=back