git stults online reflex / master rx-lex.l
master

Tree @master (Download .tar.gz)

rx-lex.l @masterraw · history · blame

%{
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <string.h>
#include "rx-limits.h"
/* bison token types, str_reverse come from here */
#include "rx-parse.h"
    
/* Reports a lexer diagnostic: the printf-style arguments are forwarded
 * unchanged to fprintf on stderr.
 * TODO replace this with an error handler that saves off the parser error in an
 * extra type so that it can bubble up through the parser. */
#define ERROR(...) fprintf(stderr, __VA_ARGS__)

/* Emit the parser token `tok` (an integer token code from the bison grammar).
 * The code is stored as the token's own semantic value in yylval->token and
 * then returned from the scanner. Use as a statement: EMIT_AS(FOO); */
#define EMIT_AS(tok)			\
do {					\
    yylval->token = (tok);		\
    return (tok);			\
} while (0)
    
/* Emit a single-character token: the first char of the current lexer text is
 * used both as the bison token code and as the semantic value
 * (yylval->token). */
#define EMIT						\
do {							\
    const int first = yyget_text(yyscanner)[0];		\
    yylval->token = first;				\
    return first;					\
} while (0)

/* Emit an ID token. The semantic value (yylval->string) is a heap copy of the
 * raw (utf8) bytes of the current lexer text, limited to ID_MAX bytes by
 * strndup. The copy is handed to the parser as-is; strndup may yield NULL. */
#define EMIT_ID							\
do {								\
    yylval->string = strndup(yyget_text(yyscanner), ID_MAX);	\
    return ID;							\
} while (0)

/* Emit a STR token. Works like EMIT_ID, but the current lexer text is assumed
 * to start and end with a double quote; one byte is dropped at each end and
 * the remainder is copied into yylval->string. The length of the text is
 * measured with strnlen capped at STR_MAX.
 * TODO have to think about data segmentation if this is UTF 8 -- one char in
 * front and back might not work to remove the quotes. */
#define EMIT_STR							\
do {									\
    const char *quoted = yyget_text(yyscanner);				\
    const size_t quoted_len = strnlen(quoted, STR_MAX);			\
    yylval->string = strndup(quoted + 1, quoted_len - 2);		\
    return STR;								\
} while (0)

/* TODO make the following less redundant. */
/* Emit a BOOL token whose semantic value (yylval->boolean) is `val`. */
#define EMIT_BOOL(val)				\
do {						\
    yylval->boolean = (val);			\
    return BOOL;				\
} while (0)

/* Emit an UNCONVERTED_INT token. The digits are passed on as text
 * (yylval->string, a heap copy of at most STR_MAX bytes) rather than as a
 * number: whether they read left-to-right or right-to-left depends on the
 * direction of the enclosing block, which only the parser knows, so the
 * reversal and conversion to a c type are left to it. */
#define EMIT_UNCONVERTED_INT					\
do {								\
    const char *digits = yyget_text(yyscanner);			\
    yylval->string = strndup(digits, strnlen(digits, STR_MAX));	\
    return UNCONVERTED_INT;					\
} while (0)

/* Converts an integer literal written in right (conventional) orientation to
 * its value. The accepted spellings follow LX_RIGHT_BIN, LX_RIGHT_HEX and
 * LX_RIGHT_OCT: "0b"/"0B" + binary digits, "0x"/"0X" + hex digits and
 * "o"/"O" + octal digits. Any other text is read as decimal and NULL yields 0.
 * atoi cannot be used for these literals: it only reads decimal digits, stops
 * at the radix letter and therefore returns 0 for every prefixed spelling.
 * Values that do not fit in an int are narrowed by the final conversion. */
static int rx_lex_prefixed_int(const char *text)
{
    const char *digits = text;
    int base = 10;

    if (text == NULL)
        return 0;

    if (text[0] == 'o' || text[0] == 'O') {
        digits = text + 1;
        base = 8;
    } else if (text[0] == '0' && (text[1] == 'b' || text[1] == 'B')) {
        digits = text + 2;
        base = 2;
    } else if (text[0] == '0' && (text[1] == 'x' || text[1] == 'X')) {
        digits = text + 2;
        base = 16;
    }

    return (int)strtol(digits, NULL, base);
}

/* Emit a RIGHT_INT token: the current lexer text is a prefixed integer in
 * right orientation; its value is stored in yylval->integer. */
#define EMIT_RIGHT_INT							\
do {									\
    yylval->integer = rx_lex_prefixed_int(yyget_text(yyscanner));	\
    return RIGHT_INT;							\
} while (0)

/* Emit a LEFT_INT token: the current lexer text is a prefixed integer written
 * mirror-image. A copy (at most STR_MAX bytes) is reversed with str_reverse,
 * converted like a right integer, and released again; the value is stored in
 * yylval->integer (0 if the copy could not be made).
 * NOTE(review): assumes str_reverse reverses in place and returns the buffer
 * it was given, so freeing its result releases the strndup copy -- confirm
 * against its definition in rx-parse.h. */
#define EMIT_LEFT_INT							\
do {									\
    char *copy = strndup(yyget_text(yyscanner), STR_MAX);		\
    char *reversed = (copy != NULL) ? str_reverse(copy) : NULL;		\
    if (reversed == NULL)						\
        ERROR("rx-lex: could not reverse integer literal\n");		\
    yylval->integer = rx_lex_prefixed_int(reversed);			\
    free(reversed);							\
    return LEFT_INT;							\
} while (0)

/* Emit an UNCONVERTED_FLOAT token. As with UNCONVERTED_INT the literal is
 * passed on as text (yylval->string, a heap copy of at most STR_MAX bytes):
 * whether it reads left-to-right or right-to-left depends on the direction of
 * the enclosing block, which only the parser knows, so the reversal and
 * conversion to a c type are left to it. */
#define EMIT_UNCONVERTED_FLOAT					\
do {								\
    const char *digits = yyget_text(yyscanner);			\
    yylval->string = strndup(digits, strnlen(digits, STR_MAX));	\
    return UNCONVERTED_FLOAT;					\
} while (0)

/* Emit a RIGHT_FLOAT token: the current lexer text is converted to a
 * floating point value and stored in yylval->floating. */
#define EMIT_RIGHT_FLOAT					\
do {								\
    yylval->floating = strtod(yyget_text(yyscanner), NULL);	\
    return RIGHT_FLOAT;						\
} while (0)

/* Emit a LEFT_FLOAT token: the current lexer text is a float written
 * mirror-image. A copy (at most STR_MAX bytes) is reversed with str_reverse,
 * converted with atof, and released again instead of being leaked; the value
 * is stored in yylval->floating (0.0 if the copy could not be made).
 * NOTE(review): assumes str_reverse reverses in place and returns the buffer
 * it was given, so freeing its result releases the strndup copy -- confirm
 * against its definition in rx-parse.h. */
#define EMIT_LEFT_FLOAT							\
do {									\
    char *copy = strndup(yyget_text(yyscanner), STR_MAX);		\
    char *reversed = (copy != NULL) ? str_reverse(copy) : NULL;		\
    if (reversed == NULL)						\
        ERROR("rx-lex: could not reverse float literal\n");		\
    yylval->floating = (reversed != NULL) ? atof(reversed) : 0.0;	\
    free(reversed);							\
    return LEFT_FLOAT;							\
} while (0)
    
%}

/* Horizontal whitespace: one space or one tab. */
WHITESPACE [ \t]

/* Identifiers. Both classes are negated sets: any byte is legal except the
 * listed punctuation and operator characters, CR, LF, tab, NUL and space.
 * LX_LEGAL_FIRST additionally rejects an underscore, so an identifier may
 * contain one but not start with one. Digits are not excluded, so text such
 * as a plain number also fits LX_ID. */
LX_LEGAL_FIRST [^_:;+\-*/%|&\^!><=()@\\#$'"?\{\},.~`\[\]\r\n\t\0 ]
LX_LEGAL       [^:;+\-*/%|&\^!><=()@\\#$'"?\{\},.~`\[\]\r\n\t\0 ]
LX_ID          {LX_LEGAL_FIRST}{LX_LEGAL}*

/* Fixed multi-character lexemes, written as quoted literals. Each RIGHT_ form
 * has a LEFT_ counterpart that is the same lexeme mirrored. */
LX_RIGHT_ARROW "->"
LX_LEFT_ARROW "<-"
LX_RIGHT_BIG_ARROW "==>"
LX_LEFT_BIG_ARROW "<=="
LX_RIGHT_RW ">>"
LX_LEFT_RW "<<"
LX_ETC ".."

/* Lexemes behind the *_TYPE tokens: an empty pair of double quotes, a
 * question mark, an apostrophe, a dot, an empty pair of square brackets, an
 * empty pair of braces and a tilde. */
LX_STR_TYPE \"\"
LX_BOOL_TYPE \?
LX_INT_TYPE '
LX_FLOAT_TYPE \.
LX_LIST_TYPE \[\]
LX_STRUCT_TYPE \{\}
LX_XFORM_TYPE ~

/* Building blocks for numeric literals. The RIGHT_ patterns are the usual
 * spellings: an exponent suffix, and binary (0b), hex (0x) and octal (o)
 * prefixes followed by digits. Each LEFT_ pattern is its RIGHT_ pattern
 * mirrored character for character, e.g. digits followed by "b0". */
LX_RIGHT_EXP  [eE][+\-]?[0-9]+
LX_RIGHT_BIN  0[bB][01]+
LX_RIGHT_HEX  0[xX][0-9A-Fa-f]+
LX_RIGHT_OCT  [oO][0-7]+
LX_LEFT_EXP  [0-9]+[+\-]?[eE]
LX_LEFT_BIN  [01]+[bB]0
LX_LEFT_HEX  [0-9A-Fa-f]+[xX]0
LX_LEFT_OCT  [0-7]+[oO]

/* A single decimal digit. */
LX_DIG  [0-9]

/* Boolean literals. */
LX_BOOL_TRUE "!!"
LX_BOOL_FALSE "??"

/* Integers. A bare run of decimal digits reads the same in either direction
 * apart from digit order, so it stays unconverted; a radix marker fixes the
 * direction as right or left. */
LX_UNCONVERTED_INT {LX_DIG}+
LX_RIGHT_INT ({LX_RIGHT_BIN}|{LX_RIGHT_HEX}|{LX_RIGHT_OCT})
LX_LEFT_INT  ({LX_LEFT_BIN}|{LX_LEFT_HEX}|{LX_LEFT_OCT})

/* Floats. Without an exponent (digits around a dot, with digits required on
 * at least one side) the literal stays unconverted; an exponent at the end
 * makes it a right float, an exponent at the start a left float. */
LX_UNCONVERTED_FLOAT ({LX_DIG}*[.]{LX_DIG}+)|({LX_DIG}+[.]{LX_DIG}*)
LX_RIGHT_FLOAT ({LX_DIG}*[.]{LX_DIG}+{LX_RIGHT_EXP}|{LX_DIG}+[.]?{LX_DIG}*{LX_RIGHT_EXP})
LX_LEFT_FLOAT  ({LX_LEFT_EXP}{LX_DIG}+[.]{LX_DIG}*|{LX_LEFT_EXP}{LX_DIG}*[.]?{LX_DIG}+)

/* String literals: a double quote, one or more bytes that are not a double
 * quote, CR, LF or NUL, and a closing double quote. There is no escape
 * sequence for an embedded quote, and the empty pair of quotes is not matched
 * here (that lexeme is LX_STR_TYPE). */
LX_STR_INTERNAL [^"\r\n\0]
LX_STR \"{LX_STR_INTERNAL}+\"

/* Scanner configuration (see the flex manual for each option):
 *   reentrant       - scanner state lives in the yyscanner handle that the
 *                     macros and rules in this file pass to yyget_text,
 *                     yy_push_state and yy_pop_state.
 *   bison-locations - yylex takes yylval and yylloc pointers for a bison pure
 *                     parser; the EMIT macros write through yylval.
 *   nounput, noinput, noyy_top_state - do not generate these helpers.
 *   noyywrap        - no yywrap call at end of input.
 *   yylineno        - keep a current line number.
 *   stack           - enable the start condition stack used by the comment
 *                     rules.
 * The comment start condition is exclusive (%x): while it is active only
 * rules tagged with it are considered. */
%option reentrant 
%option bison-locations
%option nounput noinput noyy_top_state noyywrap
%option yylineno
%option stack

%x comment

%%

"/*"                    { yy_push_state(comment, yyscanner); }
    /* Block comments are skipped in the exclusive comment state.
     *  - The terminator closes the comment wherever it appears, including
     *    directly after the opener, unless a backslash precedes it.
     *  - Blanks after the terminator are written as [ \t]: flex has no
     *    backslash-s class, that escape is just the letter s.
     *  - Newlines are consumed explicitly, because a dot never matches a
     *    newline and unmatched input would be echoed by the default rule. */
<comment>\\"*/"         /* a backslash-escaped terminator stays inside the comment */
<comment>"*/"[ \t]*(\r\n|\r|\n)? { yy_pop_state(yyscanner); /* also eats trailing blanks and one line break */ }
<comment>.|\n           /* eat everything else in a comment block, newlines included */

{LX_RIGHT_ARROW}       	{ EMIT_AS(RIGHT_ARROW); }	/* arrows, rewrites and etc: each lexeme maps to its own named token */
{LX_LEFT_ARROW}        	{ EMIT_AS(LEFT_ARROW); }
{LX_RIGHT_BIG_ARROW}   	{ EMIT_AS(RIGHT_BIG_ARROW); }	/* three characters, so it outranks the two-character == and => rules */
{LX_LEFT_BIG_ARROW}    	{ EMIT_AS(LEFT_BIG_ARROW); }
{LX_RIGHT_RW}          	{ EMIT_AS(RIGHT_RW); }
{LX_LEFT_RW}           	{ EMIT_AS(LEFT_RW); }
{LX_ETC}               	{ EMIT_AS(ETC); }	/* two dots; a single dot is FLOAT_TYPE */

{LX_STR_TYPE}          	{ EMIT_AS(STR_TYPE); }	/* type lexemes: each maps to its own named *_TYPE token */
{LX_BOOL_TYPE}         	{ EMIT_AS(BOOL_TYPE); }	/* one question mark; two of them form the false literal */
{LX_INT_TYPE}          	{ EMIT_AS(INT_TYPE); }
{LX_FLOAT_TYPE}        	{ EMIT_AS(FLOAT_TYPE); }
{LX_LIST_TYPE}         	{ EMIT_AS(LIST_TYPE); }	/* only the adjacent bracket pair; lone brackets are emitted as characters */
{LX_STRUCT_TYPE}       	{ EMIT_AS(STRUCT_TYPE); }	/* only the adjacent brace pair; lone braces are emitted as characters */
{LX_XFORM_TYPE}        	{ EMIT_AS(XFORM_TYPE); }

{LX_BOOL_TRUE}         	{ EMIT_BOOL(true); }	/* literals: the semantic value is filled in by the EMIT_ macro named in each action */
{LX_BOOL_FALSE}        	{ EMIT_BOOL(false); }
{LX_UNCONVERTED_INT} 	{ EMIT_UNCONVERTED_INT; }	/* plain digits are handed over as text */
{LX_RIGHT_INT}         	{ EMIT_RIGHT_INT; }
{LX_LEFT_INT}          	{ EMIT_LEFT_INT; }
{LX_UNCONVERTED_FLOAT}  { EMIT_UNCONVERTED_FLOAT; }	/* no exponent: handed over as text */
{LX_RIGHT_FLOAT}       	{ EMIT_RIGHT_FLOAT; }	/* listed before LEFT_FLOAT, so text matching both patterns is a right float */
{LX_LEFT_FLOAT}        	{ EMIT_LEFT_FLOAT; }

{LX_STR}                { EMIT_STR; }

{LX_ID}                 { EMIT_ID; }	/* keep after the numeric rules: digits are legal identifier bytes and equal-length matches go to the earlier rule */

":"  { EMIT_AS(QUALIFY); }	/* colon and parentheses get named tokens */
"("  { EMIT_AS(LEFT_PAREN); }
")"  { EMIT_AS(RIGHT_PAREN); }

"["  { EMIT; }	/* lone brackets and braces: EMIT returns the character itself as the token */
"]"  { EMIT; }
"{"  { EMIT; }
"}"  { EMIT; }

"_"  { EMIT; }	/* single-character operators, also returned as the character itself */
"+"  { EMIT; }
"-"  { EMIT; }
"*"  { EMIT; }
"/"  { EMIT; }
"%"  { EMIT; }
"|"  { EMIT; }
"&"  { EMIT; }
"^"  { EMIT; }
"!"  { EMIT; }
"="  { EMIT; }
">"  { EMIT; }
"<"  { EMIT; }
">=" { EMIT_AS(RIGHT_GEQ); }	/* two-character comparisons: longer than the rules above, so flex prefers them */
"=>" { EMIT_AS(LEFT_GEQ); }	/* each LEFT_ lexeme is its RIGHT_ lexeme mirrored */
"<=" { EMIT_AS(RIGHT_LEQ); }
"=<" { EMIT_AS(LEFT_LEQ); }
"!=" { EMIT_AS(RIGHT_NE); }
"=!" { EMIT_AS(LEFT_NE); }
"==" { EMIT_AS(EQ); }	/* reads the same in both directions, hence a single token */

[\n\r]+ /* eat newlines: a run of CR and LF characters produces no token */

{WHITESPACE} /* eat whitespace: a space or tab produces no token */

. { /* pass everything else through to the parser so that errors and reportage
       can all happen in one place; the token code is the character itself */
    EMIT;
  }

%%