scripts/genksyms/lex.l - linux - Git at Google

 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * Lexical analysis for genksyms.
  * Copyright 1996, 1997 Linux International.
  *
  * New implementation contributed by Richard Henderson <rth@tamu.edu>
  * Based on original work by Bjorn Ekwall <bj0rn@blox.se>
  *
  * Taken from Linux modutils 2.4.22.
  */

 %{

 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>

 #include "genksyms.h"
 #include "parse.tab.h"

 /* We've got a two-level lexer here.  We let flex do basic tokenization
    and then we categorize those basic tokens in the second stage.
    YY_DECL makes the flex-generated scanner the file-local yylex1();
    the yylex() defined at the end of this file is the second stage and
    is the one that calls yylex1().  */
 #define YY_DECL		static int yylex1(void)

 %}

 /* Identifiers: letters, digits and '_', with '$' accepted as well.  */
 IDENT			[A-Za-z_\$][A-Za-z0-9_\$]*

 /* Integer constants: octal, decimal or hexadecimal, optionally followed
    by one of the suffixes U, L, UL or LU (either case).  Longer suffixes
    such as LL are not covered by I_SUF.  */
 O_INT			0[0-7]*
 D_INT			[1-9][0-9]*
 X_INT			0[Xx][0-9A-Fa-f]+
 I_SUF			[Uu]|[Ll]|[Uu][Ll]|[Ll][Uu]
 INT			({O_INT}|{D_INT}|{X_INT}){I_SUF}?

 /* Floating constants: either a fraction with an optional exponent, or
    digits with a mandatory exponent; an F or L suffix may follow.  */
 FRAC			([0-9]*\.[0-9]+)|([0-9]+\.)
 EXP			[Ee][+-]?[0-9]+
 F_SUF			[FfLl]
 REAL			({FRAC}{EXP}?{F_SUF}?)|([0-9]+{EXP}{F_SUF}?)

 /* String and character literals with an optional L prefix.  A backslash
    escapes whatever single character follows it, so an escaped quote
    does not terminate the literal.  */
 STRING			L?\"([^\\\"]*\\.)*[^\\\"]*\"
 CHAR			L?\'([^\\\']*\\.)*[^\\\']*\'

 /* Two-character operators: any of ~ % ^ & * + = | < > / - followed by
    '=', plus && || -> << and >>.  */
 MC_TOKEN		([~%^&*+=|<>/-]=)|(&&)|("||")|(->)|(<<)|(>>)

 /* We don't do multiple input files.  */
 %option noyywrap

 /* Do not generate flex's input() helper.
    NOTE(review): presumably because nothing in this file calls it and it
    would otherwise be an unused static function -- confirm.  */
 %option noinput

 %%


  /* Keep track of our location in the original source files.
     A line of the form  # <number> "<file>" ...  is handed to the second
     stage as FILENAME; yylex() takes the file name and the line number
     out of yytext.  Any other line starting with '#', and every plain
     newline, only advances cur_line.  */
 ^#[ \t]+{INT}[ \t]+\"[^\"\n]+\".*\n	return FILENAME;
 ^#.*\n					cur_line++;
 \n					cur_line++;

  /* Ignore all other whitespace.  */
 [ \t\f\v\r]+				;


  /* Literals and identifiers.  Keywords are not distinguished here; the
     second stage looks identifiers up with is_reserved_word().  */
 {STRING}				return STRING;
 {CHAR}					return CHAR;
 {IDENT}					return IDENT;

  /* The Pedant requires that the other C multi-character tokens be
     recognized as tokens.  We don't actually use them since we don't
     parse expressions, but we do want whitespace to be arranged
     around them properly.  */
 {MC_TOKEN}				return OTHER;
 {INT}					return INT;
 {REAL}					return REAL;

 "..."					return DOTS;

  /* All other tokens are single characters; the character itself is the
     token value.  This rule must stay last so that it only catches what
     no earlier rule matched.  */
 .					return yytext[0];


 %%

 /* Bring in the keyword recognizer.  */

 #include "keywords.c"


 /* Macros to append to our phrase collection list.  */

 /*
  * We mark any token that equals a known enumerator as SYM_ENUM_CONST.
  * The parser will change this for struct and union tags later, the only
  * problem is struct and union members:
  *    enum e { a, b }; struct s { int a, b; }
  * but in this case, the only effect will be that the ABI checksums become
  * more volatile, which is acceptable. Also, such collisions are quite rare,
  * so far it was only observed in include/linux/telephony.h.
  *
  * _APP(T, L) records the token text T of length L:
  *   - the spare node held in next_node becomes the node for this token
  *     (cur_node), and a fresh spare is allocated whose ->next points at
  *     it, so the list runs from the newest token to the oldest;
  *   - L + 1 bytes are copied, so T must be NUL-terminated at T[L];
  *   - the node is tagged SYM_ENUM_CONST or SYM_NORMAL as described above
  *     and stamped with the current in_source_file flag.
  *
  * The macro expands in a scope that provides cur_node, next_node and
  * in_source_file.  T and L are parenthesized in the expansion; L is
  * expanded more than once, so pass expressions without side effects.
  */
 #define _APP(T,L)	do {						   \
 			  cur_node = next_node;				   \
 			  next_node = xmalloc(sizeof(*next_node));	   \
 			  next_node->next = cur_node;			   \
 			  cur_node->string =				   \
 			    memcpy(xmalloc((L) + 1), (T), (L) + 1);	   \
 			  cur_node->tag =				   \
 			    find_symbol(cur_node->string, SYM_ENUM_CONST, 1)?\
 			    SYM_ENUM_CONST : SYM_NORMAL ;		   \
 			  cur_node->in_source_file = in_source_file;       \
 			} while (0)

 /* Append the token flex has just matched.  */
 #define APP		_APP(yytext, yyleng)


 /* The second stage lexer.  Here we incorporate knowledge of the state
    of the parser to tailor the tokens that are returned.

    Each call pulls raw tokens from yylex1() until it has one token to
    hand back.  Tokens are added to the phrase list with APP/_APP as they
    are consumed; on return yylval is the address of the link to the most
    recently added node.

    Ordinary tokens are returned one at a time.  Attribute, asm, typeof
    and static-assert constructs, bracketed and braced groups, and the
    expression following '=' or ':' are instead consumed whole and
    reported as a single *_PHRASE token.

    Returns 0 when yylex1() reports end of input, otherwise the token.  */

 int
 yylex(void)
 {
   /* Persistent scanner state; ST_NOTSTARTED is only seen on the very
      first call.  */
   static enum {
     ST_NOTSTARTED, ST_NORMAL, ST_ATTRIBUTE, ST_ASM, ST_TYPEOF, ST_TYPEOF_1,
     ST_BRACKET, ST_BRACE, ST_EXPRESSION, ST_STATIC_ASSERT,
   } lexstate = ST_NOTSTARTED;

   /* Countdowns armed by struct/union/enum and decremented once per
      returned token (see fini).  While non-zero they disable typedef
      lookup and brace-phrase collection respectively.  */
   static int suppress_type_lookup, dont_want_brace_phrase;
   /* Spare list node that the next _APP will fill in.  */
   static struct string_list *next_node;
   /* Name from the first line marker seen; later markers are compared
      against it to compute in_source_file.  */
   static char *source_file;

   /* count is the nesting depth inside the phrase being collected.  It is
      not static: every phrase is collected within a single call.  */
   int token, count = 0;
   struct string_list *cur_node;

   if (lexstate == ST_NOTSTARTED)
     {
       next_node = xmalloc(sizeof(*next_node));
       next_node->next = NULL;
       lexstate = ST_NORMAL;
     }

 repeat:
   token = yylex1();

   if (token == 0)
     return 0;
   else if (token == FILENAME)
     {
       char *file, *e;

       /* Save the filename and line number for later error messages.  */

       if (cur_filename)
 	free(cur_filename);

       /* The FILENAME rule guarantees two double quotes in yytext, so
 	 both strchr calls find one.  The closing quote is overwritten
 	 with a NUL and the name is copied including that terminator.  */
       file = strchr(yytext, '\"')+1;
       e = strchr(file, '\"');
       *e = '\0';
       cur_filename = memcpy(xmalloc(e-file+1), file, e-file+1);
       /* Skip the '#' and the first blank; atoi skips any further
 	 leading whitespace.  */
       cur_line = atoi(yytext+2);

       if (!source_file) {
         source_file = xstrdup(cur_filename);
         in_source_file = 1;
       } else {
         in_source_file = (strcmp(cur_filename, source_file) == 0);
       }

       /* Line markers are never returned to the caller.  */
       goto repeat;
     }

   switch (lexstate)
     {
     case ST_NORMAL:
       switch (token)
 	{
 	case IDENT:
 	  APP;
 	  {
 	    int r = is_reserved_word(yytext, yyleng);
 	    if (r >= 0)
 	      {
 		switch (token = r)
 		  {
 		  /* These keywords start a phrase: switch state and keep
 		     reading without returning the keyword itself.  */
 		  case ATTRIBUTE_KEYW:
 		    lexstate = ST_ATTRIBUTE;
 		    count = 0;
 		    goto repeat;
 		  case ASM_KEYW:
 		    lexstate = ST_ASM;
 		    count = 0;
 		    goto repeat;
 		  case TYPEOF_KEYW:
 		    lexstate = ST_TYPEOF;
 		    count = 0;
 		    goto repeat;

 		  /* Both counters are decremented once at fini for this
 		     keyword, so they are still non-zero for the token
 		     that follows it (no typedef lookup) and a '{' among
 		     the next two tokens is returned as a plain '{'.  */
 		  case STRUCT_KEYW:
 		  case UNION_KEYW:
 		  case ENUM_KEYW:
 		    dont_want_brace_phrase = 3;
 		    suppress_type_lookup = 2;
 		    goto fini;

 		  /* Returned as-is, skipping the typedef lookup below.  */
 		  case EXPORT_SYMBOL_KEYW:
 		      goto fini;

 		  case STATIC_ASSERT_KEYW:
 		    lexstate = ST_STATIC_ASSERT;
 		    count = 0;
 		    goto repeat;
 		  }
 	      }
 	    /* An identifier known as a typedef is reported as TYPE unless
 	       lookup is currently suppressed.  */
 	    if (!suppress_type_lookup)
 	      {
 		if (find_symbol(yytext, SYM_TYPEDEF, 1))
 		  token = TYPE;
 	      }
 	  }
 	  break;

 	case '[':
 	  APP;
 	  lexstate = ST_BRACKET;
 	  count = 1;
 	  goto repeat;

 	case '{':
 	  APP;
 	  /* Right after struct/union/enum the brace is returned on its
 	     own instead of starting a brace phrase.  */
 	  if (dont_want_brace_phrase)
 	    break;
 	  lexstate = ST_BRACE;
 	  count = 1;
 	  goto repeat;

 	/* The character itself is returned now; the expression after it
 	   is collected by ST_EXPRESSION on the next call.  */
 	case '=': case ':':
 	  APP;
 	  lexstate = ST_EXPRESSION;
 	  break;

 	default:
 	  APP;
 	  break;
 	}
       break;

     /* Collect every token up to the ')' that brings the parenthesis
        depth back to zero, then report ATTRIBUTE_PHRASE.  */
     case ST_ATTRIBUTE:
       APP;
       switch (token)
 	{
 	case '(':
 	  ++count;
 	  goto repeat;
 	case ')':
 	  if (--count == 0)
 	    {
 	      lexstate = ST_NORMAL;
 	      token = ATTRIBUTE_PHRASE;
 	      break;
 	    }
 	  goto repeat;
 	default:
 	  goto repeat;
 	}
       break;

     /* Same scheme as ST_ATTRIBUTE, reporting ASM_PHRASE.  */
     case ST_ASM:
       APP;
       switch (token)
 	{
 	case '(':
 	  ++count;
 	  goto repeat;
 	case ')':
 	  if (--count == 0)
 	    {
 	      lexstate = ST_NORMAL;
 	      token = ASM_PHRASE;
 	      break;
 	    }
 	  goto repeat;
 	default:
 	  goto repeat;
 	}
       break;

     /* First token after the opening '(' of typeof.  If it is a reserved
        word or a typedef name, both it and the '(' are pushed back onto
        the input and TYPEOF_KEYW is returned, leaving the argument to be
        scanned in ST_NORMAL.  Otherwise collection continues in
        ST_TYPEOF with the current token.  */
     case ST_TYPEOF_1:
       if (token == IDENT)
 	{
 	  if (is_reserved_word(yytext, yyleng) >= 0
 	      || find_symbol(yytext, SYM_TYPEDEF, 1))
 	    {
 	      /* yyless(0) returns the identifier to the input; unput
 		 then places the '(' in front of it.  */
 	      yyless(0);
 	      unput('(');
 	      lexstate = ST_NORMAL;
 	      token = TYPEOF_KEYW;
 	      break;
 	    }
 	  /* The '(' was not appended when it was read; add it now.  */
 	  _APP("(", 1);
 	}
 	/* NOTE(review): this statement is outside the if above despite its
 	   indentation, so it also runs when the token is not an
 	   identifier; in that case the opening '(' is never appended.  */
 	lexstate = ST_TYPEOF;
 	/* FALLTHRU */

     case ST_TYPEOF:
       switch (token)
 	{
 	case '(':
 	  /* The outermost '(' is held back until ST_TYPEOF_1 has looked
 	     at the token that follows it.  */
 	  if ( ++count == 1 )
 	    lexstate = ST_TYPEOF_1;
 	  else
 	    APP;
 	  goto repeat;
 	case ')':
 	  APP;
 	  if (--count == 0)
 	    {
 	      lexstate = ST_NORMAL;
 	      token = TYPEOF_PHRASE;
 	      break;
 	    }
 	  goto repeat;
 	default:
 	  APP;
 	  goto repeat;
 	}
       break;

     /* Entered with count == 1 after a '[' in ST_NORMAL; collects up to
        the matching ']' and reports BRACKET_PHRASE.  */
     case ST_BRACKET:
       APP;
       switch (token)
 	{
 	case '[':
 	  ++count;
 	  goto repeat;
 	case ']':
 	  if (--count == 0)
 	    {
 	      lexstate = ST_NORMAL;
 	      token = BRACKET_PHRASE;
 	      break;
 	    }
 	  goto repeat;
 	default:
 	  goto repeat;
 	}
       break;

     /* Entered with count == 1 after a '{' in ST_NORMAL; collects up to
        the matching '}' and reports BRACE_PHRASE.  */
     case ST_BRACE:
       APP;
       switch (token)
 	{
 	case '{':
 	  ++count;
 	  goto repeat;
 	case '}':
 	  if (--count == 0)
 	    {
 	      lexstate = ST_NORMAL;
 	      token = BRACE_PHRASE;
 	      break;
 	    }
 	  goto repeat;
 	default:
 	  goto repeat;
 	}
       break;

     /* Collect the expression that follows '=' or ':'.  It ends at a ',',
        ';' or '}' seen at nesting depth zero; that terminator is pushed
        back so that it is scanned again in ST_NORMAL.  */
     case ST_EXPRESSION:
       switch (token)
 	{
 	case '(': case '[': case '{':
 	  ++count;
 	  APP;
 	  goto repeat;
 	case '}':
 	  /* is this the last line of an enum declaration? */
 	  if (count == 0)
 	    {
 	      /* Put back the token we just read so's we can find it again
 		 after registering the expression.  */
 	      unput(token);

 	      lexstate = ST_NORMAL;
 	      token = EXPRESSION_PHRASE;
 	      break;
 	    }
 	  /* FALLTHRU */
 	case ')': case ']':
 	  --count;
 	  APP;
 	  goto repeat;
 	case ',': case ';':
 	  if (count == 0)
 	    {
 	      /* Put back the token we just read so's we can find it again
 		 after registering the expression.  */
 	      unput(token);

 	      lexstate = ST_NORMAL;
 	      token = EXPRESSION_PHRASE;
 	      break;
 	    }
 	  APP;
 	  goto repeat;
 	default:
 	  APP;
 	  goto repeat;
 	}
       break;

     /* Same scheme as ST_ATTRIBUTE, reporting STATIC_ASSERT_PHRASE.  */
     case ST_STATIC_ASSERT:
       APP;
       switch (token)
 	{
 	case '(':
 	  ++count;
 	  goto repeat;
 	case ')':
 	  if (--count == 0)
 	    {
 	      lexstate = ST_NORMAL;
 	      token = STATIC_ASSERT_PHRASE;
 	      break;
 	    }
 	  goto repeat;
 	default:
 	  goto repeat;
 	}
       break;

     /* No other state is expected here; give up.  */
     default:
       exit(1);
     }
 fini:

   /* One token is about to be returned: age the struct/union/enum
      countdowns.  */
   if (suppress_type_lookup > 0)
     --suppress_type_lookup;
   if (dont_want_brace_phrase > 0)
     --dont_want_brace_phrase;

   /* next_node is the spare node; its ->next is the node appended last.  */
   yylval = &next_node->next;

   return token;
 }
	/* SPDX-License-Identifier: GPL-2.0-or-later */
	/*
	* Lexical analysis for genksyms.
	* Copyright 1996, 1997 Linux International.
	*
	* New implementation contributed by Richard Henderson <rth@tamu.edu>
	* Based on original work by Bjorn Ekwall <bj0rn@blox.se>
	*
	* Taken from Linux modutils 2.4.22.
	*/

	%{

	#include <limits.h>
	#include <stdlib.h>
	#include <string.h>
	#include <ctype.h>

	#include "genksyms.h"
	#include "parse.tab.h"

	/* We've got a two-level lexer here. We let flex do basic tokenization
	   and then we categorize those basic tokens in the second stage.
	   YY_DECL makes the flex-generated scanner the file-local yylex1();
	   yylex() is the second stage and is the one that calls yylex1(). */
	#define YY_DECL		static int yylex1(void)

	%}

	/* Identifiers: letters, digits and '_', with '$' accepted as well. */
	IDENT			[A-Za-z_\$][A-Za-z0-9_\$]*

	/* Integer constants: octal, decimal or hexadecimal, optionally followed
	   by one of the suffixes U, L, UL or LU (either case). The '|' here is
	   the alternation operator and must not be backslash-escaped. */
	O_INT			0[0-7]*
	D_INT			[1-9][0-9]*
	X_INT			0[Xx][0-9A-Fa-f]+
	I_SUF			[Uu]|[Ll]|[Uu][Ll]|[Ll][Uu]
	INT			({O_INT}|{D_INT}|{X_INT}){I_SUF}?

	/* Floating constants: either a fraction with an optional exponent, or
	   digits with a mandatory exponent; an F or L suffix may follow. */
	FRAC			([0-9]*\.[0-9]+)|([0-9]+\.)
	EXP			[Ee][+-]?[0-9]+
	F_SUF			[FfLl]
	REAL			({FRAC}{EXP}?{F_SUF}?)|([0-9]+{EXP}{F_SUF}?)

	/* String and character literals with an optional L prefix: any number
	   of (ordinary characters, then a backslash escape) groups, followed by
	   ordinary characters. A backslash escapes whatever single character
	   follows it, so an escaped quote does not terminate the literal. */
	STRING			L?\"([^\\\"]*\\.)*[^\\\"]*\"
	CHAR			L?\'([^\\\']*\\.)*[^\\\']*\'

	/* Two-character operators: any of ~ % ^ & * + = | < > / - followed by
	   '=', plus && || -> << and >>. */
	MC_TOKEN		([~%^&*+=|<>/-]=)|(&&)|("||")|(->)|(<<)|(>>)

	/* We don't do multiple input files. */
	%option noyywrap

	%option noinput

	%%


	/* Keep track of our location in the original source files.
	   A line of the form  # <number> "<file>" ...  is handed to the second
	   stage as FILENAME; yylex() takes the file name and the line number
	   out of yytext. Any other line starting with '#', and every plain
	   newline, only advances cur_line. */
	^#[ \t]+{INT}[ \t]+\"[^\"\n]+\".*\n return FILENAME;
	^#.*\n cur_line++;
	\n cur_line++;

	/* Ignore all other whitespace. */
	[ \t\f\v\r]+ ;


	/* Literals and identifiers. Keywords are not distinguished here; the
	   second stage looks identifiers up with is_reserved_word(). */
	{STRING} return STRING;
	{CHAR} return CHAR;
	{IDENT} return IDENT;

	/* The Pedant requires that the other C multi-character tokens be
	   recognized as tokens. We don't actually use them since we don't
	   parse expressions, but we do want whitespace to be arranged
	   around them properly. */
	{MC_TOKEN} return OTHER;
	{INT} return INT;
	{REAL} return REAL;

	"..." return DOTS;

	/* All other tokens are single characters; the character itself is the
	   token value. This rule must stay last so that it only catches what
	   no earlier rule matched. */
	. return yytext[0];


	%%

	/* Bring in the keyword recognizer. */

	#include "keywords.c"


	/* Macros to append to our phrase collection list. */

	/*
	 * We mark any token that equals a known enumerator as SYM_ENUM_CONST.
	 * The parser will change this for struct and union tags later, the only
	 * problem is struct and union members:
	 *    enum e { a, b }; struct s { int a, b; }
	 * but in this case, the only effect will be that the ABI checksums become
	 * more volatile, which is acceptable. Also, such collisions are quite rare,
	 * so far it was only observed in include/linux/telephony.h.
	 *
	 * _APP(T, L) records the token text T of length L:
	 *   - the spare node held in next_node becomes the node for this token
	 *     (cur_node), and a fresh spare is allocated whose ->next points at
	 *     it, so the list runs from the newest token to the oldest;
	 *   - L + 1 bytes are copied, so T must be NUL-terminated at T[L];
	 *   - the node is tagged SYM_ENUM_CONST or SYM_NORMAL as described above
	 *     and stamped with the current in_source_file flag.
	 *
	 * The macro expands in a scope that provides cur_node, next_node and
	 * in_source_file. T and L are parenthesized in the expansion; L is
	 * expanded more than once, so pass expressions without side effects.
	 */
	#define _APP(T,L)	do {						\
		cur_node = next_node;						\
		next_node = xmalloc(sizeof(*next_node));			\
		next_node->next = cur_node;					\
		cur_node->string = memcpy(xmalloc((L) + 1), (T), (L) + 1);	\
		cur_node->tag =							\
			find_symbol(cur_node->string, SYM_ENUM_CONST, 1) ?	\
			SYM_ENUM_CONST : SYM_NORMAL;				\
		cur_node->in_source_file = in_source_file;			\
	} while (0)

	/* Append the token flex has just matched. */
	#define APP		_APP(yytext, yyleng)


	/* The second stage lexer. Here we incorporate knowledge of the state
	   of the parser to tailor the tokens that are returned.

	   Each call pulls raw tokens from yylex1() until it has one token to
	   hand back. Tokens are added to the phrase list with APP/_APP as they
	   are consumed; on return yylval is the address of the link to the most
	   recently added node.

	   Ordinary tokens are returned one at a time. Attribute, asm, typeof
	   and static-assert constructs, bracketed and braced groups, and the
	   expression following '=' or ':' are instead consumed whole and
	   reported as a single *_PHRASE token.

	   Returns 0 when yylex1() reports end of input, otherwise the token. */

	int
	yylex(void)
	{
	    /* Persistent scanner state; ST_NOTSTARTED is only seen on the very
	       first call. */
	    static enum {
	        ST_NOTSTARTED, ST_NORMAL, ST_ATTRIBUTE, ST_ASM, ST_TYPEOF, ST_TYPEOF_1,
	        ST_BRACKET, ST_BRACE, ST_EXPRESSION, ST_STATIC_ASSERT,
	    } lexstate = ST_NOTSTARTED;

	    /* Countdowns armed by struct/union/enum and decremented once per
	       returned token (see fini). While non-zero they disable typedef
	       lookup and brace-phrase collection respectively. */
	    static int suppress_type_lookup, dont_want_brace_phrase;
	    /* Spare list node that the next _APP will fill in. */
	    static struct string_list *next_node;
	    /* Name from the first line marker seen; later markers are compared
	       against it to compute in_source_file. */
	    static char *source_file;

	    /* count is the nesting depth inside the phrase being collected. It is
	       not static: every phrase is collected within a single call. */
	    int token, count = 0;
	    struct string_list *cur_node;

	    if (lexstate == ST_NOTSTARTED) {
	        next_node = xmalloc(sizeof(*next_node));
	        next_node->next = NULL;
	        lexstate = ST_NORMAL;
	    }

	repeat:
	    token = yylex1();

	    if (token == 0)
	        return 0;
	    else if (token == FILENAME) {
	        /* Both are pointers into yytext. */
	        char *file, *e;

	        /* Save the filename and line number for later error messages. */

	        if (cur_filename)
	            free(cur_filename);

	        /* The FILENAME rule guarantees two double quotes in yytext, so
	           both strchr calls find one. The closing quote is overwritten
	           with a NUL and the name is copied including that terminator. */
	        file = strchr(yytext, '\"')+1;
	        e = strchr(file, '\"');
	        *e = '\0';
	        cur_filename = memcpy(xmalloc(e-file+1), file, e-file+1);
	        /* Skip the '#' and the first blank; atoi skips any further
	           leading whitespace. */
	        cur_line = atoi(yytext+2);

	        if (!source_file) {
	            source_file = xstrdup(cur_filename);
	            in_source_file = 1;
	        } else {
	            in_source_file = (strcmp(cur_filename, source_file) == 0);
	        }

	        /* Line markers are never returned to the caller. */
	        goto repeat;
	    }

	    switch (lexstate) {
	    case ST_NORMAL:
	        switch (token) {
	        case IDENT:
	            APP;
	            {
	                int r = is_reserved_word(yytext, yyleng);
	                if (r >= 0) {
	                    switch (token = r) {
	                    /* These keywords start a phrase: switch state and
	                       keep reading without returning the keyword. */
	                    case ATTRIBUTE_KEYW:
	                        lexstate = ST_ATTRIBUTE;
	                        count = 0;
	                        goto repeat;
	                    case ASM_KEYW:
	                        lexstate = ST_ASM;
	                        count = 0;
	                        goto repeat;
	                    case TYPEOF_KEYW:
	                        lexstate = ST_TYPEOF;
	                        count = 0;
	                        goto repeat;

	                    /* Both counters are decremented once at fini for
	                       this keyword, so they are still non-zero for the
	                       token that follows it (no typedef lookup) and a
	                       '{' among the next two tokens is returned as a
	                       plain '{'. */
	                    case STRUCT_KEYW:
	                    case UNION_KEYW:
	                    case ENUM_KEYW:
	                        dont_want_brace_phrase = 3;
	                        suppress_type_lookup = 2;
	                        goto fini;

	                    /* Returned as-is, skipping the typedef lookup. */
	                    case EXPORT_SYMBOL_KEYW:
	                        goto fini;

	                    case STATIC_ASSERT_KEYW:
	                        lexstate = ST_STATIC_ASSERT;
	                        count = 0;
	                        goto repeat;
	                    }
	                }
	                /* An identifier known as a typedef is reported as TYPE
	                   unless lookup is currently suppressed. */
	                if (!suppress_type_lookup) {
	                    if (find_symbol(yytext, SYM_TYPEDEF, 1))
	                        token = TYPE;
	                }
	            }
	            break;

	        case '[':
	            APP;
	            lexstate = ST_BRACKET;
	            count = 1;
	            goto repeat;

	        case '{':
	            APP;
	            /* Right after struct/union/enum the brace is returned on its
	               own instead of starting a brace phrase. */
	            if (dont_want_brace_phrase)
	                break;
	            lexstate = ST_BRACE;
	            count = 1;
	            goto repeat;

	        /* The character itself is returned now; the expression after it
	           is collected by ST_EXPRESSION on the next call. */
	        case '=': case ':':
	            APP;
	            lexstate = ST_EXPRESSION;
	            break;

	        default:
	            APP;
	            break;
	        }
	        break;

	    /* Collect every token up to the ')' that brings the parenthesis
	       depth back to zero, then report ATTRIBUTE_PHRASE. */
	    case ST_ATTRIBUTE:
	        APP;
	        switch (token) {
	        case '(':
	            ++count;
	            goto repeat;
	        case ')':
	            if (--count == 0) {
	                lexstate = ST_NORMAL;
	                token = ATTRIBUTE_PHRASE;
	                break;
	            }
	            goto repeat;
	        default:
	            goto repeat;
	        }
	        break;

	    /* Same scheme as ST_ATTRIBUTE, reporting ASM_PHRASE. */
	    case ST_ASM:
	        APP;
	        switch (token) {
	        case '(':
	            ++count;
	            goto repeat;
	        case ')':
	            if (--count == 0) {
	                lexstate = ST_NORMAL;
	                token = ASM_PHRASE;
	                break;
	            }
	            goto repeat;
	        default:
	            goto repeat;
	        }
	        break;

	    /* First token after the opening '(' of typeof. If it is a reserved
	       word or a typedef name, both it and the '(' are pushed back onto
	       the input and TYPEOF_KEYW is returned, leaving the argument to be
	       scanned in ST_NORMAL. Otherwise collection continues in ST_TYPEOF
	       with the current token. */
	    case ST_TYPEOF_1:
	        if (token == IDENT) {
	            if (is_reserved_word(yytext, yyleng) >= 0
	                || find_symbol(yytext, SYM_TYPEDEF, 1)) {
	                /* yyless(0) returns the identifier to the input; unput
	                   then places the '(' in front of it. */
	                yyless(0);
	                unput('(');
	                lexstate = ST_NORMAL;
	                token = TYPEOF_KEYW;
	                break;
	            }
	            /* The '(' was not appended when it was read; add it now. */
	            _APP("(", 1);
	        }
	        /* NOTE(review): this also runs when the token is not an
	           identifier; in that case the opening '(' is never appended. */
	        lexstate = ST_TYPEOF;
	        /* FALLTHRU */

	    case ST_TYPEOF:
	        switch (token) {
	        case '(':
	            /* The outermost '(' is held back until ST_TYPEOF_1 has
	               looked at the token that follows it. */
	            if ( ++count == 1 )
	                lexstate = ST_TYPEOF_1;
	            else
	                APP;
	            goto repeat;
	        case ')':
	            APP;
	            if (--count == 0) {
	                lexstate = ST_NORMAL;
	                token = TYPEOF_PHRASE;
	                break;
	            }
	            goto repeat;
	        default:
	            APP;
	            goto repeat;
	        }
	        break;

	    /* Entered with count == 1 after a '[' in ST_NORMAL; collects up to
	       the matching ']' and reports BRACKET_PHRASE. */
	    case ST_BRACKET:
	        APP;
	        switch (token) {
	        case '[':
	            ++count;
	            goto repeat;
	        case ']':
	            if (--count == 0) {
	                lexstate = ST_NORMAL;
	                token = BRACKET_PHRASE;
	                break;
	            }
	            goto repeat;
	        default:
	            goto repeat;
	        }
	        break;

	    /* Entered with count == 1 after a '{' in ST_NORMAL; collects up to
	       the matching '}' and reports BRACE_PHRASE. */
	    case ST_BRACE:
	        APP;
	        switch (token) {
	        case '{':
	            ++count;
	            goto repeat;
	        case '}':
	            if (--count == 0) {
	                lexstate = ST_NORMAL;
	                token = BRACE_PHRASE;
	                break;
	            }
	            goto repeat;
	        default:
	            goto repeat;
	        }
	        break;

	    /* Collect the expression that follows '=' or ':'. It ends at a ',',
	       ';' or '}' seen at nesting depth zero; that terminator is pushed
	       back so that it is scanned again in ST_NORMAL. */
	    case ST_EXPRESSION:
	        switch (token) {
	        case '(': case '[': case '{':
	            ++count;
	            APP;
	            goto repeat;
	        case '}':
	            /* is this the last line of an enum declaration? */
	            if (count == 0) {
	                /* Put back the token we just read so's we can find it
	                   again after registering the expression. */
	                unput(token);

	                lexstate = ST_NORMAL;
	                token = EXPRESSION_PHRASE;
	                break;
	            }
	            /* FALLTHRU */
	        case ')': case ']':
	            --count;
	            APP;
	            goto repeat;
	        case ',': case ';':
	            if (count == 0) {
	                /* Put back the token we just read so's we can find it
	                   again after registering the expression. */
	                unput(token);

	                lexstate = ST_NORMAL;
	                token = EXPRESSION_PHRASE;
	                break;
	            }
	            APP;
	            goto repeat;
	        default:
	            APP;
	            goto repeat;
	        }
	        break;

	    /* Same scheme as ST_ATTRIBUTE, reporting STATIC_ASSERT_PHRASE. */
	    case ST_STATIC_ASSERT:
	        APP;
	        switch (token) {
	        case '(':
	            ++count;
	            goto repeat;
	        case ')':
	            if (--count == 0) {
	                lexstate = ST_NORMAL;
	                token = STATIC_ASSERT_PHRASE;
	                break;
	            }
	            goto repeat;
	        default:
	            goto repeat;
	        }
	        break;

	    /* No other state is expected here; give up. */
	    default:
	        exit(1);
	    }
	fini:

	    /* One token is about to be returned: age the struct/union/enum
	       countdowns. */
	    if (suppress_type_lookup > 0)
	        --suppress_type_lookup;
	    if (dont_want_brace_phrase > 0)
	        --dont_want_brace_phrase;

	    /* next_node is the spare node; its ->next is the node appended last. */
	    yylval = &next_node->next;

	    return token;
	}