flex lexical analyser

From Wikipedia, the free encyclopedia

This article may require cleanup to meet Wikipedia's quality standards.
Please improve this article if you can. (September 2007)

It has been suggested that Flex++ be merged into this article or section. (Discuss)

flex
Developed by	Vern Paxson
Latest release	2.5.35 / February 26, 2008
OS	Unix-like
Genre	Lexical analyzer generator
License	BSD license
Website	flex.sf.net

flex (fast lexical analyzer generator) is a free software alternative to Lex. It is frequently used with the free Bison parser generator. Flex was originally written in C by Vern Paxson around 1987.

The description for flex as given by the flex manual:

"flex is a tool for generating scanners: programs which recognize lexical patterns in text. flex reads the given input files, or its standard input if no file names are given, for a description of a scanner to generate. The description is in the form of pairs of regular expressions and C code, called rules. flex generates as output a C source file, 'lex.yy.c', which defines a routine 'yylex()'. This file is compiled and linked with the '-lfl' library to produce an executable. When the executable is run, it analyzes its input for occurrences of the regular expressions. Whenever it finds one, it executes the corresponding C code..."

A similar lexical scanner for C++ is flex++, which is included as part of the flex package.

Flex is a non-GNU project, but the GNU project developed the manual for Flex.

[edit] Example lexical analyzer

This is an example of a scanner (written in C) for the instructional programming language PL/0.

The symbols recognized are: '+', '-', '*', '/', '=', '(', ')', ',', ';', '.', ':=', '<', '<=', '<>', '>', '>='; numbers: 0-9 {0-9}; identifiers: a-zA-Z {a-zA-Z0-9} and keywords: "begin", "call", "const", "do", "end", "if", "odd", "procedure", "then", "var", "while".

External variables used:

FILE *source                             /* The source file */
int cur_line, cur_col, err_line, err_col /* For error reporting */
int num                                  /* Last number read stored here, for the parser */
char id[]                                /* Last identifier read stored here, for the parser */
Hashtab *keywords                        /* List of keywords */

External routines called:

error(const char msg[])                              /* Report an error */
Hashtab *create_htab(int estimate)                   /* Create a lookup table */
int enter_htab(Hashtab *ht, char name[], void *data) /* Add an entry to a lookup table */
Entry *find_htab(Hashtab *ht, char *s)               /* Find an entry in a lookup table */
void *get_htab_data(Entry *entry)                    /* Returns data from a lookup table */
FILE *fopen(char fn[], char mode[])                  /* Opens a file for reading */
fgetc(FILE *stream)                                  /* Read the next character from a stream */
ungetc(int ch, FILE *stream)                         /* Put-back a character onto a stream */
isdigit(int ch), isalpha(int ch), isalnum(int ch)    /* Character classification */

External types:

Symbol  /* An enumerated type of all the symbols in the PL/0 language */
Hashtab /* Represents a lookup table */
Entry   /* Represents an entry in the lookup table */

Scanning is started by calling init_scan, passing the name of the source file. If the source file is successfully opened, the parser calls getsym repeatedly to return successive symbols from the source file.

The heart of the scanner, getsym, should be straightforward. First, whitespace is skipped. Then the retrieved character is classified. If the character represents a multiple-character symbol, additional processing must be done. Numbers are converted to internal form, and identifiers are checked to see if they represent a keyword.

int read_ch(void) {
  int ch = fgetc(source);
  cur_col++;
  if (ch == '\n') {
    cur_line++;
    cur_col = 0;
  }
  return ch;
}
 
void put_back(int ch) {
  ungetc(ch, source);
  cur_col--;
  if (ch == '\n') cur_line--;
}
 
Symbol getsym(void) {
  int ch;
 
  while ((ch = read_ch()) != EOF && ch <= ' ')
    ;
  err_line = cur_line;
  err_col  = cur_col;
  switch (ch) {
    case EOF: return eof;
    case '+': return plus;
    case '-': return minus;
    case '*': return times;
    case '/': return slash;
    case '=': return eql;
    case '(': return lparen;
    case ')': return rparen;
    case ',': return comma;
    case ';': return semicolon;
    case '.': return period;
    case ':':
      ch = read_ch();
      return (ch == '=') ? becomes : nul;
    case '<':
      ch = read_ch();
      if (ch == '>') return neq;
      if (ch == '=') return leq;
      put_back(ch);
      return lss;
    case '>':
      ch = read_ch();
      if (ch == '=') return geq;
      put_back(ch);
      return gtr;
    default:
      if (isdigit(ch)) {
        num = 0;
        do {  /* no checking for overflow! */
          num = 10 * num + ch - '0';
          ch = read_ch();
        } while ( ch != EOF && isdigit(ch));
        put_back(ch);
        return number;
      }
      if (isalpha(ch)) {
        Entry *entry;
        id_len = 0;
        do {
          if (id_len < MAX_ID) {
            id[id_len] = (char)ch;
            id_len++;
          }
          ch = read_ch();
        } while ( ch != EOF && isalnum(ch));
        id[id_len] = '\0';
        put_back(ch);
        entry = find_htab(keywords, id);
        return entry ? (Symbol)get_htab_data(entry) : ident;
      }
 
      error("getsym: invalid character '%c'", ch);
      return nul;
  }
}
 
int init_scan(const char fn[]) {
  if ((source = fopen(fn, "r")) == NULL) return 0;
  cur_line = 1;
  cur_col = 0;
  keywords = create_htab(11);
  enter_htab(keywords, "begin", beginsym);
  enter_htab(keywords, "call", callsym);
  enter_htab(keywords, "const", constsym);
  enter_htab(keywords, "do", dosym);
  enter_htab(keywords, "end", endsym);
  enter_htab(keywords, "if", ifsym);
  enter_htab(keywords, "odd", oddsym);
  enter_htab(keywords, "procedure", procsym);
  enter_htab(keywords, "then", thensym);
  enter_htab(keywords, "var", varsym);
  enter_htab(keywords, "while", whilesym);
  return 1;
}

Now, contrast the above code with the code needed for a flex generated scanner for the same language:

%{
#include "y.tab.h"
%}
 
digit         [0-9]
letter        [a-zA-Z]
 
%%
"+"                  { return PLUS;       }
"-"                  { return MINUS;      }
"*"                  { return TIMES;      }
"/"                  { return SLASH;      }
"("                  { return LPAREN;     }
")"                  { return RPAREN;     }
";"                  { return SEMICOLON;  }
","                  { return COMMA;      }
"."                  { return PERIOD;     }
":="                 { return BECOMES;    }
"="                  { return EQL;        }
"<>"                 { return NEQ;        }
"<"                  { return LSS;        }
">"                  { return GTR;        }
"<="                 { return LEQ;        }
">="                 { return GEQ;        }
"begin"              { return BEGINSYM;   }
"call"               { return CALLSYM;    }
"const"              { return CONSTSYM;   }
"do"                 { return DOSYM;      }
"end"                { return ENDSYM;     }
"if"                 { return IFSYM;      }
"odd"                { return ODDSYM;     }
"procedure"          { return PROCSYM;    }
"then"               { return THENSYM;    }
"var"                { return VARSYM;     }
"while"              { return WHILESYM;   }
{letter}({letter}|{digit})* {
                       yylval.id = (char *)strdup(yytext);
                       return IDENT;      }
{digit}+             { yylval.num = atoi(yytext);
                       return NUMBER;     }
[ \t\n\r]            /* skip whitespace */
.                    { printf("Unknown character [%c]\n",yytext[0]);
                       return UNKNOWN;    }
%%
 
int yywrap(void){return 1;}

About 50 lines of code for flex versus about 100 lines of hand-written code.