ack/lang/cem/cemcom/LLlex.c
1987-03-20 08:30:31 +00:00

576 lines
13 KiB
C

/*
* (c) copyright 1987 by the Vrije Universiteit, Amsterdam, The Netherlands.
* See the copyright notice in the ACK home directory, in the file "Copyright".
*/
/* $Header$ */
/* L E X I C A L A N A L Y Z E R */
#include <alloc.h>
#include "nofloat.h"
#include "idfsize.h"
#include "numsize.h"
#include "debug.h"
#include "strsize.h"
#include "nopp.h"
#include "input.h"
#include "arith.h"
#include "def.h"
#include "idf.h"
#include "LLlex.h"
#include "Lpars.h"
#include "class.h"
#include "assert.h"
#include "sizes.h"
/* Data about the token yielded */
struct token dot, ahead, aside;
int ReplaceMacros = 1; /* replacing macros */
int EoiForNewline = 0; /* return EOI upon encountering newline */
int PreProcKeys = 0; /* return preprocessor key */
int AccFileSpecifier = 0; /* return filespecifier <...> */
int AccDefined = 0; /* accept "defined(...)" */
int UnknownIdIsZero = 0; /* interpret unknown id as integer 0 */
int SkipEscNewline = 0; /* how to interpret backslash-newline */
int Unstacked = 0; /* an unstack is done */
int File_Inserted = 0; /* a file has just been inserted */
#define MAX_LL_DEPTH 2
static struct token LexStack[MAX_LL_DEPTH];
static LexSP = 0;
/* In PushLex() the actions are taken in order to initialise or
re-initialise the lexical scanner.
E.g. at the invocation of a sub-parser that uses LLlex(), the
state of the current parser should be saved.
*/
PushLex()
{
ASSERT(LexSP < 2);
ASSERT(ASIDE == 0); /* ASIDE = 0; */
GetToken(&ahead);
ahead.tk_line = LineNumber;
ahead.tk_file = FileName;
LexStack[LexSP++] = dot;
}
PopLex()
{
ASSERT(LexSP > 0);
dot = LexStack[--LexSP];
}
int
LLlex()
{
/* LLlex() plays the role of Lexical Analyzer for the C parser.
The look-ahead and putting aside of tokens are taken into
account.
*/
if (ASIDE) { /* a token is put aside */
dot = aside;
ASIDE = 0;
}
else { /* read ahead and return the old one */
dot = ahead;
/* the following test is performed due to the dual
task of LLlex(): it is also called for parsing the
restricted constant expression following a #if or
#elif. The newline character causes EOF to be
returned in this case to stop the LLgen parsing task.
*/
if (DOT != EOI)
GetToken(&ahead);
else
DOT = EOF;
}
/* keep track of the place of the token in the file */
ahead.tk_file = FileName;
ahead.tk_line = LineNumber;
return DOT;
}
char *string_token();
int
GetToken(ptok)
register struct token *ptok;
{
/* GetToken() is the actual token recognizer. It calls the
control line interpreter if it encounters a "\n#"
combination. Macro replacement is also performed if it is
needed.
*/
char buf[(IDFSIZE > NUMSIZE ? IDFSIZE : NUMSIZE) + 1];
register int ch, nch;
if (File_Inserted) {
File_Inserted = 0;
goto firstline;
}
again: /* rescan the input after an error or replacement */
#ifndef NOPP
if (Unstacked) EnableMacros();
#endif
LoadChar(ch);
go_on: /* rescan, the following character has been read */
if ((ch & 0200) && ch != EOI) /* stop on non-ascii character */
fatal("non-ascii '\\%03o' read", ch & 0377);
switch (class(ch)) { /* detect character class */
case STNL: /* newline, vertical space or formfeed */
firstline:
LineNumber++; /* also at vs and ff */
if (EoiForNewline) /* called in control line */
/* a newline in a control line indicates the
end-of-information of the line.
*/
return ptok->tk_symb = EOI;
while (LoadChar(ch), ch == '#') { /* a control line follows */
domacro();
if (File_Inserted) {
File_Inserted = 0;
goto firstline;
}
}
/* We have to loop here, because in
`domacro' the nl, vt or ff is read. The
character following it may again be a `#'.
*/
goto go_on;
case STSKIP: /* just skip the skip characters */
goto again;
case STGARB: /* garbage character */
#ifndef NOPP
if (SkipEscNewline && (ch == '\\')) {
/* a '\\' is allowed in #if/#elif expression */
LoadChar(ch);
if (class(ch) == STNL) { /* vt , ff ? */
++LineNumber;
goto again;
}
PushBack();
ch = '\\';
}
#endif NOPP
if (040 < ch && ch < 0177)
lexerror("garbage char %c", ch);
else
lexerror("garbage char \\%03o", ch);
goto again;
case STSIMP: /* a simple character, no part of compound token*/
if (ch == '/') { /* probably the start of comment */
LoadChar(ch);
if (ch == '*') { /* start of comment */
skipcomment();
goto again;
}
else {
PushBack();
ch = '/'; /* restore ch */
}
}
return ptok->tk_symb = ch;
case STCOMP: /* maybe the start of a compound token */
LoadChar(nch); /* character lookahead */
switch (ch) {
case '!':
if (nch == '=')
return ptok->tk_symb = NOTEQUAL;
PushBack();
return ptok->tk_symb = ch;
case '&':
if (nch == '&')
return ptok->tk_symb = AND;
PushBack();
return ptok->tk_symb = ch;
case '+':
if (nch == '+')
return ptok->tk_symb = PLUSPLUS;
PushBack();
return ptok->tk_symb = ch;
case '-':
if (nch == '-')
return ptok->tk_symb = MINMIN;
if (nch == '>')
return ptok->tk_symb = ARROW;
PushBack();
return ptok->tk_symb = ch;
case '<':
if (AccFileSpecifier) {
PushBack(); /* pushback nch */
ptok->tk_bts = string_token("file specifier",
'>', &(ptok->tk_len));
return ptok->tk_symb = FILESPECIFIER;
}
if (nch == '<')
return ptok->tk_symb = LEFT;
if (nch == '=')
return ptok->tk_symb = LESSEQ;
PushBack();
return ptok->tk_symb = ch;
case '=':
if (nch == '=')
return ptok->tk_symb = EQUAL;
/* The following piece of code tries to recognise
old-fashioned assignment operators `=op'
*/
switch (nch) {
case '+':
return ptok->tk_symb = PLUSAB;
case '-':
return ptok->tk_symb = MINAB;
case '*':
return ptok->tk_symb = TIMESAB;
case '/':
return ptok->tk_symb = DIVAB;
case '%':
return ptok->tk_symb = MODAB;
case '>':
case '<':
LoadChar(ch);
if (ch != nch) {
PushBack();
lexerror("illegal combination '=%c'",
nch);
}
return ptok->tk_symb =
nch == '<' ? LEFTAB : RIGHTAB;
case '&':
return ptok->tk_symb = ANDAB;
case '^':
return ptok->tk_symb = XORAB;
case '|':
return ptok->tk_symb = ORAB;
}
PushBack();
return ptok->tk_symb = ch;
case '>':
if (nch == '=')
return ptok->tk_symb = GREATEREQ;
if (nch == '>')
return ptok->tk_symb = RIGHT;
PushBack();
return ptok->tk_symb = ch;
case '|':
if (nch == '|')
return ptok->tk_symb = OR;
PushBack();
return ptok->tk_symb = ch;
}
case STIDF:
{
register char *tg = &buf[0];
register int pos = -1;
register int hash;
register struct idf *idef;
extern int idfsize; /* ??? */
hash = STARTHASH();
do { /* read the identifier */
if (++pos < idfsize) {
#ifndef NOPP
if (Unstacked) EnableMacros();
#endif
*tg++ = ch;
hash = ENHASH(hash, ch, pos);
}
LoadChar(ch);
} while (in_idf(ch));
hash = STOPHASH(hash);
if (ch != EOI)
PushBack();
*tg++ = '\0'; /* mark the end of the identifier */
idef = ptok->tk_idf = idf_hashed(buf, tg - buf, hash);
#ifndef NOPP
if (idef->id_macro && ReplaceMacros && replace(idef))
/* macro replacement should be performed */
goto again;
if (UnknownIdIsZero && idef->id_reserved != SIZEOF) {
ptok->tk_ival = (arith)0;
ptok->tk_fund = INT;
return ptok->tk_symb = INTEGER;
}
#endif NOPP
ptok->tk_symb = (
idef->id_reserved ? idef->id_reserved
: idef->id_def && idef->id_def->df_sc == TYPEDEF ?
TYPE_IDENTIFIER
: IDENTIFIER
);
return IDENTIFIER;
}
case STCHAR: /* character constant */
{
register arith val = 0, size = 0;
LoadChar(ch);
if (ch == '\'')
lexerror("character constant too short");
else
while (ch != '\'') {
if (ch == '\n') {
lexerror("newline in character constant");
LineNumber++;
break;
}
if (ch == '\\') {
LoadChar(ch);
if (ch == '\n')
LineNumber++;
ch = quoted(ch);
}
if (ch >= 128) ch -= 256;
val = val*256 + ch;
size++;
LoadChar(ch);
}
if (size > int_size)
lexerror("character constant too long");
ptok->tk_ival = val;
ptok->tk_fund = INT;
return ptok->tk_symb = INTEGER;
}
case STSTR: /* string */
ptok->tk_bts = string_token("string", '"', &(ptok->tk_len));
return ptok->tk_symb = STRING;
case STNUM: /* a numeric constant */
{
/* It should be noted that 099 means 81(decimal) and
099.5 means 99.5 . This severely limits the tricks
we can use to scan a numeric value.
*/
register char *np = &buf[1];
register int base = 10;
register int vch;
register arith val = 0;
if (ch == '.') { /* an embarrassing ambiguity */
#ifndef NOFLOAT
LoadChar(vch);
PushBack();
if (!is_dig(vch)) /* just a `.' */
return ptok->tk_symb = ch;
*np++ = '0';
/* in the rest of the compiler, all floats
have to start with a digit.
*/
#else NOFLOAT
return ptok->tk_symb = ch;
#endif NOFLOAT
}
if (ch == '0') {
*np++ = ch;
LoadChar(ch);
if (ch == 'x' || ch == 'X') {
base = 16;
LoadChar(ch);
}
else
base = 8;
}
while (vch = val_in_base(ch, base), vch >= 0) {
val = val*base + vch;
if (np < &buf[NUMSIZE])
*np++ = ch;
LoadChar(ch);
}
if (ch == 'l' || ch == 'L') {
ptok->tk_ival = val;
ptok->tk_fund = LONG;
return ptok->tk_symb = INTEGER;
}
#ifndef NOFLOAT
if (base == 16 || !(ch == '.' || ch == 'e' || ch == 'E'))
#endif NOFLOAT
{
PushBack();
ptok->tk_ival = val;
/* The semantic analyser must know if the
integral constant is given in octal/hexa-
decimal form, in which case its type is
UNSIGNED, or in decimal form, in which case
its type is signed, indicated by
the fund INTEGER.
*/
ptok->tk_fund =
(base == 10 || (base == 8 && val == (arith)0))
? INTEGER : UNSIGNED;
return ptok->tk_symb = INTEGER;
}
/* where's the test for the length of the integral ??? */
#ifndef NOFLOAT
if (ch == '.'){
if (np < &buf[NUMSIZE])
*np++ = ch;
LoadChar(ch);
}
while (is_dig(ch)){
if (np < &buf[NUMSIZE])
*np++ = ch;
LoadChar(ch);
}
if (ch == 'e' || ch == 'E') {
if (np < &buf[NUMSIZE])
*np++ = ch;
LoadChar(ch);
if (ch == '+' || ch == '-') {
if (np < &buf[NUMSIZE])
*np++ = ch;
LoadChar(ch);
}
if (!is_dig(ch)) {
lexerror("malformed floating constant");
if (np < &buf[NUMSIZE])
*np++ = ch;
}
while (is_dig(ch)) {
if (np < &buf[NUMSIZE])
*np++ = ch;
LoadChar(ch);
}
}
PushBack();
*np++ = '\0';
buf[0] = '-'; /* good heavens... */
if (np == &buf[NUMSIZE+1]) {
lexerror("floating constant too long");
ptok->tk_fval = Salloc("0.0",(unsigned) 5) + 1;
}
else
ptok->tk_fval = Salloc(buf,(unsigned) (np - buf)) + 1;
return ptok->tk_symb = FLOATING;
#endif NOFLOAT
}
case STEOI: /* end of text on source file */
return ptok->tk_symb = EOI;
default: /* this cannot happen */
crash("bad class for char 0%o", ch);
}
/*NOTREACHED*/
}
skipcomment()
{
/* The last character read has been the '*' of '/_*'. The
characters, except NL and EOI, between '/_*' and the first
occurring '*_/' are not interpreted.
NL only affects the LineNumber. EOI is not legal.
Important note: it is not possible to stop skipping comment
beyond the end-of-file of an included file.
EOI is returned by LoadChar only on encountering EOF of the
top-level file...
*/
register int c;
NoUnstack++;
LoadChar(c);
do {
while (c != '*') {
if (class(c) == STNL)
++LineNumber;
else
if (c == EOI) {
NoUnstack--;
return;
}
LoadChar(c);
} /* last Character seen was '*' */
LoadChar(c);
} while (c != '/');
NoUnstack--;
}
char *
string_token(nm, stop_char, plen)
char *nm;
int *plen;
{
register int ch;
register int str_size;
register char *str = Malloc((unsigned) (str_size = ISTRSIZE));
register int pos = 0;
LoadChar(ch);
while (ch != stop_char) {
if (ch == '\n') {
lexerror("newline in %s", nm);
LineNumber++;
break;
}
if (ch == EOI) {
lexerror("end-of-file inside %s", nm);
break;
}
if (ch == '\\') {
LoadChar(ch);
if (ch == '\n') {
LineNumber++;
LoadChar(ch);
continue;
}
ch = quoted(ch);
}
str[pos++] = ch;
if (pos == str_size)
str = Srealloc(str, (unsigned) (str_size += RSTRSIZE));
LoadChar(ch);
}
str[pos++] = '\0'; /* for filenames etc. */
*plen = pos;
return str;
}
int
quoted(ch)
register int ch;
{
/* quoted() replaces an escaped character sequence by the
character meant.
*/
/* first char after backslash already in ch */
if (!is_oct(ch)) { /* a quoted char */
switch (ch) {
case 'n':
ch = '\n';
break;
case 't':
ch = '\t';
break;
case 'b':
ch = '\b';
break;
case 'r':
ch = '\r';
break;
case 'f':
ch = '\f';
break;
}
}
else { /* a quoted octal */
register int oct = 0, cnt = 0;
do {
oct = oct*8 + (ch-'0');
LoadChar(ch);
} while (is_oct(ch) && ++cnt < 3);
PushBack();
ch = oct;
}
return ch&0377;
}
/* provisional */
int
val_in_base(ch, base)
register int ch;
{
return
is_dig(ch) ? ch - '0'
: base != 16 ? -1
: is_hex(ch) ? (ch - 'a' + 10) & 017
: -1;
}