ack/lang/cem/cpp.ansi/LLlex.c

/*
 * (c) copyright 1987 by the Vrije Universiteit, Amsterdam, The Netherlands.
 * See the copyright notice in the ACK home directory, in the file "Copyright".
 */
/* $Id$ */
/*		    L E X I C A L   A N A L Y Z E R			*/

#include	"parameters.h"

#include	<alloc.h>
#include	"input.h"
#include	"arith.h"
#include	"macro.h"
#include	"idf.h"
#include	"LLlex.h"
#include	"Lpars.h"
#include	"class.h"
#include	"bits.h"

/* Size of the local buffer in which GetToken() collects an identifier. */
#define	BUFSIZ	1024

/* The token that LLlex() has GetToken() fill in. */
struct token dot;

int ReplaceMacros = 1;		/* replacing macros			*/
int AccDefined = 0;		/* accept "defined(...)"		*/
int UnknownIdIsZero = 0;	/* interpret unknown id as integer 0	*/
int Unstacked = 0;		/* an unstack is done			*/
int AccFileSpecifier = 0;	/* return filespecifier <...>		*/
int LexSave = 0;                /* last character read by GetChar       */
extern int InputLevel;		/* # of current macro expansions	*/

/* Both are defined further down in this file. */
extern char	*string_token();
extern arith	char_constant();
/* NOTE(review): the FLG_ flags are not referenced by the code in this file. */
#define		FLG_ESEEN	0x01	/* possibly a floating point number */
#define		FLG_DOTSEEN	0x02	/* certainly a floating point number */

/* Defined below; GetToken() calls it when it meets a comment opener. */
void skipcomment();

int
LLlex()
{
	/*	Parser entry point: deliver the next token in `dot'.
		Once the current symbol is EOF no more input is read and
		EOF keeps being returned.
	*/
	if (DOT == EOF)
		return EOF;
	return GetToken(&dot);
}


int
GetToken(ptok)
	register struct token *ptok;
{
	/*	GetToken() is the actual token recognizer.  It reads
		characters through GetChar(), stores the next token in
		*ptok and returns the symbol of that token, which is also
		stored in ptok->tk_symb.  A newline and the end of the
		input both yield EOF.

		An identifier for which findidf() delivers a macro is handed
		to replace() when ReplaceMacros is set; if the replacement
		succeeds the input is scanned again.

		Besides the symbol, ptok->tk_val is set for INTEGER and
		ptok->tk_str for STRING, FILESPECIFIER and IDENTIFIER;
		these strings are freshly allocated.
	*/
	char buf[BUFSIZ];
	register int ch, nch;

again:	/* rescan the input after an error or replacement	*/
	ch = GetChar();
	if ((ch & 0200) && ch != EOI) /* stop on non-ascii character */
		fatal("non-ascii '\\%03o' read", ch & 0377);

	switch (class(ch)) {	/* detect character class	*/
	case STNL:		/* newline, vertical space or formfeed	*/
		LineNumber++;
		return ptok->tk_symb = EOF;
	case STSKIP:		/* just skip the skip characters	*/
		goto again;
	case STGARB:		/* garbage character			*/
garbage:
		/* printable characters are shown as such, others in octal */
		if (040 < ch && ch < 0177)
			error("garbage char %c", ch);
		else
			error("garbage char \\%03o", ch);
		goto again;
	case STSIMP:	/* a simple character, no part of compound token*/
		return ptok->tk_symb = ch;
	case STCOMP:	/* maybe the start of a compound token		*/
		nch = GetChar();		/* character lookahead	*/
		/* Each case either consumes nch as part of the operator
		   or gives it back with UnGetChar().
		*/
		switch (ch) {
		case '!':
			if (nch == '=')
				return ptok->tk_symb = NOTEQUAL;
			UnGetChar();
			return ptok->tk_symb = ch;
		case '&':
			if (nch == '&')
				return ptok->tk_symb = AND;
			else if (nch == '=')
				return ptok->tk_symb = ANDAB;
			UnGetChar();
			return ptok->tk_symb = ch;
		case '+':
			if (nch == '+')
				return ptok->tk_symb = PLUSPLUS;
			else if (nch == '=')
				return ptok->tk_symb = PLUSAB;
			UnGetChar();
			return ptok->tk_symb = ch;
		case '-':
			if (nch == '-')
				return ptok->tk_symb = MINMIN;
			else if (nch == '>')
				return ptok->tk_symb = ARROW;
			else if (nch == '=')
				return ptok->tk_symb = MINAB;
			UnGetChar();
			return ptok->tk_symb = ch;
		case '<':
			if (AccFileSpecifier) {
				/* everything up to '>' is one file name */
				UnGetChar();	/* pushback nch */
				ptok->tk_str =
					string_token("file specifier", '>');
				return ptok->tk_symb = FILESPECIFIER;
			} else if (nch == '<') {
				if ((nch = GetChar()) == '=')
					return ptok->tk_symb = LEFTAB;
				UnGetChar();
				return ptok->tk_symb = LEFT;
			} else if (nch == '=')
				return ptok->tk_symb = LESSEQ;
			UnGetChar();
			return ptok->tk_symb = ch;
		case '=':
			if (nch == '=')
				return ptok->tk_symb = EQUAL;
			UnGetChar();
			return ptok->tk_symb = ch;
		case '>':
			if (nch == '=')
				return ptok->tk_symb = GREATEREQ;
			else if (nch == '>') {
				if ((nch = GetChar()) == '=')
					return ptok->tk_symb = RIGHTAB;
				UnGetChar();
				return ptok->tk_symb = RIGHT;
			}
			UnGetChar();
			return ptok->tk_symb = ch;
		case '|':
			if (nch == '|')
				return ptok->tk_symb = OR;
			else if (nch == '=')
				return ptok->tk_symb = ORAB;
			UnGetChar();
			return ptok->tk_symb = ch;
		case '%':
			if (nch == '=')
				return ptok->tk_symb = MODAB;
			UnGetChar();
			return ptok->tk_symb = ch;
		case '*':
			if (nch == '=')
				return ptok->tk_symb = TIMESAB;
			UnGetChar();
			return ptok->tk_symb = ch;
		case '^':
			if (nch == '=')
				return ptok->tk_symb = XORAB;
			UnGetChar();
			return ptok->tk_symb = ch;
		case '/':
			/* a comment is only skipped here when no macro
			   expansion is in progress (InputLevel == 0)
			*/
			if (nch == '*' && !InputLevel) {
				skipcomment();
				goto again;
			}
			else if (nch == '=')
				return ptok->tk_symb = DIVAB;
			UnGetChar();
			return ptok->tk_symb = ch;
		default:
			crash("bad class for char 0%o", ch);
			/* NOTREACHED */
		}
	case STCHAR:				/* character constant	*/
		ptok->tk_val = char_constant("character");
		return ptok->tk_symb = INTEGER;
	case STSTR:					/* string	*/
		ptok->tk_str = string_token("string", '"');
		return ptok->tk_symb = STRING;
	case STELL:		/* wide character constant/string prefix */
		nch = GetChar();
		if (nch == '"') {
			ptok->tk_str =
				string_token("wide character string", '"');
			return ptok->tk_symb = STRING;
		} else if (nch == '\'') {
			ptok->tk_val = char_constant("wide character");
			return ptok->tk_symb = INTEGER;
		}
		/* not a prefix after all: the start of an identifier */
		UnGetChar();
		/* fallthrough */
	case STIDF:
	{
		extern int idfsize;		/* ??? */
		register char *tg = &buf[0];
		/* characters beyond maxpos are read but not stored */
		register char *maxpos = &buf[idfsize];
		int NoExpandNext = 0;

		/* tstmac: skip the macro lookup when bits[ch] lacks bit bx.
		   NOTE(review): presumably bits[] describes the leading
		   characters of the known macro names -- confirm in bits.h.
		   cpy:	   store ch in buf.
		   load:   read the next character; leave when it cannot be
			   part of an identifier.
		*/
#define tstmac(bx)	if (!(bits[ch] & bx)) goto nomac
#define cpy		*tg++ = ch
#define load		(ch = GetChar()); if (!in_idf(ch)) goto endidf

		if (Unstacked) EnableMacros();  /* unstack macro's when allowed. */
		if (ch == NOEXPM)  {
			/* the identifier that follows is not to be replaced */
			NoExpandNext = 1;
			ch = GetChar();
		}
#ifdef DOBITS
		cpy; tstmac(bit0); load;
		cpy; tstmac(bit1); load;
		cpy; tstmac(bit2); load;
		cpy; tstmac(bit3); load;
		cpy; tstmac(bit4); load;
		cpy; tstmac(bit5); load;
		cpy; tstmac(bit6); load;
		cpy; tstmac(bit7); load;
#endif
		for(;;) {
			if (tg < maxpos) {
				cpy;

			}
			load;
		}
	endidf:
		/*if (ch != EOI) UnGetChar();*/
		UnGetChar();
		*tg++ = '\0';	/* mark the end of the identifier	*/
		if (ReplaceMacros) {
			register struct idf *idef = findidf(buf);

			if (idef && idef->id_macro && !NoExpandNext) {
				if (replace(idef))
					goto again;
			}
		}

	nomac:			/* buf can already be null-terminated. soit */
		/* collect what is left of the identifier (nothing when we
		   arrive here by way of endidf)
		*/
		ch = GetChar();
		while (in_idf(ch)) {
			if (tg < maxpos) *tg++ = ch;
			ch = GetChar();
		}
		UnGetChar();
		*tg++ = '\0';   /* mark the end of the identifier       */

		NoExpandNext = 0;
		if (UnknownIdIsZero) {
			ptok->tk_val = (arith)0;
			return ptok->tk_symb = INTEGER;
		}
		ptok->tk_str = Malloc((unsigned)(tg - buf));
		strcpy(ptok->tk_str, buf);
		/* store the symbol in the token, like every other return */
		return ptok->tk_symb = IDENTIFIER;
	}
	case STNUM:				/* a numeric constant	*/
	{			/* it may only be an integer constant */
		register int base = 10, vch;
		register arith val = 0;
		int ovfl = 0;
		arith ubound;

		/* Since the preprocessor only knows integers and has
		 * nothing to do with ellipsis we just return when the
		 * pp-number starts with a '.'
		 */
		if (ch == '.') {
			return ptok->tk_symb = ch;
		}
		if (ch == '0') {
			ch = GetChar();
			if (ch == 'x' || ch == 'X') {
				base = 16;
				ch = GetChar();
			} else {
				base = 8;
			}

		}
		/* The bound depends on the base, so it can only be computed
		 * now that the base is known.  A value above it no longer
		 * fits after multiplication by base, not even when the
		 * result is taken as unsigned.
		 */
		ubound = max_arith/(base/2);
		while ((vch = val_in_base(ch, base)) >= 0) {
			if (val < 0 || val > ubound) ovfl++;
			val *= base;
			if (val < 0 && val + vch >= 0) ovfl++;
			val += vch;
			ch = GetChar();
		}
		/* accept the suffixes u, l, ul and lu, in either case */
		ptok->tk_unsigned = 0;
		if (ch == 'u' || ch == 'U') {
			ptok->tk_unsigned = 1;
			ch = GetChar();
			if (ch == 'l' || ch == 'L') {
				ch = GetChar();
			}
		}
		else if (ch == 'l' || ch == 'L') {
			ch = GetChar();
			if (ch == 'u' || ch == 'U') {
				ptok->tk_unsigned = 1;
				ch = GetChar();
			}
		}
		if (ovfl) {
			warning("overflow in constant");
			ptok->tk_unsigned = 1;
		}
		else if (val < 0) {
			/* give warning??? */
			ptok->tk_unsigned = 1;
		}
		UnGetChar();
		ptok->tk_val = val;
		return ptok->tk_symb = INTEGER;
	}
	case STEOI:			/* end of text on source file	*/
		return ptok->tk_symb = EOF;
	case STMSPEC:
		/* these characters are garbage outside macro expansions */
		if (!InputLevel) goto garbage;
		if (ch == TOKSEP) goto again;
		/* fallthrough shouldn't happen */
	default:				/* this cannot happen	*/
		crash("bad class for char 0%o", ch);
	}
	/*NOTREACHED*/
}

void
skipcomment()
{
	/*	Called when the '*' that opens a comment has just been
		read.  Everything up to and including the first '*'
		that is directly followed by a '/' is discarded; newlines
		in between are only counted in LineNumber.  Meeting EOI
		ends the skipping as well.

		NoUnstack is raised for the duration of the skipping and
		lowered again on the way out.
	*/
	register int cur;

	NoUnstack++;
	cur = GetChar();
	for (;;) {
		if (cur == '*') {
			/* possibly the end: inspect the character after it */
			cur = GetChar();
			if (cur == '/')
				break;
			continue;
		}
		if (class(cur) == STNL)
			++LineNumber;
		else if (cur == EOI)
			break;
		cur = GetChar();
	}
	NoUnstack--;
}

arith
char_constant(nm)
	char *nm;
{
	register arith val = 0;
	register int ch;
	int size = 0;

	ch = GetChar();
	if (ch == '\'')
		error("%s constant too short", nm);
	else
	while (ch != '\'') {
		if (ch == '\n') {
			error("newline in %s constant", nm);
			LineNumber++;
			break;
		}
		if (ch == '\\')
			ch = quoted(GetChar());
		if (ch >= 128) ch -= 256;
		if (size < sizeof(arith))
			val |= ch << (8 * size);
		size++;
		ch = GetChar();
	}
	if (size > sizeof(arith))
		error("%s constant too long", nm);
	else if (size > 1)
		strict("%s constant includes more than one character", nm);
	return val;
}

char *
string_token(nm, stop_char)
	char *nm;
	int stop_char;	/* declared explicitly: implicit int is gone since C99 */
{
	/*	Collects the characters up to, but not including, the first
		unescaped stop_char and returns them as a null-terminated
		string in freshly allocated memory.  `nm' names the kind of
		string in the error messages.

		A newline or EOI before stop_char is reported and ends the
		string at that point.  Escape sequences are translated by
		quoted(), except while a file specifier is being read
		(AccFileSpecifier), where a backslash is an ordinary
		character.
	*/
	register int ch;
	register int str_size;
	register char *str = Malloc((unsigned) (str_size = ISTRSIZE));
	register int pos = 0;

	ch = GetChar();
	while (ch != stop_char) {
		if (ch == '\n') {
			error("newline in %s", nm);
			LineNumber++;
			break;
		}
		if (ch == EOI) {
			error("end-of-file inside %s", nm);
			break;
		}
		if (ch == '\\' && !AccFileSpecifier)
			ch = quoted(GetChar());
		str[pos++] = ch;
		/* grow as soon as the buffer is full, so that there is
		   always room for one more character or the terminator
		*/
		if (pos == str_size)
			str = Realloc(str, (unsigned)(str_size <<= 1));
		ch = GetChar();
	}
	str[pos++] = '\0'; /* for filenames etc. */
	/* give back the part of the buffer that was not used */
	str = Realloc(str, (unsigned)pos);
	return str;
}

int
quoted(ch)
	register int ch;
{
	/*	quoted() replaces an escaped character sequence by the
		character meant.  The first character after the backslash
		is passed in ch; further characters of the sequence are
		read with GetChar().  The result is always in the range
		0..0377.  A character that does not start a known escape
		sequence stands for itself.
	*/
	if (!is_oct(ch)) {		/* a quoted char */
		switch (ch) {
		case 'n':
			ch = '\n';
			break;
		case 't':
			ch = '\t';
			break;
		case 'b':
			ch = '\b';
			break;
		case 'r':
			ch = '\r';
			break;
		case 'f':
			ch = '\f';
			break;
		case 'a':		/* alert */
			ch = '\007';
			break;
		case 'v':		/* vertical tab */
			ch = '\013';
			break;
		case 'x':		/* quoted hex */
		{
			register int hex = 0;
			register int vch;

			/* all hex digits that follow belong to the escape */
			for (;;) {
				ch = GetChar();
				if (vch = val_in_base(ch, 16), vch == -1)
					break;
				/* Only the low 8 bits are returned, so
				   masking at each step gives the same result
				   and keeps a long run of digits from
				   overflowing the int.
				*/
				hex = (hex * 16 + vch) & 0377;
			}
			UnGetChar();
			ch = hex;
			break;
		}
		default:		/* the character itself */
			break;
		}
	} else {				/* a quoted octal */
		register int oct = 0, cnt = 0;

		/* at most three octal digits, the first of which is in ch */
		do {
			oct = oct*8 + (ch-'0');
			ch = GetChar();
		} while (is_oct(ch) && ++cnt < 3);
		UnGetChar();
		ch = oct;
	}
	return ch&0377;
}


int
val_in_base(ch, base)
	register int ch;
	int base;
{
	/*	Returns the value of ch as a digit in the given base
		(8, 10 or 16), or -1 when ch is not a digit of that base.
		Any other base is a fatal error.
	*/
	switch (base) {
	case 8:
		/* the octal digits are '0' up to and including '7' */
		return (is_dig(ch) && ch < '8') ? ch - '0' : -1;
	case 10:
		return is_dig(ch) ? ch - '0' : -1;
	case 16:
		/* the mask makes the expression yield 10..15 for the
		   upper case letters as well as for the lower case ones
		*/
		return is_dig(ch) ? ch - '0'
			: is_hex(ch) ? (ch - 'a' + 10) & 017
			: -1;
	default:
		fatal("(val_in_base) illegal base value %d", base);
		/* NOTREACHED */
	}
	return -1;	/* only reached should fatal() ever return */
}


int
GetChar()
{
	/*	Delivers the next input character after trigraph
		replacement (see trigraph()) and after removal of every
		backslash that is directly followed by a newline.  Each
		newline removed this way is counted in LineNumber.  The
		character delivered is also left in LexSave.
	*/
	register int cur;

	for (;;) {
		LoadChar(cur);

		/* possible start of a trigraph sequence */
		if (cur == '?')
			cur = trigraph();

		if (cur != '\\')
			break;

		/* a backslash: look at what follows it */
		LoadChar(cur);
		if (cur != '\n') {
			/* an ordinary backslash; give the lookahead back */
			PushBack();
			cur = '\\';
			break;
		}
		/* line continuation: drop both and start over */
		++LineNumber;
	}
	LexSave = cur;
	return cur;
}


int
trigraph()
{
	/*	Called by GetChar() when a question mark has been read.
		When that question mark is followed by a second one and by
		one of the nine characters handled below, the three form a
		trigraph and the character it stands for is returned.
		Otherwise everything read here is pushed back and the
		question mark itself is returned.
	*/
	register int nxt;

	LoadChar(nxt);
	if (nxt != '?') {
		PushBack();
		return '?';
	}

	LoadChar(nxt);
	switch (nxt) {		/* third character of the sequence */
	case '=':	return '#';
	case '(':	return '[';
	case '/':	return '\\';
	case ')':	return ']';
	case '\'':	return '^';
	case '<':	return '{';
	case '!':	return '|';
	case '>':	return '}';
	case '-':	return '~';
	default:
		break;
	}

	/* no trigraph: give back both characters read after the first */
	PushBack();
	PushBack();
	return '?';
}