ack/lang/m2/m2mm/LLlex.c

/*
 * (c) copyright 1987 by the Vrije Universiteit, Amsterdam, The Netherlands.
 * See the copyright notice in the ACK home directory, in the file "Copyright".
 *
 * Author: Ceriel J.H. Jacobs
 */

/* L E X I C A L   A N A L Y S E R   F O R   M O D U L A - 2 */

/* $Header$ */

#include	<alloc.h>
#include	"idfsize.h"
#include	"idf.h"
#include	"LLlex.h"
#include	"input.h"
#include	"f_info.h"
#include	"Lpars.h"
#include	"class.h"

/* "dot" is the token that LLlex() fills in and whose tk_symb it returns;
   "aside" is copied into "dot" by LLlex() when a token has been put aside.
*/
struct token	dot,
		aside;
/* LLlex() stores at most "idfsize" characters of an identifier in idfbuf. */
int		idfsize = IDFSIZE;
/* Set to 1 by SkipComment() when a comment starts with "$F". */
int		ForeignFlag;

/* Set when EOI was read where a look-ahead character would otherwise have
   been pushed back; LLlex() then takes EOI as its next character instead of
   reading from the input, and clears the flag.
*/
static int	eofseen;

STATIC
SkipComment()
{
	/*	Skip a Modula-2 comment (* ... *). The opening "(*" has
		already been read by the caller.
		Comments may be nested (par. 3.5), so the nesting depth is
		tracked and the scan only stops at the "*)" that closes the
		outermost comment.
		A comment that starts with "$F" sets ForeignFlag.
		Reaching EOI inside the comment is reported as
		"unterminated comment".
	*/
	register int ch;
	register int CommentLevel = 0;

	LoadChar(ch);
	if (ch == '$') {
		LoadChar(ch);
		switch(ch) {
		case 'F':
			/* Foreign; This definition module has an
			   implementation in another language.
			   In this case, check that the object file is present
			   and don't generate a rule for it.
			*/
			ForeignFlag = 1;
			break;
		default:
			/* Not a recognized option. The character stays in
			   "ch" and is NOT pushed back: the loop below
			   examines "ch" before it reads more input, so a
			   pushed-back character would be seen twice (a
			   newline right after the '$' would then be
			   counted twice).
			*/
			break;
		}
	}
	for (;;) {
		if (class(ch) == STNL) {
			LineNumber++;
		}
		else if (ch == '(') {
			LoadChar(ch);
			if (ch == '*') CommentLevel++;
			/* not "(*": look at the new character itself */
			else continue;
		}
		else if (ch == '*') {
			LoadChar(ch);
			if (ch == ')') {
				CommentLevel--;
				/* closed the outermost comment */
				if (CommentLevel < 0) break;
			}
			/* not "*)": look at the new character itself */
			else continue;
		}
		else if (ch == EOI) {
			lexerror("unterminated comment");
			break;
		}
		LoadChar(ch);
	}
}

STATIC
GetString(upto)
{
	/*	Consume the rest of a Modula-2 string. The opening delimiter
		has already been read by the caller and is passed in "upto";
		reading stops after the matching closing delimiter.
		A newline or EOI before the closing delimiter is reported
		and also ends the string; a newline is counted in LineNumber.
	*/
	register int c;

	for (;;) {
		LoadChar(c);
		if (c == upto) break;
		if (class(c) == STNL) {
			lexerror("newline in string");
			LineNumber++;
			break;
		}
		if (c == EOI) {
			lexerror("end-of-file in string");
			break;
		}
	}
}

/* Message that CheckForLineDirective() reports for a malformed directive. */
static char *s_error = "illegal line directive";

STATIC int
getch()
{
	register int ch;

	for (;;) {
		LoadChar(ch);
		if ((ch & 0200) && ch != EOI) {
			error("non-ascii '\\%03o' read", ch & 0377);
			continue;
		}
		break;
	}
	if (ch == EOI) {
		eofseen = 1;
		return '\n';
	}
	return ch;
}

CheckForLineDirective()
{
	/*	Called by LLlex() right after a newline has been read.
		If the next character is not '#', it is pushed back and
		nothing else happens.
		Otherwise the rest of the line is consumed as a directive
		of the form
			# <number> "<filename>"
		in which the file name part is optional. On success
		LineNumber becomes <number> and, if a file name different
		from the current one is given, FileName is replaced by a
		copy of it.
		A directive without a number, with an unterminated file
		name, or cut off by end-of-file is reported with s_error.
	*/
	register int ch = getch();
	register int	i = 0;
	char		buf[IDFSIZE + 2];
	register char	*c = buf;


	if (ch != '#') {
		PushBack();
		return;
	}
	do {	/*
		 * Skip to next digit
		 * Do not skip newlines
		 */
		ch = getch();
		if (class(ch) == STNL) {
			LineNumber++;
			error(s_error);
			return;
		}
	} while (class(ch) != STNUM);
	while (class(ch) == STNUM)  {
		i = i*10 + (ch - '0');
		ch = getch();
	}
	/* Look for the opening quote of the optional file name */
	while (ch != '"' && class(ch) != STNL) ch = getch();
	if (ch == '"') {
		c = buf;
		for (;;) {
			ch = getch();
			if (class(ch) == STNL) {
				LineNumber++;
				error(s_error);
				return;
			}
			if (ch == '"') break;
			/*
			 * Never write beyond buf: a name that does not
			 * fit is truncated, one position is kept free
			 * for the terminating null byte.
			 */
			if (c < &buf[sizeof(buf) - 1]) *c++ = ch;
		}
		*c = '\0';
		/* Skip whatever follows the file name on this line */
		do {
			ch = getch();
		} while (class(ch) != STNL);
		/*
		 * Remember the file name
		 */
		if (!eofseen && strcmp(FileName,buf)) {
			FileName = Salloc(buf,(unsigned) strlen(buf) + 1);
		}
	}
	if (eofseen) {
		/* getch() delivered the newline in place of EOI */
		error(s_error);
		return;
	}
	LineNumber = i;
}

/* Buffer in which LLlex() collects the characters of an identifier. */
char idfbuf[IDFSIZE + 2];

int
LLlex()
{
	/*	LLlex() is the Lexical Analyzer.
		It fills in the token "dot" and returns its tk_symb; at
		end of input the value returned is -1.
		The putting aside of tokens is taken into account: if
		ASIDE is set, the token in "aside" is delivered and ASIDE
		is cleared.
		A character that was read as look-ahead is pushed back,
		except EOI: in that case "eofseen" is set, so that the next
		call takes EOI as its first character without reading.
	*/
	register struct token *tk = &dot;
	register int ch, nch;

	if (ASIDE)	{	/* a token is put aside		*/
		*tk = aside;
		ASIDE = 0;
		return tk->tk_symb;
	}

again1:
	if (eofseen) {
		eofseen = 0;
		ch = EOI;
	}
	else {
again:
		LoadChar(ch);
		if ((ch & 0200) && ch != EOI) {
			error("non-ascii '\\%03o' read", ch & 0377);
			goto again;
		}
	}

	tk->tk_lineno = LineNumber;

	switch (class(ch))	{

	case STNL:
		LineNumber++;
		CheckForLineDirective();
		/* CheckForLineDirective() may have set eofseen */
		goto again1;

	case STSKIP:
		goto again;

	case STGARB:
		/* characters in the range 040 .. 0176 are shown as such */
		if ((unsigned) ch - 040 < 0137)	{
			lexerror("garbage char %c", ch);
		}
		else	lexerror("garbage char \\%03o", ch);
		goto again;

	case STSIMP:
		if (ch == '(')	{
			/* could be the start of a comment */
			LoadChar(nch);
			if (nch == '*')	{
				SkipComment();
				goto again;
			}
			else if (nch == EOI) eofseen = 1;
			else PushBack();
		}
		if (ch == '&') return tk->tk_symb = AND;
		if (ch == '~') return tk->tk_symb = NOT;
		return tk->tk_symb = ch;

	case STCOMP:
		/* possibly the first character of a two-character token */
		LoadChar(nch);
		switch (ch)	{

		case '.':
			if (nch == '.')	{
				return tk->tk_symb = UPTO;
			}
			break;

		case ':':
			if (nch == '=')	{
				return tk->tk_symb = BECOMES;
			}
			break;

		case '<':
			if (nch == '=')	{
				return tk->tk_symb = LESSEQUAL;
			}
			if (nch == '>') {
				/* "<>" is delivered as '#' */
				return tk->tk_symb = '#';
			}
			break;

		case '>':
			if (nch == '=')	{
				return tk->tk_symb = GREATEREQUAL;
			}
			break;

		default :
			crash("(LLlex, STCOMP)");
		}
		/* single-character token: give the look-ahead back */
		if (nch == EOI) eofseen = 1;
		else PushBack();
		return tk->tk_symb = ch;

	case STIDF:
	{
		register char *tag = &idfbuf[0];
		register struct idf *id;

		do	{
			/* characters beyond idfsize are read but dropped */
			if (tag - idfbuf < idfsize) *tag++ = ch;
			LoadChar(ch);
		} while(in_idf(ch));

		if (ch == EOI) eofseen = 1;
		else PushBack();
		*tag++ = '\0';

		tk->TOK_IDF = id = findidf(idfbuf);
		return tk->tk_symb = id && id->id_reserved ? id->id_reserved : IDENT;
	}

	case STSTR:
		/* "ch" is the opening delimiter and also the closing one */
		GetString(ch);
		return tk->tk_symb = STRING;

	case STNUM:
	{
		/*	The problem arising with the "parsing" of a number
			is that we don't know the base in advance so we
			have to read the number with the help of a rather
			complex finite automaton.
			Only the kind of the number (INTEGER or REAL) is
			determined here; its value is not computed.
		*/
		enum statetp {Oct,OptHex,Hex,Dec,OctEndOrHex,End,OptReal,Real};
		register enum statetp state;
		state = is_oct(ch) ? Oct : Dec;
		LoadChar(ch);
		for (;;) {
			switch(state) {
			case Oct:
				while (is_oct(ch))	{
					LoadChar(ch);
				}
				if (ch == 'B' || ch == 'C') {
					state = OctEndOrHex;
					break;
				}
				/* Fall Through */
			case Dec:
				while (is_dig(ch))	{
					LoadChar(ch);
				}
				if (ch == 'D') state = OptHex;
				else if (is_hex(ch)) state = Hex;
				else if (ch == '.') state = OptReal;
				else {
					state = End;
					/* an 'H' is part of the number */
					if (ch == 'H') ;
					else if (ch == EOI) eofseen = 1;
					else PushBack();
				}
				break;

			case OptHex:
				/*	A 'D' followed the decimal digits.
					It is either a hex digit or the
					last character of the number.
				*/
				LoadChar(ch);
				if (is_hex(ch)) {
					state = Hex;
				}
				else {
					/*	Handle the character after
						the 'D' as in state Dec:
						only an 'H' belongs to the
						number, anything else is
						look-ahead and must not be
						lost.
					*/
					state = End;
					if (ch == 'H') ;
					else if (ch == EOI) eofseen = 1;
					else PushBack();
				}
				break;

			case Hex:
				while (is_hex(ch))	{
					LoadChar(ch);
				}
				state = End;
				if (ch != 'H') {
					lexerror("H expected after hex number");
					if (ch == EOI) eofseen = 1;
					else PushBack();
				}
				break;

			case OctEndOrHex:
				/*	A 'B' or 'C' followed the octal
					digits. It either ends an octal
					number or is a hex digit.
				*/
				LoadChar(ch);
				if (ch == 'H') {
					state = End;
					break;
				}
				if (is_hex(ch)) {
					state = Hex;
					break;
				}
				if (ch == EOI) eofseen = 1;
				else PushBack();
				/* Fall through */

			case End:
				return tk->tk_symb = INTEGER;

			case OptReal:
				/*	The '.' could be the first of the '..'
					token. At this point, we need a
					look-ahead of two characters.
				*/
				LoadChar(ch);
				if (ch == '.') {
					/*	Indeed the '..' token
					*/
					PushBack();
					PushBack();
					state = End;
					break;
				}
				state = Real;
				break;
			}
			if (state == Real) break;
		}

		while (is_dig(ch)) {
			/* 	Fractional part
			*/
			LoadChar(ch);
		}

		if (ch == 'E' || ch == 'D') {
			/*	Scale factor
			*/
			if (ch == 'D') {
				/*	A 'D' only starts a scale factor if
					a sign or a digit follows. That
					character is now in "ch" and is
					examined below, so nothing more is
					read here.
				*/
				LoadChar(ch);
				if (!(ch == '+' || ch == '-' || is_dig(ch)))
					goto noscale;
			}
			else	LoadChar(ch);
			if (ch == '+' || ch == '-') {
				/*	Signed scalefactor
				*/
				LoadChar(ch);
			}
			if (is_dig(ch)) {
				do {
					LoadChar(ch);
				} while (is_dig(ch));
			}
			else {
				lexerror("bad scale factor");
			}
		}

noscale:
		if (ch == EOI) eofseen = 1;
		else PushBack();

		return tk->tk_symb = REAL;

		/*NOTREACHED*/
	}

	case STEOI:
		return tk->tk_symb = -1;

	case STCHAR:
	default:
		crash("(LLlex) Impossible character class");
		/*NOTREACHED*/
	}
	/*NOTREACHED*/
}