ack/lang/m2/comp/LLlex.c

/*	LEXICAL ANALYSER FOR MODULA-2	*/

#include "input.h"
#include <alloc.h>
#include "f_info.h"
#include "Lpars.h"
#include "class.h"
#include "param.h"
#include "idf.h"
#include "LLlex.h"

long str2long();

struct token dot, aside;

static char *RcsId = "$Header$";

/*	Skip Modula-2 like comment (* ... *).
	Note that comment may be nested.
*/
static
SkipComment()
{
	register int ch;
	register int NestLevel = 0;

	LoadChar(ch);
	for (;;) {
		if (class(ch) == STNL) {
			LineNumber++;
		}
		else
		if (ch == '(') {
			LoadChar(ch);
			if (ch == '*') {
				++NestLevel;
			}
			else {
				continue;
			}
		}
		else
		if (ch == '*') {
			LoadChar(ch);
			if (ch == ')') {
				if (NestLevel-- == 0) {
					return;
				}
			}
			else {
				continue;
			}
		}
		LoadChar(ch);
	}
}

static char *
GetString(upto)
{
	register int ch;
	int str_size;
	char *str = Malloc(str_size = 32);
	register int pos = 0;
	
	LoadChar(ch);
	while (ch != upto)	{
		if (class(ch) == STNL)	{
			lexerror("newline in string");
			LineNumber++;
			break;
		}
		if (ch == EOI) {
			lexerror("end-of-file in string");
			break;
		}
		str[pos++] = ch;
		if (pos == str_size)	{
			str = Srealloc(str, str_size += 8);
		}
		LoadChar(ch);
	}
	str[pos] = '\0';
	return str;
}

/*	LLlex() plays the role of Lexical Analyzer for the parser.
	The putting aside of tokens is taken into account.
*/
int
LLlex()
{
	register struct token *tk = &dot;
	char buf[(IDFSIZE > NUMSIZE ? IDFSIZE : NUMSIZE) + 1];
	register int ch, nch;

	if (ASIDE)	{	/* a token is put aside		*/
		*tk = aside;
		ASIDE = 0;
		return tk->tk_symb;
	}
	tk->tk_lineno = LineNumber;

again:
	LoadChar(ch);
	if ((ch & 0200) && ch != EOI) {
		fatal("non-ascii '\\%03o' read", ch & 0377);
	}
	
	switch (class(ch))	{

	case STSKIP:
		goto again;

	case STNL:
		LineNumber++;
		tk->tk_lineno++;
		goto again;

	case STGARB:
		if (040 < ch && ch < 0177)	{
			lexerror("garbage char %c", ch);
		}
		else	{
			lexerror("garbage char \\%03o", ch);
		}
		goto again;

	case STSIMP:
		if (ch == '(')	{
			LoadChar(nch);
			if (nch == '*')	{
				SkipComment();
				goto again;
			}
			else	{
				PushBack(nch);
			}
		}
		return tk->tk_symb = ch;

	case STCOMP:
		LoadChar(nch);
		switch (ch)	{

		case '.':
			if (nch == '.')	{
				return tk->tk_symb = UPTO;
			}
			PushBack(nch);
			return tk->tk_symb = ch;

		case ':':
			if (nch == '=')	{
				return tk->tk_symb = BECOMES;
			}
			PushBack(nch);
			return tk->tk_symb = ch;

		case '<':
			if (nch == '=')	{
				return tk->tk_symb = LESSEQUAL;
			}
			else
			if (nch == '>') {
				return tk->tk_symb = UNEQUAL;
			}
			PushBack(nch);
			return tk->tk_symb = ch;

		case '>':
			if (nch == '=')	{
				return tk->tk_symb = GREATEREQUAL;
			}
			PushBack(nch);
			return tk->tk_symb = ch;

		default :
			crash("bad STCOMP");
		}

	case STIDF:
	{
		register char *tg = &buf[0];
		register struct idf *id;

		do	{
			if (tg - buf < IDFSIZE) *tg++ = ch;
			LoadChar(ch);
		} while(in_idf(ch));

		if (ch != EOI)
			PushBack(ch);
		*tg++ = '\0';

		id = tk->TOK_IDF = str2idf(buf, 1);
		if (!id) fatal("Out of memory");
		return tk->tk_symb = id->id_reserved ? id->id_reserved : IDENT;
	}

	case STSTR:
		tk->TOK_STR = GetString(ch);
		return tk->tk_symb = STRING;

	case STNUM:
	{
		/*	The problem arising with the "parsing" of a number
			is that we don't know the base in advance so we
			have to read the number with the help of a rather
			complex finite automaton.
			Excuses for the very ugly code!
		*/
		register char *np = &buf[1];
					/* allow a '-' to be added	*/

		*np++ = ch;
		
		LoadChar(ch);
		while (is_oct(ch))	{
			if (np < &buf[NUMSIZE]) {
				*np++ = ch;
			}
			LoadChar(ch);
		}
		switch (ch) {
		case 'H':
Shex:			*np++ = '\0';
			/* Type is integer */
			tk->TOK_INT = str2long(&buf[1], 16);
			return tk->tk_symb = INTEGER;

		case '8':
		case '9':
			do {
				if (np < &buf[NUMSIZE]) {
					*np++ = ch;
				}
				LoadChar(ch);
			} while (is_dig(ch));

			if (is_hex(ch))
				goto S2;
			if (ch == 'H')
				goto Shex;
			if (ch == '.')
				goto Sreal;
			PushBack(ch);
			goto Sdec;

		case 'B':
		case 'C':
			if (np < &buf[NUMSIZE]) {
				*np++ = ch;
			}
			LoadChar(ch);
			if (ch == 'H')
				goto Shex;
			if (is_hex(ch))
				goto S2;
			PushBack(ch);
			ch = *--np;
			*np++ = '\0';
			/*
			 * If (ch == 'C') type is a CHAR
			 * else type is an INTEGER
			 */
			tk->TOK_INT = str2long(&buf[1], 8);
			return tk->tk_symb = INTEGER;

		case 'A':
		case 'D':
		case 'E':
		case 'F':
S2:
			do {
				if (np < &buf[NUMSIZE]) {
					*np++ = ch;
				}
				LoadChar(ch);
			} while (is_hex(ch));
			if (ch != 'H') {
				lexerror("H expected after hex number");
				PushBack(ch);
			}
			goto Shex;

		case '.':
Sreal:
			/*	This '.' could be the first of the '..'
				token. At this point, we need a look-ahead
				of two characters.
			*/
			LoadChar(ch);
			if (ch == '.') {
				/*	Indeed the '..' token
				*/
				PushBack(ch);
				PushBack(ch);
				goto Sdec;
			}

			/* a real constant */
			if (np < &buf[NUMSIZE]) {
				*np++ = '.';
			}

			if (is_dig(ch)) {
				/* 	Fractional part
				*/
				do {
					if (np < &buf[NUMSIZE]) {
						*np++ = ch;
					}
					LoadChar(ch);
				} while (is_dig(ch));
			}
			
			if (ch == 'E') {
				/*	Scale factor
				*/
				if (np < &buf[NUMSIZE]) {
					*np++ = 'E';
				}
				LoadChar(ch);
				if (ch == '+' || ch == '-') {
					/*	Signed scalefactor
					*/
					if (np < &buf[NUMSIZE]) {
						*np++ = ch;
					}
					LoadChar(ch);
				}
				if (is_dig(ch)) {
					do {
						if (np < &buf[NUMSIZE]) {
							*np++ = ch;
						}
						LoadChar(ch);
					} while (is_dig(ch));
				}
				else {
					lexerror("bad scale factor");
				}
			}

			PushBack(ch);

			if (np == &buf[NUMSIZE + 1]) {
				lexerror("floating constant too long");
				tk->TOK_REL = Salloc("0.0", 5);
			}
			else {
				tk->TOK_REL = Salloc(buf, np - buf) + 1;
			}
			return tk->tk_symb = REAL;

		default:
			PushBack(ch);
Sdec:
			*np++ = '\0';
			/* Type is an integer */
			tk->TOK_INT = str2long(&buf[1], 10);
			return tk->tk_symb = INTEGER;
		}
		/*NOTREACHED*/
	}

	case STEOI:
		return tk->tk_symb = -1;

	case STCHAR:
	default:
		crash("bad character class %d", class(ch));
	}
	/*NOTREACHED*/
}
Initial version 1986-03-20 14:52:03 +00:00			`/* LEXICAL ANALYSER FOR MODULA-2 */`

			`#include "input.h"`
			`#include <alloc.h>`
			`#include "f_info.h"`
			`#include "Lpars.h"`
			`#include "class.h"`
			`#include "param.h"`
			`#include "idf.h"`
			`#include "LLlex.h"`

			`long str2long();`

			`struct token dot, aside;`

			`static char *RcsId = "$Header$";`

some improvements 1986-03-24 17:29:57 +00:00			`/* Skip Modula-2 like comment (* ... *).`
			`Note that comment may be nested.`
			`*/`
			`static`
			`SkipComment()`
Initial version 1986-03-20 14:52:03 +00:00			`{`
some improvements 1986-03-24 17:29:57 +00:00			`register int ch;`
			`register int NestLevel = 0;`

			`LoadChar(ch);`
			`for (;;) {`
			`if (class(ch) == STNL) {`
			`LineNumber++;`
			`}`
			`else`
			`if (ch == '(') {`
			`LoadChar(ch);`
			`if (ch == '*') {`
			`++NestLevel;`
			`}`
			`else {`
			`continue;`
			`}`
			`}`
			`else`
			`if (ch == '*') {`
			`LoadChar(ch);`
			`if (ch == ')') {`
			`if (NestLevel-- == 0) {`
			`return;`
			`}`
			`}`
			`else {`
			`continue;`
			`}`
			`}`
			`LoadChar(ch);`
Initial version 1986-03-20 14:52:03 +00:00			`}`
some improvements 1986-03-24 17:29:57 +00:00			`}`
Initial version 1986-03-20 14:52:03 +00:00
some improvements 1986-03-24 17:29:57 +00:00			`static char *`
			`GetString(upto)`
			`{`
			`register int ch;`
			`int str_size;`
			`char *str = Malloc(str_size = 32);`
			`register int pos = 0;`

			`LoadChar(ch);`
			`while (ch != upto) {`
			`if (class(ch) == STNL) {`
			`lexerror("newline in string");`
			`LineNumber++;`
			`break;`
			`}`
			`if (ch == EOI) {`
			`lexerror("end-of-file in string");`
			`break;`
			`}`
			`str[pos++] = ch;`
			`if (pos == str_size) {`
			`str = Srealloc(str, str_size += 8);`
			`}`
			`LoadChar(ch);`
			`}`
			`str[pos] = '\0';`
			`return str;`
Initial version 1986-03-20 14:52:03 +00:00			`}`

some improvements 1986-03-24 17:29:57 +00:00			`/* LLlex() plays the role of Lexical Analyzer for the parser.`
			`The putting aside of tokens is taken into account.`
			`*/`
Initial version 1986-03-20 14:52:03 +00:00			`int`
some improvements 1986-03-24 17:29:57 +00:00			`LLlex()`
Initial version 1986-03-20 14:52:03 +00:00			`{`
some improvements 1986-03-24 17:29:57 +00:00			`register struct token *tk = &dot;`
Initial version 1986-03-20 14:52:03 +00:00			`char buf[(IDFSIZE > NUMSIZE ? IDFSIZE : NUMSIZE) + 1];`
			`register int ch, nch;`

some improvements 1986-03-24 17:29:57 +00:00			`if (ASIDE) { /* a token is put aside */`
			`*tk = aside;`
			`ASIDE = 0;`
			`return tk->tk_symb;`
			`}`
			`tk->tk_lineno = LineNumber;`

Initial version 1986-03-20 14:52:03 +00:00			`again:`
			`LoadChar(ch);`
			`if ((ch & 0200) && ch != EOI) {`
			`fatal("non-ascii '\\%03o' read", ch & 0377);`
			`}`

			`switch (class(ch)) {`

			`case STSKIP:`
			`goto again;`

			`case STNL:`
			`LineNumber++;`
some improvements 1986-03-24 17:29:57 +00:00			`tk->tk_lineno++;`
Initial version 1986-03-20 14:52:03 +00:00			`goto again;`

			`case STGARB:`
			`if (040 < ch && ch < 0177) {`
			`lexerror("garbage char %c", ch);`
			`}`
			`else {`
			`lexerror("garbage char \\%03o", ch);`
			`}`
			`goto again;`

			`case STSIMP:`
			`if (ch == '(') {`
			`LoadChar(nch);`
			`if (nch == '*') {`
			`SkipComment();`
			`goto again;`
			`}`
			`else {`
			`PushBack(nch);`
			`}`
			`}`
			`return tk->tk_symb = ch;`

			`case STCOMP:`
			`LoadChar(nch);`
			`switch (ch) {`

			`case '.':`
			`if (nch == '.') {`
			`return tk->tk_symb = UPTO;`
			`}`
			`PushBack(nch);`
			`return tk->tk_symb = ch;`

			`case ':':`
			`if (nch == '=') {`
			`return tk->tk_symb = BECOMES;`
			`}`
			`PushBack(nch);`
			`return tk->tk_symb = ch;`

			`case '<':`
			`if (nch == '=') {`
			`return tk->tk_symb = LESSEQUAL;`
			`}`
			`else`
			`if (nch == '>') {`
			`return tk->tk_symb = UNEQUAL;`
			`}`
			`PushBack(nch);`
			`return tk->tk_symb = ch;`

			`case '>':`
			`if (nch == '=') {`
			`return tk->tk_symb = GREATEREQUAL;`
			`}`
			`PushBack(nch);`
			`return tk->tk_symb = ch;`

			`default :`
			`crash("bad STCOMP");`
			`}`

			`case STIDF:`
			`{`
			`register char *tg = &buf[0];`
			`register struct idf *id;`

			`do {`
			`if (tg - buf < IDFSIZE) *tg++ = ch;`
			`LoadChar(ch);`
			`} while(in_idf(ch));`

			`if (ch != EOI)`
			`PushBack(ch);`
			`*tg++ = '\0';`

			`id = tk->TOK_IDF = str2idf(buf, 1);`
			`if (!id) fatal("Out of memory");`
			`return tk->tk_symb = id->id_reserved ? id->id_reserved : IDENT;`
			`}`

			`case STSTR:`
			`tk->TOK_STR = GetString(ch);`
			`return tk->tk_symb = STRING;`

			`case STNUM:`
			`{`
			`/* The problem arising with the "parsing" of a number`
			`is that we don't know the base in advance so we`
			`have to read the number with the help of a rather`
			`complex finite automaton.`
			`Excuses for the very ugly code!`
			`*/`
			`register char *np = &buf[1];`
			`/* allow a '-' to be added */`

			`*np++ = ch;`

			`LoadChar(ch);`
			`while (is_oct(ch)) {`
			`if (np < &buf[NUMSIZE]) {`
			`*np++ = ch;`
			`}`
			`LoadChar(ch);`
			`}`
			`switch (ch) {`
			`case 'H':`
			`Shex: *np++ = '\0';`
			`/* Type is integer */`
			`tk->TOK_INT = str2long(&buf[1], 16);`
			`return tk->tk_symb = INTEGER;`

			`case '8':`
			`case '9':`
			`do {`
			`if (np < &buf[NUMSIZE]) {`
			`*np++ = ch;`
			`}`
			`LoadChar(ch);`
			`} while (is_dig(ch));`

			`if (is_hex(ch))`
			`goto S2;`
			`if (ch == 'H')`
			`goto Shex;`
			`if (ch == '.')`
			`goto Sreal;`
			`PushBack(ch);`
			`goto Sdec;`

			`case 'B':`
			`case 'C':`
			`if (np < &buf[NUMSIZE]) {`
			`*np++ = ch;`
			`}`
			`LoadChar(ch);`
			`if (ch == 'H')`
			`goto Shex;`
			`if (is_hex(ch))`
			`goto S2;`
			`PushBack(ch);`
			`ch = *--np;`
			`*np++ = '\0';`
			`/*`
			`* If (ch == 'C') type is a CHAR`
			`* else type is an INTEGER`
			`*/`
			`tk->TOK_INT = str2long(&buf[1], 8);`
			`return tk->tk_symb = INTEGER;`

			`case 'A':`
			`case 'D':`
			`case 'E':`
			`case 'F':`
			`S2:`
			`do {`
			`if (np < &buf[NUMSIZE]) {`
			`*np++ = ch;`
			`}`
			`LoadChar(ch);`
			`} while (is_hex(ch));`
			`if (ch != 'H') {`
			`lexerror("H expected after hex number");`
			`PushBack(ch);`
			`}`
			`goto Shex;`

			`case '.':`
			`Sreal:`
			`/* This '.' could be the first of the '..'`
			`token. At this point, we need a look-ahead`
			`of two characters.`
			`*/`
			`LoadChar(ch);`
			`if (ch == '.') {`
			`/* Indeed the '..' token`
			`*/`
			`PushBack(ch);`
			`PushBack(ch);`
			`goto Sdec;`
			`}`

			`/* a real constant */`
			`if (np < &buf[NUMSIZE]) {`
			`*np++ = '.';`
			`}`

			`if (is_dig(ch)) {`
			`/* Fractional part`
			`*/`
			`do {`
			`if (np < &buf[NUMSIZE]) {`
			`*np++ = ch;`
			`}`
			`LoadChar(ch);`
			`} while (is_dig(ch));`
			`}`

			`if (ch == 'E') {`
			`/* Scale factor`
			`*/`
			`if (np < &buf[NUMSIZE]) {`
			`*np++ = 'E';`
			`}`
			`LoadChar(ch);`
			`if (ch == '+' \|\| ch == '-') {`
			`/* Signed scalefactor`
			`*/`
			`if (np < &buf[NUMSIZE]) {`
			`*np++ = ch;`
			`}`
			`LoadChar(ch);`
			`}`
			`if (is_dig(ch)) {`
			`do {`
			`if (np < &buf[NUMSIZE]) {`
			`*np++ = ch;`
			`}`
			`LoadChar(ch);`
			`} while (is_dig(ch));`
			`}`
			`else {`
			`lexerror("bad scale factor");`
			`}`
			`}`

			`PushBack(ch);`

			`if (np == &buf[NUMSIZE + 1]) {`
			`lexerror("floating constant too long");`
			`tk->TOK_REL = Salloc("0.0", 5);`
			`}`
			`else {`
			`tk->TOK_REL = Salloc(buf, np - buf) + 1;`
			`}`
			`return tk->tk_symb = REAL;`

			`default:`
			`PushBack(ch);`
			`Sdec:`
			`*np++ = '\0';`
			`/* Type is an integer */`
			`tk->TOK_INT = str2long(&buf[1], 10);`
			`return tk->tk_symb = INTEGER;`
			`}`
			`/NOTREACHED/`
			`}`

			`case STEOI:`
some improvements 1986-03-24 17:29:57 +00:00			`return tk->tk_symb = -1;`
Initial version 1986-03-20 14:52:03 +00:00
			`case STCHAR:`
			`default:`
			`crash("bad character class %d", class(ch));`
			`}`
some improvements 1986-03-24 17:29:57 +00:00			`/NOTREACHED/`
Initial version 1986-03-20 14:52:03 +00:00			`}`