276 lines
		
	
	
	
		
			6.8 KiB
		
	
	
	
		
			Text
		
	
	
	
	
	
			
		
		
	
	
			276 lines
		
	
	
	
		
			6.8 KiB
		
	
	
	
		
			Text
		
	
	
	
	
	
.TL 
 | 
						|
A prototype Code expander
 | 
						|
.NH
 | 
						|
Introduction
 | 
						|
.PP
 | 
						|
A program to be compiled with ACK is first fed into the preprocessor.
 | 
						|
The output of the preprocessor goes into the appropriate front end,
 | 
						|
whose job it is to produce EM. The EM code generated is
 | 
						|
fed into the peephole optimizer, which scans it with a window of a few
 | 
						|
instructions, replacing certain inefficient code sequences by better
 | 
						|
ones. The peephole optimizer is followed by a backend which produces
 | 
						|
good assembly code. The assembly code goes into the assembler and the objectcode
 | 
						|
then goes into the loader/linker, the final component in the pipeline.
 | 
						|
.PP
 | 
						|
For various applications this scheme is too slow. For example for testing
 | 
						|
programs; in this case the program has to be translated fast and the
 | 
						|
runtime of the objectcode may be slower. A solution is to build a code
 | 
						|
expander (\fBce\fR) which translates EM code to objectcode. Of course this
 | 
						|
has to
 | 
						|
be done automatically by a code expander generator, but to get some feeling
 | 
						|
for the problem we started out to build prototypes. 
 | 
						|
We built two types of ce's. One which translated EM to assembly, one
 | 
						|
which translated EM to objectcode.
 | 
						|
.NH
 | 
						|
EM to assembly
 | 
						|
.PP
 | 
						|
We made one for the 8086 and one for the vax4. These ce's are instances of the
 | 
						|
EM_CODE(3L)-interface and produce for a single EM instruction a set 
 | 
						|
of assembly instructions which are semantically equivalent.
 | 
						|
We implemented push/pop-optimization in the 8086-ce.
 | 
						|
.NH
 | 
						|
EM to objectcode
 | 
						|
.PP
 | 
						|
Instead of producing assembly code we tried to produce vax4-objectcode.
 | 
						|
During execution of ce, ce builds in core a machine independent
 | 
						|
objectfile ( NEW A.OUT(5L)) and just before dumping the tables this
 | 
						|
objectfile is converted to a Berkeley 4.2BSD a.out-file. We built two versions;
 | 
						|
One with static memory allocation and one with dynamic memory allocation.
 | 
						|
If the first one runs out of memory it will give an error message and stop,
 | 
						|
the second one will allocate more memory and proceed with producing 
 | 
						|
objectcode.
 | 
						|
.PP
 | 
						|
The C-frontend calls the EM_CODE-interface. So after linking the frontend
 | 
						|
and the ce we have a pipeline in a program saving a lot of i/o.
 | 
						|
It is interesting to compare this C-compiler (called fcemcom) with "cc -c".
 | 
						|
fcemcom1 (the dynamic variant of fcemcom) is tuned in such a way, that
 | 
						|
alloc() won't be called.
 | 
						|
.NH 2
 | 
						|
Compile time
 | 
						|
.PP
 | 
						|
fac.c is a small program that produces n! (see below). foo.c is a small program
 | 
						|
that loops a lot.
 | 
						|
.TS
 | 
						|
center, box, tab(:);
 | 
						|
c | c | c | c | c | c
 | 
						|
c | c | n | n | n | n.
 | 
						|
compiler : program : real : user : sys : object size
 | 
						|
=
 | 
						|
fcemcom : sort.c : 31.0 : 17.5 : 1.8 : 23824
 | 
						|
fcemcom1 : : 59.0 : 21.2 : 3.3 : 
 | 
						|
cc -c : : 50.0 : 38.0 : 3.5 : 6788
 | 
						|
_
 | 
						|
fcemcom : ed.c : 37.0 : 23.6 : 2.3 : 41744
 | 
						|
fcemcom1 : : 1.16.0 : 28.3 : 4.6 : 
 | 
						|
cc -c : : 1.19.0 : 54.8 : 4.3 : 11108
 | 
						|
_
 | 
						|
fcemcom : cp.c :  4.0 : 2.4 : 0.8 : 4652
 | 
						|
fcemcom1 : : 9.0 : 3.0 : 1.0 : 
 | 
						|
cc -c : :  8.0 : 5.2 : 1.6 : 1048
 | 
						|
_
 | 
						|
fcemcom : uniq.c : 5.0 : 2.5 : 0.8 : 5568
 | 
						|
fcemcom1 : : 9.0 : 2.9 : 0.8 : 
 | 
						|
cc -c : : 13.0 : 5.4 : 2.0 : 3008
 | 
						|
_
 | 
						|
fcemcom : btlgrep.c : 24.0 : 7.2 : 1.4 : 12968
 | 
						|
fcemcom1 : : 23.0 : 8.1 : 1.2 : 
 | 
						|
cc -c : : 1.20.0 : 15.3 : 3.8 : 2392
 | 
						|
_
 | 
						|
fcemcom : fac.c : 1.0 : 0.1 : 0.5 : 216
 | 
						|
fcemcom1 : : 2.0 : 0.2 : 0.5 : 
 | 
						|
cc -c : : 3.0 : 0.7 : 1.3 : 92
 | 
						|
_
 | 
						|
fcemcom : foo.c : 4.0 : 0.2 : 0.5 : 272
 | 
						|
fcemcom1 : : 11.0 : 0.3 : 0.5 : 
 | 
						|
cc -c : : 7.0 : 0.8 : 1.6 : 108
 | 
						|
.TE
 | 
						|
.NH 2
 | 
						|
Run time
 | 
						|
.LP
 | 
						|
Is the runtime very bad?
 | 
						|
.TS
 | 
						|
tab(:), box, center;
 | 
						|
c | c | c | c | c
 | 
						|
c | c | n | n | n.
 | 
						|
compiler : program : real : user : system
 | 
						|
=
 | 
						|
fcem : sort.c : 22.0 : 17.5 : 1.5
 | 
						|
cc : : 5.0 : 2.4 : 1.1
 | 
						|
_
 | 
						|
fcem : btlgrep.c : 1.58.0 : 27.2 : 4.2
 | 
						|
cc : : 12.0 : 3.6 : 1.1
 | 
						|
_
 | 
						|
fcem : foo.c : 1.0 : 0.7 : 0.1
 | 
						|
cc : : 1.0 : 0.4 : 0.1
 | 
						|
_
 | 
						|
fcem : uniq.c : 2.0 : 0.5 : 0.3
 | 
						|
cc : : 1.0 : 0.1 : 0.2
 | 
						|
.TE
 | 
						|
.NH 2
 | 
						|
Quality of the object code
 | 
						|
.LP
 | 
						|
The runtime is very bad, so it's interesting to have a look at the code which is
 | 
						|
produced by fcemcom and by cc -c. I took a program which computes recursively
 | 
						|
n!.
 | 
						|
.DS
 | 
						|
long fac();
 | 
						|
 | 
						|
main()
 | 
						|
{
 | 
						|
	int n;
 | 
						|
 | 
						|
	scanf( "%D", &n); 
 | 
						|
	printf( "fac is %D\\\\n", fac( n));
 | 
						|
}
 | 
						|
 | 
						|
long fac( n)
 | 
						|
int n;
 | 
						|
{
 | 
						|
	if ( n == 0)
 | 
						|
		return( 1);
 | 
						|
	else
 | 
						|
		return( n * fac( n-1));
 | 
						|
}
 | 
						|
.DE
 | 
						|
.br
 | 
						|
.br
 | 
						|
.br
 | 
						|
.br
 | 
						|
.LP
 | 
						|
"cc -c fac.c" produces :
 | 
						|
.DS 
 | 
						|
fac:	tstl 4(ap)
 | 
						|
	bnequ 7f
 | 
						|
	movl $1, r0
 | 
						|
	ret
 | 
						|
7f:	subl3 $1, 4(ap), r0
 | 
						|
	pushl r0
 | 
						|
	call $1, fac
 | 
						|
	movl r0, -4(fp)
 | 
						|
	mull3 -4(fp), 4(ap), r0
 | 
						|
	ret
 | 
						|
.DE
 | 
						|
.br
 | 
						|
.br
 | 
						|
.LP
 | 
						|
"fcem fac.c fac.o" produces :
 | 
						|
.DS 
 | 
						|
_fac:		0
 | 
						|
42:		jmp	be
 | 
						|
48:		pushl	4(ap)
 | 
						|
4e:		pushl	$0
 | 
						|
54:		subl2	(sp)+,(sp)
 | 
						|
57:		tstl	(sp)+
 | 
						|
59:		bnequ	61
 | 
						|
5b:		jmp	67
 | 
						|
61:		jmp	79
 | 
						|
67:		pushl	$1
 | 
						|
6d:		jmp	ba
 | 
						|
73:		jmp	b9
 | 
						|
79:		pushl	4(ap)
 | 
						|
7f:		pushl	$1
 | 
						|
85:		subl2	(sp)+,(sp)
 | 
						|
88:		calls	$0,_fac
 | 
						|
8f:		addl2	$4,sp
 | 
						|
96:		pushl	r0
 | 
						|
98:		pushl	4(ap)
 | 
						|
9e:		pushl	$4
 | 
						|
a4:		pushl	$4
 | 
						|
aa:		jsb	.cii
 | 
						|
b0:		mull2	(sp)+,(sp)
 | 
						|
b3:		jmp	ba
 | 
						|
b9:		ret
 | 
						|
ba:		movl	(sp)+,r0
 | 
						|
bd:		ret
 | 
						|
be:		jmp	48
 | 
						|
.DE
 | 
						|
.NH 1
 | 
						|
Conclusions
 | 
						|
.PP
 | 
						|
Comparing "cc -c" with "fcemcom"
 | 
						|
.LP
 | 
						|
.TS
 | 
						|
center, box, tab(:);
 | 
						|
c | c  s | c | c  s
 | 
						|
^ | c  s | ^ | c  s
 | 
						|
^ | c | c | ^ | c | c
 | 
						|
l | n | n | n | n | n.
 | 
						|
program : compile time : object size : runtime
 | 
						|
:_::_
 | 
						|
: user : sys :: user : sys
 | 
						|
=
 | 
						|
sort.c : 0.47 : 0.5 : 3.5 : 7.3 : 1.4
 | 
						|
_
 | 
						|
ed.c : 0.46 : 0.5 : 3.8 : : :
 | 
						|
_
 | 
						|
cp.c : 0.46 : 0.5 : 4.4 : : :
 | 
						|
_
 | 
						|
uniq.c : 0.46 : 0.4 : 1.8 : : :
 | 
						|
_
 | 
						|
btlgrep.c : 0.47 : 0.3 : 5.4 : 7.5 : 3.8
 | 
						|
_
 | 
						|
fac.c : 0.14 : 0.4 : 2.3 : 1.8 : 1.0
 | 
						|
_
 | 
						|
foo.c : 0.25 : 0.3 : 2.5 : 5.0 : 1.5
 | 
						|
.TE
 | 
						|
.PP
 | 
						|
The results for fcemcom1 are almost identical; the only thing that changes
 | 
						|
is that fcemcom1 is 1.2 times slower than fcemcom (compile time). This is due to
 | 
						|
a different data structure. In the static version we use huge arrays for
 | 
						|
the text- and 
 | 
						|
data-segment, the relocation information, the symboltable and stringarea.
 | 
						|
In the dynamic version we use linked lists, which makes it expensive to get
 | 
						|
and to put a byte on an arbitrary memory location. So it is probably better
 | 
						|
to use realloc(), because in most cases there will be enough memory.
 | 
						|
.PP
 | 
						|
The quality of the objectcode is very bad. The reason is that the frontend
 | 
						|
generates bad code and expects the peephole-optimizer to improve the code.
 | 
						|
This is also one of the main reasons that the runtime is very bad.
 | 
						|
(e.g. the expensive "cii" with arguments 4 and 4 could be deleted.) 
 | 
						|
So it seems a good
 | 
						|
idea to put a new peephole-optimizer between the frontend and the ce.
 | 
						|
.PP
 | 
						|
Using the peephole optimizer the ce would produce :
 | 
						|
.DS
 | 
						|
_fac:	0
 | 
						|
	pushl	4(ap)
 | 
						|
	tstl	(sp)+
 | 
						|
	beqlu	1f
 | 
						|
	jmp	3f
 | 
						|
 1 :	pushl	$1
 | 
						|
	jmp	2f
 | 
						|
 3 :	pushl	4(ap)
 | 
						|
	decl	(sp)
 | 
						|
	calls	$0,_fac
 | 
						|
	addl2	$4,sp
 | 
						|
	pushl	r0
 | 
						|
	pushl	4(ap)
 | 
						|
	mull2	(sp)+,(sp)
 | 
						|
	movl	(sp)+,r0
 | 
						|
  2 :   ret
 | 
						|
.DE
 | 
						|
.PP
 | 
						|
Bruce McKenzy already implemented it and made some improvements in the
 | 
						|
source code of the ce. The compile-time is two to two and a half times better 
 | 
						|
and the
 | 
						|
size of the objectcode is two to three times bigger (compared with "cc -c").
 | 
						|
Still we could do better.
 | 
						|
.PP
 | 
						|
Using peephole- and push/pop-optimization ce could produce :
 | 
						|
.DS 
 | 
						|
_fac:		0
 | 
						|
	tstl	4(ap)
 | 
						|
	beqlu	1f
 | 
						|
	jmp	2f
 | 
						|
  1 :	pushl	$1
 | 
						|
	jmp	3f
 | 
						|
  2 :	decl	4(ap)
 | 
						|
	calls	$0,_fac
 | 
						|
	addl2	$4,sp
 | 
						|
	mull3	4(ap), r0, -(sp)
 | 
						|
	movl 	(sp)+, r0
 | 
						|
  3 : 	ret
 | 
						|
.DE
 | 
						|
.PP
 | 
						|
prof doesn't cooperate, so no profile information.
 | 
						|
.PP
 |