284 lines
		
	
	
	
		
			9.3 KiB
		
	
	
	
		
			Text
		
	
	
	
	
	
			
		
		
	
	
			284 lines
		
	
	
	
		
			9.3 KiB
		
	
	
	
		
			Text
		
	
	
	
	
	
| .TL
 | |
| 
 | |
| Code Expander
 | |
| .br
 | |
| (proposal)
 | |
| 
 | |
| .SH
 | |
| Introduction
 | |
| .LP
 | |
| The \fBcode expander\fR, \fBce\fR, is a program that translates EM-code to
 | |
| objectcode. The main goal is to translate very fast. \fBce\fR is an instance
 | |
| of the EM_CODE(3L)-interface. During execution of \fBce\fR, \fBce\fR will build
 | |
| in core a machine independent objectfile ( NEW A.OUT(5L)). With \fBcv\fR or 
 | |
| with routines supplied by the user the machine independent objectcode will
 | |
| be converted to a machine dependent object code. \fBce\fR needs 
 | |
| information about the target machine (e.g. the opcodes). We divide the
 | |
| information into two parts:
 | |
| .IP
 | |
| - The description in assembly instructions of EM-code instructions.
 | |
| .IP
 | |
| - The description in objectcode of assembly instructions.
 | |
| .LP
 | |
| With these two tables we can make a \fBcode expander generator\fR which
 | |
| generates a \fBce\fR. It is possible to put the information in one table
 | |
| but that will probably introduce more bugs in the table. So we 
 | |
| divide and conquer.  With this approach it is also possible to generate 
 | |
| assembly code (rather than objectcode), which is useful for debugging.
 | |
| There is of course a link between the two tables, the link 
 | |
| consists of a restriction on the assembly format. Every assembly
 | |
| instruction must have the following format:
 | |
| .sp
 | |
| 	INSTR ::= LABEL : MNEMONIC  [ OPERAND ( "," OPERAND)* ]
 | |
| .sp
 | |
| .LP
 | |
| \fBCeg\fR uses the following algorithm:
 | |
| .IP \0\0a)
 | |
| The assembly table will be converted to a (C-)routine assemble().
 | |
| assemble() gets as argument a string, the assembler instruction,
 | |
| and can use the MNEMONIC to execute the corresponding action in the 
 | |
| assembly table.
 | |
| .IP \0\0b)
 | |
| The routine assemble() can now be used to convert the EM-code table to
 | |
| a set of C-routines, which together form an instance of the
 | |
| EM_CODE(3L).
 | |
| .SH
 | |
| The EM-instruction table
 | |
| .LP
 | |
| We use the following grammar:
 | |
| .sp
 | |
| .TS
 | |
| center box ;
 | |
| l.
 | |
| TABLE ::= (ROW)*
 | |
| ROW   ::= C_instr ( SPECIAL | SIMPLE)
 | |
| SPECIAL ::= ( CONDITION SIMPLE)+  'default'  SIMPLE
 | |
| SIMPLE ::= '==>' ACTIONLIST | '::=' ACTIONLIST
 | |
| ACTIONLIST ::= [ ACTION ( ';' ACTION)* ] '.'
 | |
| ACTION ::= function-call | assembly-instruction
 | |
| .TE
 | |
| .LP
 | |
| An example for the 8086:
 | |
| .LP
 | |
| .DS
 | |
| C_lxl
 | |
| 	$arg1 == 0  ==>  "push bp".
 | |
| 	$arg1 == 1  ==>  "push EM_BSIZE(bp)".
 | |
|         default     ==>  "mov cx, $arg1";
 | |
| 		         "mov si, bp";
 | |
| 		         "1: mov si, EM_BSIZE(si)";
 | |
| 		         "loop 1b";
 | |
| 		         "push si".
 | |
| .DE
 | |
| .sp
 | |
| Some remarks:
 | |
| .sp
 | |
| * The C_instr is a function identifier in the EM_CODE(3L)-interface.
 | |
| .LP
 | |
| * CONDITION is a "boolean" C-expression. 
 | |
| .LP
 | |
| * The arguments of an EM-instruction can be used in CONDITION and in assembly
 | |
| instructions. They are referred to by $arg\fIi\fR. \fBceg\fR modifies the 
 | |
| arguments as follows:
 | |
| .IP \0\0-
 | |
| For local variables at positive offsets it increases this offset by EM_BSIZE
 | |
| .IP \0\0-
 | |
| It makes names and labels unique. The user must supply the formats (see mach.h).
 | |
| .LP
 | |
| * function-call is allowed to implement e.g. push/pop optimization.
 | |
| For example:
 | |
| .LP
 | |
| .DS
 | |
| C_adi   
 | |
| 	$arg1 == 2   ==> combine( "pop ax");
 | |
| 		 	 combine( "pop bx");
 | |
| 		 	 "add ax, bx";
 | |
|                          save( "push ax").
 | |
|         default      ==> arg_error( "C_adi", $arg1).
 | |
| .DE
 | |
| .LP
 | |
| * The C-functions called in the EM-instructions table have to use the routine
 | |
| assemble()/gen?(). "assembler-instr" is in fact assemble( "assembler-instr").
 | |
| .LP
 | |
| * \fBceg\fR takes care not only about the conversions of arguments but also 
 | |
| about
 | |
| changes between segments. There are situations when one doesn't want 
 | |
| conversion of arguments. This can be done by using ::= instead of ==>.
 | |
| This is useful when two C_instr are equivalent. For example:
 | |
| .IP
 | |
| C_slu  ::=  C_sli( $arg1)
 | |
| .LP
 | |
| * There are EM-CODE instructions which are machine independent (e.g. C_open()).
 | |
| For these EM_CODE instructions \fBceg\fR will generate \fIdefault\fR-
 | |
| instructions. There is one exception: in the case of C_pro() the tablewriter 
 | |
| has to supply a function prolog().
 | |
| .LP
 | |
| * Also the EM-pseudoinstructions C_bss_\fIcstp\fR(), C_hol_\fIcstp\fR(),
 | |
| C_con_\fIcstp\fR() and C_rom_\fIcstp\fR() can be translated automatically.
 | |
| \fBceg\fR only has to know how to interpret string-constants: 
 | |
| .DS
 | |
| \&..icon  $arg2 == 1  ==>  gen1( (char) atoi( $arg1))
 | |
|           $arg2 == 2  ==>  gen2( atoi( $arg1))
 | |
|           $arg2 == 4  ==>  gen4( atol( $arg1))
 | |
| \&..ucon  $arg2 == 1  ==>  gen1( (char) atoi( $arg1))
 | |
| 	  $arg2 == 2  ==>  gen2( atoi( $arg1))
 | |
|     	  $arg2 == 4  ==>  gen4( atol( $arg1))
 | |
| \&..fcon  ::=  not_implemented( "..fcon")
 | |
| .DE
 | |
| .LP
 | |
| * Still, life can be made easier for the tablewriter; for the routines which 
 | |
| he/she didn't implement \fBceg\fR will generate a default instruction which
 | |
| generates an error-message. \fBceg\fR seems to generate :
 | |
| .IP
 | |
| C_xxx  ::=  not_implemented( "C_xxx")
 | |
| .SH
 | |
| The assembly table
 | |
| .LP
 | |
| How to map assembly on objectcode.
 | |
| .LP
 | |
| Each row in the table consists of two fields, one field for the assembly
 | |
| instruction, the other field for the corresponding objectcode. The tablewriter
 | |
| can use the following primitives to generate code for the machine
 | |
| instructions :
 | |
| .IP "\0\0gen1( b)\0\0:" 17
 | |
| generates one byte in the machine independent objectfile.
 | |
| .IP "\0\0gen2( w)\0\0:" 17
 | |
| generates one word ( = two bytes), the table writer can change the byte
 | |
| order by setting the flag BYTES_REVERSED.
 | |
| .IP "\0\0gen4( l)\0\0:" 17
 | |
| generates two words ( = four bytes), the table writer can change the word
 | |
| order by setting the flag WORDS_REVERSED.
 | |
| .IP "\0\0reloc( n, o, r)\0\0:" 17
 | |
| generates relocation information for a label ( = name + offset +
 | |
| relocationtype).
 | |
| .LP
 | |
| Besides these primitives the table writer may use self-written
 | |
| C-functions. This allows the table writer e.g. to write functions to set
 | |
| bitfields within a byte.
 | |
| .LP
 | |
| There are more or less two methods to encode the assembly instructions:
 | |
| .IP \0\0a)
 | |
| MNEMONIC and OPERAND('s) are encoded independently of each other. This can be
 | |
| done when the target machine has an orthogonal instruction set (e.g. pdp-11).
 | |
| .IP \0\0b)
 | |
| MNEMONIC and OPERAND('s) together determine the opcode. In this case the
 | |
| assembler often uses overloading: one MNEMONIC is used for several
 | |
| different machine-instructions. For example : (8086)
 | |
| .br
 | |
| 	mov ax, bx
 | |
| .br
 | |
| 	mov ax, variable
 | |
| .br
 | |
| These instructions have different opcodes.
 | |
| .LP
 | |
| As the transformation MNEMONIC-OPCODE is not one to
 | |
| one the table writer must be allowed to put restrictions on the operands.
 | |
| This can be done with type declarations. For example:
 | |
| .LP
 | |
| .DS
 | |
| 	mov  dst:REG, src:MEM  ==>
 | |
| 		gen1( 0x8b);
 | |
| 		modRM( op2.reg, op1);
 | |
| .DE
 | |
| .DS
 | |
| 	mov  dst:REG, src:REG  ==>
 | |
| 		gen1( 0x89);
 | |
| 		modRM( op2.reg, op1);
 | |
| .DE
 | |
| .LP
 | |
| modRM() is a function written by the tablewriter and is used to encode
 | |
| the operands. This frees the table writer of endless typing.
 | |
| .LP
 | |
| The table writer has to do the "typechecking" by himself. But typechecking
 | |
| is almost the same as operand decoding. So it's more efficient to do this
 | |
| in one function. We now have all the tools to describe the function
 | |
| assemble(). 
 | |
| .IP
 | |
| assemble() first calls the function
 | |
| decode_operand() (written by the table writer), with two arguments: a 
 | |
| string ( the operand) and a
 | |
| pointer to a struct. The struct is declared by the table writer and must
 | |
| consist of at least a field called type. ( the other fields in the struct can
 | |
| be used to remember information about the decoded operand.) Now assemble()
 | |
| fires a row which is selected by mapping the MNEMONIC and the type of the 
 | |
| operands. 
 | |
| .br
 | |
| In the second field of a row there may be references to other
 | |
| fields in the struct (e.g. op2.reg in the example above).
 | |
| .LP
 | |
| We ignored one problem. It's possible when the operands are encoded, that
 | |
| not everything is known. For example $arg\fIi\fR arguments in the
 | |
| EM-instruction table get their value at runtime. This problem is solved by
 | |
| introducing a function eval(). eval() has a string as argument and returns
 | |
| an arith. The string consists of constants and/or $arg\fIi\fR's and the value
 | |
| returned by eval() is the value of the string. To encode the $arg\fIi\fR's
 | |
| in as few bytes as possible the table writer can use the statements %if,
 | |
| %else and %endif. They can be used in the same manner as #if, #else and
 | |
| #endif in C and result in a runtime test. An example : 
 | |
| .LP
 | |
| .DS
 | |
|  -- Some rows of the assembly table
 | |
|  
 | |
|  mov dst:REG, src:DATA  ==>
 | |
|         %if  sfit( eval( src), 8)   /* does the immediate-data fit in 1 byte? */
 | |
|  		R53( 0x16 , op1.reg);
 | |
|  	     	gen1( eval( src));
 | |
|         %else
 | |
|  	      	R53( 0x17 , op1.reg);
 | |
|  	      	gen2( eval( src));
 | |
|         %endif
 | |
| .LD
 | |
|  
 | |
|  mov dst:REG, src:REG  ==>
 | |
|         gen1( 0x8b);
 | |
|         modRM( op1.reg, op2);
 | |
|  
 | |
| .DE 
 | |
| .DS
 | |
|  -- The corresponding part in the function assemble() :
 | |
|  
 | |
|  case MNEM_mov : 
 | |
|  		decode_operand( arg1, &op1);
 | |
|  		decode_operand( arg2, &op2);
 | |
|  		if ( REG( op1.type) && DATA( op2.type)) {
 | |
|  			printf( "if ( sfit( %s, 8)) {\\\\n", eval( src));
 | |
|  			R53( 0x16 , op1.reg);
 | |
|  			printf( "gen1( %s)\\\\n", eval( arg2));
 | |
|  			printf( "}\\\\nelse {\\\\n");
 | |
|  			R53( 0x17 , op1.reg);
 | |
|  			printf( "gen2( %s)\\\\n", eval( arg2));
 | |
|  			printf( "}\\\\n");
 | |
|  		}
 | |
|  		else if ( REG( op1.type) && REG( op2.type)) {
 | |
|  			gen1( 0x8b);
 | |
|  			modRM( op1.reg, op2);
 | |
|  		}
 | |
|  
 | |
|  
 | |
| .DE
 | |
| .DS
 | |
|  -- Some rows of the right part of the EM-instruction table are translated
 | |
|  -- in the following C-functions.
 | |
| 
 | |
|  "mov ax, $arg1" ==>
 | |
|  	if ( sfit( w, 8)) {	/* w is the actual argument of C_xxx( w) */
 | |
|  		gen1( 176);	/* R53() */
 | |
|  		gen1( w);
 | |
|  	}
 | |
|  	else {
 | |
|  		gen1( 184);
 | |
|  		gen2( w);
 | |
|  	}
 | |
| .LD 
 | |
| 
 | |
|  "mov ax, bx"    ==> 
 | |
|  	gen1( 138);
 | |
|   	gen1( 99);		/* modRM() */
 | |
| .DE
 | |
| .SH
 | |
| Restrictions
 | |
| .LP
 | |
| .IP \0\01)
 | |
| The EM-instruction C_exc() is not implemented.
 | |
| .IP \0\02)
 | |
| All messages are ignored.
 |