2
\$\begingroup\$

Starting from this answer I thought I would show a fully worked example of how to create a data-driven version of the assembler for the Hack assembly language. As I noted in that answer, having things data driven also provides the opportunity to create a disassembler using the same data structure. This code implements both. I had intended to create some test code in C++ to exercise parts of this, but ran out of time to do so, so there may be some remaining bugs. This is the reason that some of the functions are declared as global and begin with an underscore. Here's a sample input file:

source.hack

 @16
 M=1
 @17
 M=0
 @16
 D=M
 @0
 D=D-M
 @18
 D;JGT
 @16
 D=M
 @17
 M=D+M
 @16
 M=M+1
 @4
 0;JMP
 @17
 D=M

Example of use:

hack input.hack > input.bin
hack input.bin d > reconstructed.hack

Note that to specify either operation (assemble/disassemble) we need to supply a file name. To specify disassembly, an additional command line argument is given. The contents of it are not used; just the presence of an additional argument is what triggers disassembly.

It also makes no effort to diagnose or even detect malformed or invalid input lines.

hack.asm

BUFFSIZE equ 16384
; some character constants
NEWLINE equ 10
SPACE equ 32
TAB equ 9
; some syscall constants
SYSREAD equ 0
SYSWRITE equ 1
SYSOPEN equ 2
; default file handles
STDIN equ 0
STDOUT equ 1
STDERR equ 2
; exit takes an exit error code
%macro exit 1
 mov edi, %1
 mov eax, 60
 syscall
%endmacro
; openfile takes an ASCIIZ filename
%macro openfile 1
 mov rdi, %1
 xor esi, esi ; read only
 xor edx, edx ; no create
 mov eax, SYSOPEN
 syscall
%endmacro
; readfile takes a file descriptor, buffer pointer, and buffer size
%macro readfile 3
 mov rdi, %1
 mov rsi, %2
 mov rdx, %3
 mov eax, SYSREAD
 syscall
%endmacro
; print takes the file handle, a pointer to the string, and a length
%macro print 3
 mov rdi, %1 ; print to passed file handle
 mov rsi, %2 ; pointer to buffer
 mov rdx, %3 ; length of buffer
 mov eax, SYSWRITE
 syscall
%endmacro
; takes mask, and pointer to table as arguments
; ENTRY:
; rax : current opcode to be decoded
; rdi : pointer to current output
; EXIT:
; rdi : updated current output pointer
; TRASHED:
; rbx
; rcx
; rsi
; r10
%macro decodeOpcodePart 2
 mov eax, ebx
 and eax, %1
 mov r10, %2
 call _reverseTableLookup
 jc %%nextpart
 xor rcx,rcx
 mov cx, word [r10 + Operation.strlen]
 lea esi, [r10 + Operation.string]
 rep movsb
 %%nextpart:
%endmacro
; takes mask, and pointer to table as arguments
; ENTRY:
; rax : current partially encoded opcode
; EXIT:
; rax : updated opcode
; TRASHED:
; rdi
%macro encodeOpcodePart 1
 mov edi, %1
 call _tableLookup
 or ax, word[edi + Operation.value]
%endmacro
 
 
section .bss
 buffer resb BUFFSIZE
 outbuf resw BUFFSIZE/2
 line resb 10
section .data
 err_no_args db "Error: no input filename given on command line", 10
 err_no_args_len equ $ - err_no_args
 err_open db "Error: could not open file",10
 err_open_len equ $ - err_open
 err_read db "Error: could not read from file", 10
 err_read_len equ $ - err_read
 err_zero db "Error: read zero words from file", 10
 err_zero_len equ $ - err_zero
 err_write db "Error: could not write to output",10
 err_write_len equ $ - err_write
section .text
 global _start
 global _tableLookup
 global _stringToNumber
 ; look up a string token in a table and return the corresponding value
 ;
 ; ENTRY:
 ; esi : points to strz token
 ; edi : points to Operation table
 ; EXIT:
 ; cy : set if not found in table
 ; edi : points to matching entry if found
 ; esi : points to one char after matched token if found
 ; TRASHED:
 ; rbx
 _tableLookup:
 mov rbx, rsi ; save original pointer to token
 .looptop:
 movzx ecx, word [edi + Operation.strlen]
 push rdi
 repe cmpsb
 pop rdi
 je .found
 add edi, Operation_size
 cmp word [edi + Operation.strlen],0
 mov rsi,rbx
 jne .looptop
 stc
 .found:
 ret
 ; look up a a value and construct the corresponding string
 ;
 ; ENTRY:
 ; eax : binary value to search for
 ; r10 : points to Operation table
 ; EXIT:
 ; cy : set if not found in table
 ; r10 : points to matching entry if found
 ; TRASHED:
 _reverseTableLookup:
 .looptop:
 cmp ax, word [r10 + Operation.value]
 je .found
 add r10, Operation_size
 cmp word [r10 + Operation.strlen],0
 jne .looptop
 stc
 .found:
 ret
 ; given a pointer to a decimal number string, convert to the value in eax
 ; ENTRY:
 ; esi : points to strz decimal number string
 ; EXIT:
 ; eax : contains converted value
 ; esi : points to one past ending character
 ; TRASHED:
 ; ebx, ecx
 _stringToNumber:
 push rbx
 push rcx
 xor eax, eax
 xor ebx, ebx
 xor ecx, ecx
 inc ecx ; default multiplier is +1
 lodsb
 cmp al, '-' ; is it a leading - sign?
 jnz .numeric
 dec ecx
 dec ecx
 .numeric:
 sub al, '0'
 jb .done
 cmp al, 10
 jnb .done
 imul ebx, 10
 add ebx, eax
 lodsb
 jmp .numeric
 .done:
 imul ebx, ecx
 mov eax, ebx
 pop rcx
 pop rbx
 ret
 ; given a 16-bit number in ax, convert to string in line
 ; ENTRY:
 ; rax : number to convert (high 48 bits must be zero)
 ; rdi : points to destination buffer
 ; 
 ; EXIT:
 ; rdi : points to one past end of written string
 ; TRASHES:
 ; rbx
 ; 
 _numberToString:
 push rsi
 push rcx
 mov rsi, rdi
 mov bx,10
 add rdi,5
 .loopy:
 dec rdi
 xor edx,edx
 div bx
 add dl, '0'
 mov byte [rdi],dl
 test rax,rax
 jnz .loopy
 ; compute 5 - (rdi - rsi) = rsi - rdi + 5 = rsi + 5 - rdi
 lea rcx, [rsi + 5]
 sub rcx, rdi
 xchg rsi, rdi
 rep movsb
 pop rcx
 pop rsi
 ret
 ; copy just one assembly line from source to dest buffer, omitting whitespace
 ; ENTRY:
 ; esi : pointer to source
 ; edi : pointer to destination
 ; eax : number of bytes in source
 ; EXIT:
 ; esi : points to start of next line in source (if any)
 ; edi : points one past end of copied line
 ; eax : bytes remaining in source
 ; TRASHED:
 ; none
 getline:
 push rcx
 mov ecx, eax
 .top:
 lodsb
 cmp al, NEWLINE
 jz .done
 cmp al, SPACE ; skip spaces
 jz .skipws
 cmp al, TAB ; skip tabs, too
 jz .skipws
 stosb ; neither of those, so store it
 .skipws:
 loop .top
 jmp .exit
 .done:
 xor eax,eax ; write terminating NUL char
 stosb
 dec ecx
 .exit:
 mov eax, ecx
 pop rcx
 ret
 _start:
 mov rax, [rsp] ; load argc into rax
 mov edx, err_no_args
 mov ebx, err_no_args_len
 cmp eax, 2 ; have to have at least one argument
 js error_exit
 openfile [rsp+16] ; try opening the file named in the argument
 mov edx, err_open
 mov ebx, err_open_len
 cmp eax,0
 js error_exit
 readfile rax, buffer, BUFFSIZE ; read the entire file into memory
 mov edx, err_read
 mov ebx, err_read_len
 cmp eax,0
 js error_exit
 je no_bytes
 main:
 mov rdi, outbuf ; rdi points to current output buffer
 cmp qword [rsp], 3 ; if there are two arguments, we are disassembling
 jne assemble
 mov ecx, eax
 shr rcx, 1 ; convert from num bytes to num words
 jz no_bytes
 disassemble:
 ; ecx = number of remaining bytes in source
 ; esi = pointer to source buffer
 ; rdi = current pointer to output buffer
 lodsw ; get the next word to disassemble
 push rsi
 push rcx ; remember updated count and pointer
 test ah, 0x80 ; is this a type A instruction?
 jnz .decode_c
 mov byte [rdi], '@' ; it's a type A which is "@nnnn"
 inc rdi
 call _numberToString
 jmp .emitLine
 ; it's a type C instruction of the form "dest=comp;jmp"
 .decode_c:
 mov ebx, eax ; save opcode in ebx
 decodeOpcodePart DESTMASK, hackdest
 decodeOpcodePart COMPMASK, hackcomp
 decodeOpcodePart JUMPMASK, hackjump
 ; at this point we expect rcx (input len) and rsi (input ptr) on the stack,
 ; rdi : points to current end of output buffer
 .emitLine:
 pop rcx
 pop rsi
 mov al,NEWLINE
 stosb
 dec rcx
 jnz disassemble
 jmp write_output
 assemble:
 ; eax = number of remaining bytes in source
 ; esi = pointer to source buffer
 ; rdi = current pointer to output buffer
 mov r10, rdi ; temporarily save output pointer
 mov edi, line
 call getline ; copy a line, omitting whitespace
 push rax
 push rsi
 mov rsi, line
 cmp byte [rsi], '@' ; is this a type A instruction?
 jnz .c_instruction ; if not, assume it's type C
 inc rsi
 call _stringToNumber
 jmp .storeOpcode
 .c_instruction:
 ; it's a C instruction
 ; rsi is already pointing to line
 xor eax, eax
 encodeOpcodePart hackdest
 encodeOpcodePart hackcomp
 encodeOpcodePart hackjump
 .storeOpcode:
 mov rdi, r10 ; restore output pointer
 stosw
 pop rsi
 pop rax
 test rax,rax
 jne assemble
 write_output:
 ; rdi = current pointer to output buffer
 mov rdx, rdi
 sub rdx, outbuf
 print STDOUT, outbuf, rdx
 mov edx, err_write
 mov ebx, err_write_len
 cmp eax,0
 js error_exit
 exit 0
 no_bytes:
 mov edx, err_zero ; if the file was empty, it's an error
 mov ebx, err_zero_len
 ; fall through to error_exit
 error_exit:
 print STDERR, rdx, rbx
 exit 1
section .data
struc Operation
 .string: resb 8
 .strlen: resw 1
 .value: resw 1
endstruc
; opcode "string", value
%macro opcode 2
 istruc Operation
 at Operation.string, db %1
 %strlen len %1
 %rep 8 - len
 db 0
 %endrep
 at Operation.strlen, dw len
 at Operation.value, dw %2
 iend
%endmacro
DESTMASK equ 0x38
COMPMASK equ 0xffc0
JUMPMASK equ 0x07
hackdest:
 opcode "AMD=", 0x38
 opcode "AD=", 0x30
 opcode "AM=", 0x28
 opcode "MD=", 0x18
 opcode "M=", 0x08
 opcode "D=", 0x10
 opcode "A=", 0x20
 opcode "", 0
hackcomp:
 opcode "D+1", 0xe7c0
 opcode "A+1", 0xedc0
 opcode "M+1", 0xfdc0
 opcode "D-1", 0xe380
 opcode "A-1", 0xec80
 opcode "M-1", 0xfc80
 opcode "D+A", 0xe080
 opcode "D+M", 0xf080
 opcode "D-A", 0xe4c0
 opcode "D-M", 0xf4c0
 opcode "A-D", 0xe1c0
 opcode "M-D", 0xf1c0
 opcode "D&A", 0xe000
 opcode "D&M", 0xf000
 opcode "D|A", 0xe540
 opcode "D|M", 0xf540
 opcode "-1", 0xee80
 opcode "!D", 0xe340
 opcode "!A", 0xec40
 opcode "!M", 0xfc40
 opcode "-D", 0xe3c0
 opcode "-A", 0xecc0
 opcode "-M", 0xfcc0
 opcode "0", 0xea80
 opcode "1", 0xefc0
 opcode "D", 0xe300
 opcode "A", 0xec00
 opcode "M", 0xfc00
 opcode "", 0
hackjump:
 opcode ";JGT", 0x1
 opcode ";JEQ", 0x2
 opcode ";JGE", 0x3
 opcode ";JLT", 0x4
 opcode ";JNE", 0x5
 opcode ";JLE", 0x6
 opcode ";JMP", 0x7
 opcode "", 0
```
asked Jun 27, 2022 at 16:09
\$\endgroup\$
2
  • \$\begingroup\$ This is for Linux, I assume based on the syscall numbers. It's weird that you'd prefix your symbol names with _; ELF systems don't do that. e.g. C main's asm symbol name is main. (_start is part of the implementation and thus uses a reserved name starting with a leading underscore to avoid polluting the global namespace.) \$\endgroup\$ Commented Jul 1, 2022 at 0:50
  • \$\begingroup\$ Also I'm guessing a Linux non-PIE executable with static data since you use 32-bit addressing modes like [edi + Operation.strlen] instead of just requiring your caller to pass a 64-bit pointer as a starting point. Dynamic or stack memory will be outside the low 32 bits of virtual address space, unless you built this for the x32 ABI, ILP32 in long mode. You use 64-bit r10 in another function, rather than 32-bit r10d, so that instruction only needs a REX prefix, not also a 67 operand-size override to truncate the address. \$\endgroup\$ Commented Jul 1, 2022 at 1:00

1 Answer 1

1
\$\begingroup\$

Commenting is good

There are a number of inaccuracies though in the documentation, several of these are probably related to copy-pasting:

  • In the decodeOpcodePart macro the current opcode to be decoded is in RBX (so not in RAX), and it is RAX that gets trashed (so not RBX).
  • The encodeOpcodePart macro mentions that it 'takes mask, and pointer to table as arguments' where in fact there's but a single pointer argument.
  • The _tableLookup subroutine forgets to mention that it additionally trashes RCX.
  • The _stringToNumber subroutine mentions that RBX and RCX get trashed where in fact both these registers are preserved on the stack.
  • The _numberToString subroutine mentions that on exit RDI 'points to one past end of written string' where in fact RDI points just past the end of the written string (so not 'one past' like it is the case in the _stringToNumber subroutine).
  • The _numberToString subroutine forgets to mention that it additionally trashes RAX and RDX.
  • At the disassemble label ECX holds the number of remaining words in source (so not bytes).

For ease of reviewing, the TRASHED displays of the decode and encode macros should include the registers that get trashed in the subroutines that they call. Same goes for the ENTRY and EXIT displays.

Avoiding the use of "magic numbers"

In your answer to the 'rags' question you advocate to avoid the use of "magic numbers", but in adding the missing definition for exit, you introduced your own magic number mov eax, 60. Perhaps use SYSTERM equ 60.

Fix the bug

In the _stringToNumber subroutine you negate your final multiplier (+1 -> -1) if a minus character was found in the text. However you forget to load the character that follows the minus. That has to be the first digit that the .numeric loop processes.

Missed optimizations

  • When, in the _stringToNumber subroutine you negate your final multiplier (+1 -> -1), you use dec ecx dec ecx which takes 4 bytes, whereas a single neg ecx would do it in 2 bytes.
  • When, in the _stringToNumber subroutine you validate and convert a digit, you can safely remove the jb .done instruction. The byte wraparound will take care of it in the cmp al, 10 that immediately follows.
  • All occurencies of cmp eax, 0 can be written as test eax, eax which has a 1 byte shorter encoding, at least if you assemble with -Ox. Without optimizations it would be a lot worse.
  • Because the high dword of RCX is zeroed, we can replace the 3-byte shr rcx, 1 instruction by the 2-byte shr ecx, 1 instruction.
  • It is 1 byte shorter to write test ah, ah js .decode_c instead of test ah, 0x80 jnz .decode_c.
  • It is 4 bytes shorter to temporarily save the output pointer via push rdi ... pop rdi than to use mov r10, rdi ... mov rdi, r10.
  • If mov edi, line works fine, then mov rsi, line can safely become mov esi, line. This shaves off the REX prefix and opens up a world of similar REX-related optimizations that sometimes you do and sometimes you don't. I can't quite decide whether you care about codesize, execution speed, both, or none.

Use efficient instructions

Intel advices against using complex instructions like loop. In _getline you could replace loop .top with dec ecx jnz .top.

Think carefully about jumps

In your answer to the 'rags' question you warn about jumps being a relatively costly operation. Right so, but in the _stringToNumber subroutine you commit the same crime.
Next is my optimized rewrite of this conversion:

; given a pointer to a decimal number string, convert to the value in rax
; ENTRY:
; rsi : points to strz decimal number string
; EXIT:
; rax : contains converted value
; rsi : points to past ending character
; TRASHED:
; rdx
_stringToNumber:
 xor eax, eax
 jmp .first ; A one-time jump
.digit:
 inc esi
 lea eax, [rax + rax * 4] ; (Number * 5) * 2 + Newdigit
 lea eax, [rdx + rax * 2]
.first:
 movzx edx, byte [rsi]
 sub edx, '0'
 cmp dl, 10
 jb .digit ; The only jump in this tight loop
 ret

I verified that it is ok for the caller of this subroutine to have RDX trashed.
I removed support for the minus character based on the following quotes from the documentation:

... decimal (0-9) digits may be used to represent a non-negative, decimal constant in the range 0 through 32,767. The use of the minus sign to indicate a negative number is not allowed.

The A-instruction ... is the only means to introduce a (non-negative) numeric value into the computer under program control;

Avoid complicating matters

The setup that you do for rep movsb in _numberToString seems convoluted.
I could simplify the code mostly by anticipating (before the loop) the upcoming rep movsb that will require RCX, RSI, and RDI been setup. No need for an exchange between RSI and RDI, nor another addition of 5 to calculate RCX.

; given a 16-bit number in ax, convert to string in line
; ENTRY:
; eax : number to convert (high word is empty)
; rdi : points to destination buffer
; EXIT:
; rdi : points to past end of written string
; TRASHES:
; rax, rbx, rcx, rdx, rsi
_numberToString:
 lea esi, [rdi + 5] ; Past end of the 5-byte buffer
 mov ecx, esi
 mov ebx, 10
.loopy:
 dec esi
 xor edx, edx
 div ebx
 add edx, '0'
 mov [rsi], dl
 test eax, eax
 jnz .loopy
 sub ecx, esi
 rep movsb
 ret

Your use of test rax, rax made you insist on 'high 48 bits must be zero'. Writing test eax, eax not only shortens the code but also relaxes this condition to bits 16-31.
I verified that it is ok for the caller of this subroutine to have RAX, RBX, RCX, RDX, and RSI trashed.

Use a data-driven structure

I very much like what you did with this: the Operation struc and the opcode macro.
But seeing that no mnemonic string has more than 4 characters, why did you choose for a string of 8 instead of 4? Couldn't the program benefit from the increased data density? I rewrote the entire program applying everything from this review (and more that I didn't feel like posting about), and I got a 25% reduction of the .data section and a 19% reduction of the .code section.

Who is right and who is wrong?

The nand2tetris document that you link to uses AMD= and MD=, but the wikipedia page that I consulted uses ADM= and DM=. Whom shall I trust?

answered Jul 6, 2022 at 19:35
\$\endgroup\$
1
  • \$\begingroup\$ I appreciate the review! As for the last point: the advantage to the current scheme is that it would be simple to add the other encodings to the list and it would accept both but only emit the first matching entry. My advice is to trust no one (especially those who write false comments!) :) \$\endgroup\$ Commented Jul 6, 2022 at 19:58

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.