4
\$\begingroup\$

I made an assembler for the "hack" assembly language, here's an example of what it looks like:

 @16
 M=1
 @17
 M=0
 @16
 D=M
 @0
 D=D-M
 @18
 D;JGT
 @16
 D=M
 @17
 M=D+M
 @16
 M=M+1
 @4
 0;JMP
 @17
 D=M

Currently it only works with predefined symbols, I want to know what to improve before I continue.

So how it works is, is it reads the file line by line and first identifies if its a C or A instruction (there is also an L instruction which i haven't yet implemented). If its a C instruction, it gets the dest, comp and jump fields and converts them to the correct binary code.

If its an A instruction it gets the symbol of the instruction and converts it to the correct binary code.

Here are the opcodes: enter image description here

For now its just prints the binary code, it doesn't yet write to a file

I'm pretty embarrassed about the way I get the correct opcode for the instruction because it's basically just a bunch of if statements. So please tell me how to improve.

x86 NASM btw

section .bss 
 buffer resb 320
 line resb 10
 instructionT resb 1
 symbol resb 10
 comp resb 6
 jump resb 6
 dest resb 6
 op resb 64
 fileptr resb 64
 totalBytes resb 4
 bytenum resb 4
SYSREAD equ 0
SYSWRITE equ 1
SYSOPEN equ 2
A_INSTRUCTION equ 0
C_INSTRUCTION equ 1
L_INSTRUCTION equ 2
true equ 1
false equ 0
section .data
 filename db "file.asm", 0
 opcode db "0000000000000000"
 lastround db false
 newline db 10
 
section .text
 global _start 
 
 _start:
 
 openfile filename
 push rax ; file descriptor
 call readfile 
 main: 
 call getline
 
 push line
 call instructionType 
 cmp byte [instructionT], C_INSTRUCTION 
 jne a_instruction
 call getDetails ; get comp, dest and jump fields 
 mov edi, opcode
 mov al, "1" ; first three bytes are 111
 mov ecx, 3
 repe stosb
 inc edi 
 call compop ; opcode for comp field
 push op
 push 6
 call fill
 
 cmp byte [dest], 0 ; destination field is optional
 jz nodest
 call destop 
 push op
 push 3
 call fill 
 jmp jumpfield 
 nodest:
 add edi, 3 ; fill adds 3 to edi so do it here if fill isnt called
 jumpfield:
 cmp byte [jump], 0 ; jump field is optional
 jz cleanup
 call jumpop 
 push op
 push 3
 call fill
 
 cleanup: 
 push comp ; clears comp, dest and jump field
 push 0
 push 18 ; clear 18 bytes which is comp, dest and jump
 call clear
 jmp iteration
 a_instruction:
 call getSymbol ; for example @150 symbol = "150"
 stringToNumber symbol
 call binaryString
 push symbol ; clear symbol for next iteration
 push 0
 push 10
 call clear
 iteration: 
 print opcode, 16
 print newline, 1
 push opcode ; clear opcode for next iteration
 push "0"
 push 16
 call clear
 cmp byte [lastround], true ; true if end of file
 jne main
 
 theEnd:
 exit
 binaryString: ; convert A instruction to binary version
 mov eax, edi
 mov ebx, 2
 mov ecx, 15
 mov edi, opcode + 15
 binary:
 cdq
 div ebx
 add dl, 48
 mov byte [edi], dl
 dec edi
 loop binary
 ret
 
 instructionType:
 mov edi, line
 .whitespace:
 cmp byte [edi], 0
 jnz .out
 inc edi
 jmp .whitespace
 .out:
 
 cmp byte [edi], "@" ; @xxx = A_INSTRUCTION
 jne .around1
 mov byte [instructionT], A_INSTRUCTION
 jmp .end
 .around1: 
 cmp byte [edi], "(" ; (label) = L_INSTRUCTION
 jne .around2
 mov byte [instructionT], L_INSTRUCTION
 jmp .end
 .around2:
 mov byte [instructionT], C_INSTRUCTION ; default = C_INSTRUCTION
 .end:
 
 ret 
 
 getDetails: ; get dest, comp and jump for C instruction
 mov edi, line
 inc edi ; get first byte
 mov esi, edi
 
 xor eax, eax
 hasDest: ; if strings contains "=" it has the destination field
 ; ; example: A=M+1, destination = A
 mov ecx, 10
 mov al, "="
 repne scasb
 cmp byte [edi], 0 ; if no destination skip to the next loop
 jz getComp
 
 mov edi, dest
 getDest:
 movsb
 cmp byte [esi], "="
 jne getDest
 inc esi
 
 getComp:
 mov edi, comp
 .loop:
 movsb
 cmp byte [esi], ";"
 je getJump
 cmp byte [esi], 0
 jnz .loop
 jmp out
 getJump:
 inc esi
 mov edi, jump
 .loop:
 movsb
 cmp byte [esi], 0
 jnz .loop 
 out:
 ret
 getSymbol: 
 
 mov esi, line
 add esi, 2 ; skip @ or (
 mov edi, symbol
 
 .loop:
 cmp byte [esi], ")"
 je .end
 cmp byte [esi], 0
 je .end
 cmp byte [esi], 10
 je .end
 movsb
 jmp .loop 
 .end:
 ret
 
 getline:
 mov ebx, [totalBytes] ; needed for checking end of file
 mov esi, [fileptr]
 
 push line 
 push 0 
 push 10
 call clear ; clears the previous line
 sub edi, 9 ; get back to starting index
 whitespace:
 cmp byte [esi], 10
 je remove
 jmp move
 remove:
 inc esi
 inc byte [bytenum]
 jmp whitespace
 move:
 inc byte [bytenum]
 movsb 
 cmp [bytenum], ebx ; if (current byte == totalbytes) set lastround to true
 jnge .around
 mov byte [lastround], true
 jmp .out
 .around:
 cmp byte [esi], 10
 jne move
 .out:
 mov [fileptr], esi
 ret 
 readfile: ; reads 30 bytes at a time until end of file
 mov byte [totalBytes], 0
 
 mov ebx, buffer
 mov eax, SYSREAD
 mov edi, [rsp + 8] ; file descriptor
 mov esi, ebx ; buffer address
 mov edx, 30
 syscall
 mov qword [fileptr], buffer
 cmp eax, 0
 jz .out
 .readmore:
 add ebx, eax ; move buffer address the amount of bytes that were read
 add [totalBytes], eax
 mov eax, SYSREAD
 mov edi, [rsp + 8]
 mov esi, ebx 
 mov edx, 30
 syscall
 
 cmp eax, 0 ; read until end of file
 jnz .readmore 
 ret 
 
 .out:
 fill: ; fills the opcode string (edi has the opcode address) 
 mov esi, [rsp + 16] 
 mov ecx, [rsp + 8]
 repe movsb 
 ret 16 
 clear:
 mov edi, [rsp + 24] ; address to be cleared
 mov al, [rsp + 16] 
 mov ecx, [rsp + 8]
 repe stosb
 ret 24

here at the macros:

%macro print 2
 push rdi
 push rax
 push rdi
 push rdx
 push rsi
 push rcx
 mov esi, %1
 mov eax, SYSWRITE
 mov edi, 1
 mov edx, %2
 syscall
 pop rcx
 pop rsi
 pop rdx
 pop rdi
 pop rax
 pop rdi
%endmacro
%macro stringToNumber 1
 push rbp
 push rcx
 push rsi
 push rax
 mov rdi, 0 ; number stored here
 mov ebp, %1 
 mov ecx, 0 
 cmp byte [ebp], "-"
 jne %%loop
 inc ecx
 %%loop:
 xor esi, esi
 mov sil, byte [ebp + ecx]
 sub sil, 48
 cmp esi, 9 ; if this is greater than 9 the string has ended
 jg %%exit
 mov rax, 10
 mul rdi ; multiply by 10
 add rsi, rax
 mov rdi, rsi
 inc ecx
 jmp %%loop
 %%exit:
 cmp byte [ebp], "-"
 jne %%around
 neg rdi
 %%around:
 pop rax
 pop rsi
 pop rcx
 pop rbp
%endmacro
%macro openfile 1
 mov eax, SYSOPEN
 mov rdi, %1
 mov esi, 2
 mov edx, 0777
 syscall
%endmacro
%macro getStringLength 1
 mov edx, %1
 xor ecx, ecx ;counter
 %%loop:
 mov bl, [edx + ecx]
 inc ecx
 cmp bl, 0
 je %%exit
 cmp bl, 10
 je %%exit
 jmp %%loop
 %%exit:
 sub ecx, 1
%endmacro

And here is the code for getting the correct opcode for the C instruction, it is over 300 lines :(

section .data
 null db "null", 0
 M db "M", 0
 D db "D", 0
 DM db "DM", 0
 A db "A", 0
 AM DB "AM", 0
 AD db "AD", 0
 ADM db "ADM", 0
 MD db "MD", 0
 jJGT db "JGT", 0
 jJGE db "JGE", 0
 jJEQ db "JEQ", 0
 jJLT db "JLT", 0
 jJNE db "JNE", 0
 jJLE db "JLE", 0
 jJMP db "JMP", 0
 zero db "0", 0
 one db "1", 0
 negone db "-1", 0
 notD db "!D", 0
 notA db "!A", 0
 notM db "!M", 0
 minusD db "-D", 0
 minusA db "-A", 0
 minusM db "-M", 0
 dplus1 db "D+1", 0
 aplus1 db "A+1", 0
 mplus1 db "M+1", 0
 dminus1 db "D-1", 0
 aminus1 db "A-1", 0
 mminus1 db "M-1", 0
 dplusa db "D+A", 0
 dplusm db "D+M", 0
 dminusa db "D-A", 0
 dminusm db "D-M", 0
 aminusd db "A-D", 0
 mminusd db "M-D", 0
 danda db "D&A", 0
 dandm db "D&M", 0
 dora db "D|A", 0
 dorm db "D|M", 0
section .text
 
%macro strcmp 2
 push rdi
 mov edi, %1
 getStringLength edi
 mov esi, %2
 repe cmpsb
 pop rdi
%endmacro
 ; OPCODE FOR COMP
 compop:
 strcmp comp, zero
 jnz .n1
 mov rdx, "101010"
 mov byte [opcode + 3], "0"
 mov [op], rdx
 jmp finishcomp
 .n1:
 strcmp comp, one
 jnz .n2
 mov rdx, "111111"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n2:
 strcmp comp, negone
 jnz .n3
 mov rdx, "111010"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n3:
 strcmp comp, D 
 jnz .n4
 mov rdx, "001100" 
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n4:
 strcmp comp, A
 jnz .n5
 mov rdx, "110000"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n5:
 strcmp comp, M
 jnz .n6
 mov rdx, "110000"
 mov [op], rdx
 mov byte [opcode + 3], "1"
 jmp finishcomp
 .n6:
 strcmp comp, notD
 jnz .n7
 mov rdx, "001101"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n7:
 strcmp comp, notA
 jnz .n8
 mov rdx, "110001"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n8:
 strcmp comp, notM
 jnz .n9
 mov rdx, "110001"
 mov [op], rdx
 mov byte [opcode + 3], "1"
 jmp finishcomp
 .n9:
 strcmp comp, minusD
 jnz .n10
 mov rdx, "001111"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n10:
 strcmp comp, minusA
 jnz .n11
 mov rdx, "110011"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n11:
 strcmp comp, minusM
 jnz .n12
 mov rdx, "110011"
 mov [op], rdx
 mov byte [opcode + 3], "1"
 jmp finishcomp
 .n12:
 strcmp comp, dplus1
 jnz .n13
 mov rdx, "011111"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n13:
 strcmp comp, aplus1
 jnz .n14
 mov rdx, "110111"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n14:
 strcmp comp, mplus1
 jnz .n15
 mov rdx, "110111"
 mov [op], rdx
 mov byte [opcode + 3], "1"
 jmp finishcomp
 .n15:
 strcmp comp, dminus1
 jnz .n16 
 mov rdx, "001110"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n16:
 strcmp comp, aminus1
 jnz .n17
 mov rdx, "110010"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n17:
 strcmp comp, mminus1
 jnz .n18
 mov rdx, "110010"
 mov [op], rdx
 mov byte [opcode + 3], "1"
 jmp finishcomp
 .n18:
 strcmp comp, dplusa
 jnz .n19
 mov rdx, "000010"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n19:
 strcmp comp, dplusm
 jnz .n20
 mov rdx, "000010"
 mov [op], rdx
 mov byte [opcode + 3], "1"
 jmp finishcomp
 .n20:
 strcmp comp, dminusa
 jnz .n21
 mov rdx, "010011"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n21:
 strcmp comp, dminusm
 jnz .n22
 mov rdx, "010011"
 mov [op], rdx
 mov byte [opcode + 3], "1"
 jmp finishcomp
 .n22:
 strcmp comp, aminusd
 jnz .n23
 mov rdx, "000111"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n23:
 strcmp comp, mminusd
 jnz .n24
 mov rdx, "000111"
 mov [op], rdx
 mov byte [opcode + 3], "1"
 jmp finishcomp
 .n24:
 strcmp comp, danda
 jnz .n25
 mov rdx, "000000"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n25:
 strcmp comp, dandm
 jnz .n26
 mov rdx, "000000"
 mov [op], rdx
 mov byte [opcode + 3], "1"
 jmp finishcomp
 .n26:
 strcmp comp, dora
 jnz .n27
 mov rdx, "010101"
 mov [op], rdx
 mov byte [opcode + 3], "0"
 jmp finishcomp
 .n27:
 strcmp comp, dorm
 jnz finishcomp
 mov rdx, "010101"
 mov [op], rdx
 mov byte [opcode + 3], "1"
 finishcomp:
 ret
 ; OPCODE FOR DESTINATION
 destop:
 
 strcmp dest, null
 jnz n1
 mov dword [op], "000"
 jmp finish
 n1:
 strcmp dest, D
 jnz n2
 mov dword [op], "010"
 jmp finish
 n2:
 strcmp dest, M
 jnz n3
 mov dword [op], "001"
 jmp finish
 n3:
 strcmp dest, A 
 jnz n4
 mov dword [op], "100"
 jmp finish
 n4:
 strcmp dest, DM
 jnz n5
 mov dword [op], "011"
 jmp finish
 n5:
 strcmp dest, AM
 jnz n6
 mov dword [op], "101"
 jmp finish
 n6:
 strcmp dest, AD
 jnz n7
 mov dword [op], "110"
 jmp finish
 n7:
 strcmp dest, ADM
 jnz n8
 mov dword [op], "111"
 n8:
 strcmp dest, MD
 jnz finish
 mov dword [op], "011"
 finish:
 
 ret
 ; OPCODE FOR JUMP FIELD
 jumpop:
 mov byte [jump + 3], 0
 strcmp jump, jJGE
 jnz .n1
 mov dword [op], "011"
 jmp endjump
 .n1:
 strcmp jump, null
 jnz .n2
 mov dword [op], "000"
 jmp endjump
 .n2:
 strcmp jump, jJGT
 jnz .n3
 mov dword [op], "001"
 jmp endjump
 .n3:
 strcmp jump, jJLT
 jnz .n4
 mov dword [op], "100"
 jmp endjump
 .n4:
 strcmp jump, jJNE
 jnz .n5
 mov dword [op], "101"
 jmp endjump
 .n5:
 strcmp jump, jJLE
 jnz .n6
 mov dword [op], "110"
 jmp endjump
 .n6:
 strcmp jump, jJMP
 jnz endjump
 mov dword [op], "111"
 endjump:
 
 ret```
Edward
67.2k4 gold badges120 silver badges284 bronze badges
asked Jun 23, 2022 at 10:48
\$\endgroup\$
6
  • \$\begingroup\$ See codereview.stackexchange.com/questions/277664/… for an alternative version. \$\endgroup\$ Commented Jun 27, 2022 at 16:10
  • \$\begingroup\$ @Edward in your version you used registers to pass parameters, is that best practice instead of using the stack to pass parameters to procedures? \$\endgroup\$ Commented Jun 27, 2022 at 17:06
  • \$\begingroup\$ It's generally faster to use registers vs. stack. It's one reason all of the Linux syscalls use registers to pass parameters. \$\endgroup\$ Commented Jun 27, 2022 at 17:22
  • \$\begingroup\$ why does C code that compiles into assembly use the stack to pass parameters, why not the registers if they are faster? \$\endgroup\$ Commented Jun 27, 2022 at 18:27
  • \$\begingroup\$ It doesn't: godbolt.org/z/bvqGEn3az \$\endgroup\$ Commented Jun 27, 2022 at 19:31

1 Answer 1

3
\$\begingroup\$

Here are some things that may help you improve your program.

Make sure your code is complete

If you're going to ask for a code review, it should be your complete, best code to be respectful of the time reviewers spend looking it over. This code was incomplete and was missing a definition for exit. I can and did guess what it was supposed to be, but it's better if your code is actually complete.

Avoid the use of "magic numbers"

This code is littered with "magic numbers," that is, unnamed constants such as 0777, 30, 18, etc. Generally it's better to avoid that and give such constants meaningful names. That way, if anything ever needs to be changed, you won't have to go hunting through the code for all instances of "30" and then trying to determine if this particular 30 means the length of the size of the chunks read by readfile or some other constant that happens to have the same value.

Add comments

There are very few comments in this program and lots of "magic numbers" making it much harder to read than it should be. The use of macros helps, however, and is a good practice I would encourage you to continue.

Fix the bugs

There are at least two bugs in this program. The first is that if a line begins with leading whitespace, the program fails (in my case, it segfaulted and crashed).

Add error handling

If the hardcoded filename is a nonexistent file (or doesn't have read permissions, etc.) the file open call will fail and an error number will be returned in rax. You should check the return values generally to see if they have failed and handle the error appropriately.

Think carefully about jumps

A jump in an x86_64 processor is a relatively costly operation; it messes with cacheing and speculative execution that modern processes rely on. So figuring out how to minimize jumps by rearranging code is often worthwhile. Here are two examples:

In the getStringLength macro, we have these lines:

%%loop:
 ; some instructions
 cmp bl, 10
 je %%exit
 jmp %%loop
%%exit:

I'd rewrite that like this:

%%loop:
 ; some instructions
 cmp bl, 10
 jne %%loop
%%exit:

A little more complex case is this code:

whitespace:
 cmp byte [esi], 10
 je remove
 jmp move
remove:
 inc esi
 inc byte [bytenum]
 jmp whitespace
move:

As above, whenever you have a conditional jump followed by an unconditional jump, something is probably not quite right, especially when all that the conditional jump does is jump over the unconditional jump! So the first rewrite might look like this:

whitespace:
 cmp byte [esi], 10
 jne move
 inc esi
 inc byte [bytenum]
 jmp whitespace
move:

However, since this is a tight loop, I'd be inclined to go one step further:

 jmp whitespace
looptop:
 inc esi
 inc byte [bytenum]
whitespace:
 cmp byte [esi], 10
 je looptop
move:

In this case, we now only execute the unconditional jmp once, and the single conditional jump only as many times as we iterate through the loop.

Document register use

The best assembly language programmers keep close track of register usage to maximize their use and to minimize needless moves, pushes and pops. Comments will help you; I often write comment blocks above labeled lines (jump targets) that tell what I'm expecting to be in each register and on the stack. It really helps debugging code like this. Ideally, you'd assign some purpose to each register and then use them only for that purpose in the entire rest of the code.

Document macro parameters

In what I believe was a design error, nasm's macros use numbered rather than named parameters. For that reason, we have to refer to parameters as %1 and %2 rather than, say, buffer and bufferlength. To make up for this deficit, documenting macro parameters in the form of comments is vital for others to be able to understand your program.

Use efficient instructions

The code currently includes this routine:

 binaryStringn: ; convert A instruction to binary version
 mov eax, edi
 mov ebx, 2
 mov ecx, 15
 mov edi, opcode + 15
 binary:
 cdq
 div ebx
 add dl, 48
 mov byte [edi], dl
 dec edi
 loop binary
 ret

The div instruction is relatively slow, but it can easily be avoided. In this case, we're converting to binary, so we could just shift the bits instead, which is much more efficient:

 ; convert a 16-bit number in edi to an ASCII representation of binary
 ; and store it at opcode
 ;
 ; edi - input number 
 ; trashed: eax, ecx, edx
 binaryString: ; convert A instruction to binary version
 mov edx, edi
 mov ecx, 15
 binary:
 xor eax,eax
 shr rdx,1
 adc al, '0'
 mov byte [opcode + ecx], al
 loop binary
 ret
 

As a further enhancement, this could easily be made into a more general purpose function by allowing the user to pass in both the destination address and the desired number of bits to print.

Don't hardcode file names

Insisting that the input file is named "file.asm" is not very user friendly. Instead, I'd recommend adding code to fetch the file name from the command line.

Use a data-driven structure

Right now the printable codes and their constituent parts are scattered in multiple different places. I would suggest that defining a struc would make it easier to keep such things together, making it much easier to write, understand, debug and expand. Instead of an unmaintainable long if statement, you could simply have short code that searches a table of structures for a value. Even better, you could also use the same table for disassembly, should you decide to write one.

Rethink the structure

A traditional assembler typically has a tokenizer that turns the input symbols into tokens, and then a parser to evaluate the tokens according to some predefined acceptable grammar. Using a structure like that would make your code shorter and easier to understand.

Consider unit tests

I would recommend refactoring this into separate modules which could then be linked to test code (or combined into your assembler). By providing not only modules but also the code by which they can be tested, the program becomes much more durable and easier to maintain. You could write the tests in either assembly language or C or C++.

Think of the user

If there is a problem with the input file, you'd probably prefer that the assembler prints some kind of diagnostic message rather than just unceremoniously and silently crashing. Add such diagnostic messages will save you a lot of time in the long run, both as you expand this program, but also as you use it.

answered Jun 23, 2022 at 18:00
\$\endgroup\$
7
  • \$\begingroup\$ how would you search for the correct opcode tho? A linear search would be slow \$\endgroup\$ Commented Jun 23, 2022 at 20:05
  • \$\begingroup\$ I would propose making three tables: one each for the destination, operation and jump. Since they're all short, linear search would not be that slow. Or one could sort the entries and easily convert to a binary search. \$\endgroup\$ Commented Jun 23, 2022 at 20:23
  • \$\begingroup\$ I don't understand how this could be implemented with a binary search. Could you please explain? \$\endgroup\$ Commented Jun 23, 2022 at 20:27
  • \$\begingroup\$ If you were manually sorting through a list of names in an alphabetically sorted list, you might do this: 1) set the beginning to the first entry, 2) set the ending to the last entry, 3) middle = (last-first)/2, 4) if the middle entry matches, you're done 5) if your entry is higher, set beginning to middle+1, else set ending to middle-1 6) go to step 3 \$\endgroup\$ Commented Jun 23, 2022 at 20:33
  • \$\begingroup\$ The computer can do the exact same thing. \$\endgroup\$ Commented Jun 23, 2022 at 20:34

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.