I made an assembler for the "hack" assembly language, here's an example of what it looks like:
@16
M=1
@17
M=0
@16
D=M
@0
D=D-M
@18
D;JGT
@16
D=M
@17
M=D+M
@16
M=M+1
@4
0;JMP
@17
D=M
Currently it only works with predefined symbols, I want to know what to improve before I continue.
So how it works is, is it reads the file line by line and first identifies if its a C or A instruction (there is also an L instruction which i haven't yet implemented). If its a C instruction, it gets the dest, comp and jump fields and converts them to the correct binary code.
If its an A instruction it gets the symbol of the instruction and converts it to the correct binary code.
Here are the opcodes: enter image description here
For now its just prints the binary code, it doesn't yet write to a file
I'm pretty embarrassed about the way I get the correct opcode for the instruction because it's basically just a bunch of if
statements. So please tell me how to improve.
x86 NASM btw
section .bss
buffer resb 320
line resb 10
instructionT resb 1
symbol resb 10
comp resb 6
jump resb 6
dest resb 6
op resb 64
fileptr resb 64
totalBytes resb 4
bytenum resb 4
SYSREAD equ 0
SYSWRITE equ 1
SYSOPEN equ 2
A_INSTRUCTION equ 0
C_INSTRUCTION equ 1
L_INSTRUCTION equ 2
true equ 1
false equ 0
section .data
filename db "file.asm", 0
opcode db "0000000000000000"
lastround db false
newline db 10
section .text
global _start
_start:
openfile filename
push rax ; file descriptor
call readfile
main:
call getline
push line
call instructionType
cmp byte [instructionT], C_INSTRUCTION
jne a_instruction
call getDetails ; get comp, dest and jump fields
mov edi, opcode
mov al, "1" ; first three bytes are 111
mov ecx, 3
repe stosb
inc edi
call compop ; opcode for comp field
push op
push 6
call fill
cmp byte [dest], 0 ; destination field is optional
jz nodest
call destop
push op
push 3
call fill
jmp jumpfield
nodest:
add edi, 3 ; fill adds 3 to edi so do it here if fill isnt called
jumpfield:
cmp byte [jump], 0 ; jump field is optional
jz cleanup
call jumpop
push op
push 3
call fill
cleanup:
push comp ; clears comp, dest and jump field
push 0
push 18 ; clear 18 bytes which is comp, dest and jump
call clear
jmp iteration
a_instruction:
call getSymbol ; for example @150 symbol = "150"
stringToNumber symbol
call binaryString
push symbol ; clear symbol for next iteration
push 0
push 10
call clear
iteration:
print opcode, 16
print newline, 1
push opcode ; clear opcode for next iteration
push "0"
push 16
call clear
cmp byte [lastround], true ; true if end of file
jne main
theEnd:
exit
binaryString: ; convert A instruction to binary version
mov eax, edi
mov ebx, 2
mov ecx, 15
mov edi, opcode + 15
binary:
cdq
div ebx
add dl, 48
mov byte [edi], dl
dec edi
loop binary
ret
instructionType:
mov edi, line
.whitespace:
cmp byte [edi], 0
jnz .out
inc edi
jmp .whitespace
.out:
cmp byte [edi], "@" ; @xxx = A_INSTRUCTION
jne .around1
mov byte [instructionT], A_INSTRUCTION
jmp .end
.around1:
cmp byte [edi], "(" ; (label) = L_INSTRUCTION
jne .around2
mov byte [instructionT], L_INSTRUCTION
jmp .end
.around2:
mov byte [instructionT], C_INSTRUCTION ; default = C_INSTRUCTION
.end:
ret
getDetails: ; get dest, comp and jump for C instruction
mov edi, line
inc edi ; get first byte
mov esi, edi
xor eax, eax
hasDest: ; if strings contains "=" it has the destination field
; ; example: A=M+1, destination = A
mov ecx, 10
mov al, "="
repne scasb
cmp byte [edi], 0 ; if no destination skip to the next loop
jz getComp
mov edi, dest
getDest:
movsb
cmp byte [esi], "="
jne getDest
inc esi
getComp:
mov edi, comp
.loop:
movsb
cmp byte [esi], ";"
je getJump
cmp byte [esi], 0
jnz .loop
jmp out
getJump:
inc esi
mov edi, jump
.loop:
movsb
cmp byte [esi], 0
jnz .loop
out:
ret
getSymbol:
mov esi, line
add esi, 2 ; skip @ or (
mov edi, symbol
.loop:
cmp byte [esi], ")"
je .end
cmp byte [esi], 0
je .end
cmp byte [esi], 10
je .end
movsb
jmp .loop
.end:
ret
getline:
mov ebx, [totalBytes] ; needed for checking end of file
mov esi, [fileptr]
push line
push 0
push 10
call clear ; clears the previous line
sub edi, 9 ; get back to starting index
whitespace:
cmp byte [esi], 10
je remove
jmp move
remove:
inc esi
inc byte [bytenum]
jmp whitespace
move:
inc byte [bytenum]
movsb
cmp [bytenum], ebx ; if (current byte == totalbytes) set lastround to true
jnge .around
mov byte [lastround], true
jmp .out
.around:
cmp byte [esi], 10
jne move
.out:
mov [fileptr], esi
ret
readfile: ; reads 30 bytes at a time until end of file
mov byte [totalBytes], 0
mov ebx, buffer
mov eax, SYSREAD
mov edi, [rsp + 8] ; file descriptor
mov esi, ebx ; buffer address
mov edx, 30
syscall
mov qword [fileptr], buffer
cmp eax, 0
jz .out
.readmore:
add ebx, eax ; move buffer address the amount of bytes that were read
add [totalBytes], eax
mov eax, SYSREAD
mov edi, [rsp + 8]
mov esi, ebx
mov edx, 30
syscall
cmp eax, 0 ; read until end of file
jnz .readmore
ret
.out:
fill: ; fills the opcode string (edi has the opcode address)
mov esi, [rsp + 16]
mov ecx, [rsp + 8]
repe movsb
ret 16
clear:
mov edi, [rsp + 24] ; address to be cleared
mov al, [rsp + 16]
mov ecx, [rsp + 8]
repe stosb
ret 24
here at the macros:
%macro print 2
push rdi
push rax
push rdi
push rdx
push rsi
push rcx
mov esi, %1
mov eax, SYSWRITE
mov edi, 1
mov edx, %2
syscall
pop rcx
pop rsi
pop rdx
pop rdi
pop rax
pop rdi
%endmacro
%macro stringToNumber 1
push rbp
push rcx
push rsi
push rax
mov rdi, 0 ; number stored here
mov ebp, %1
mov ecx, 0
cmp byte [ebp], "-"
jne %%loop
inc ecx
%%loop:
xor esi, esi
mov sil, byte [ebp + ecx]
sub sil, 48
cmp esi, 9 ; if this is greater than 9 the string has ended
jg %%exit
mov rax, 10
mul rdi ; multiply by 10
add rsi, rax
mov rdi, rsi
inc ecx
jmp %%loop
%%exit:
cmp byte [ebp], "-"
jne %%around
neg rdi
%%around:
pop rax
pop rsi
pop rcx
pop rbp
%endmacro
%macro openfile 1
mov eax, SYSOPEN
mov rdi, %1
mov esi, 2
mov edx, 0777
syscall
%endmacro
%macro getStringLength 1
mov edx, %1
xor ecx, ecx ;counter
%%loop:
mov bl, [edx + ecx]
inc ecx
cmp bl, 0
je %%exit
cmp bl, 10
je %%exit
jmp %%loop
%%exit:
sub ecx, 1
%endmacro
And here is the code for getting the correct opcode for the C instruction, it is over 300 lines :(
section .data
null db "null", 0
M db "M", 0
D db "D", 0
DM db "DM", 0
A db "A", 0
AM DB "AM", 0
AD db "AD", 0
ADM db "ADM", 0
MD db "MD", 0
jJGT db "JGT", 0
jJGE db "JGE", 0
jJEQ db "JEQ", 0
jJLT db "JLT", 0
jJNE db "JNE", 0
jJLE db "JLE", 0
jJMP db "JMP", 0
zero db "0", 0
one db "1", 0
negone db "-1", 0
notD db "!D", 0
notA db "!A", 0
notM db "!M", 0
minusD db "-D", 0
minusA db "-A", 0
minusM db "-M", 0
dplus1 db "D+1", 0
aplus1 db "A+1", 0
mplus1 db "M+1", 0
dminus1 db "D-1", 0
aminus1 db "A-1", 0
mminus1 db "M-1", 0
dplusa db "D+A", 0
dplusm db "D+M", 0
dminusa db "D-A", 0
dminusm db "D-M", 0
aminusd db "A-D", 0
mminusd db "M-D", 0
danda db "D&A", 0
dandm db "D&M", 0
dora db "D|A", 0
dorm db "D|M", 0
section .text
%macro strcmp 2
push rdi
mov edi, %1
getStringLength edi
mov esi, %2
repe cmpsb
pop rdi
%endmacro
; OPCODE FOR COMP
compop:
strcmp comp, zero
jnz .n1
mov rdx, "101010"
mov byte [opcode + 3], "0"
mov [op], rdx
jmp finishcomp
.n1:
strcmp comp, one
jnz .n2
mov rdx, "111111"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n2:
strcmp comp, negone
jnz .n3
mov rdx, "111010"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n3:
strcmp comp, D
jnz .n4
mov rdx, "001100"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n4:
strcmp comp, A
jnz .n5
mov rdx, "110000"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n5:
strcmp comp, M
jnz .n6
mov rdx, "110000"
mov [op], rdx
mov byte [opcode + 3], "1"
jmp finishcomp
.n6:
strcmp comp, notD
jnz .n7
mov rdx, "001101"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n7:
strcmp comp, notA
jnz .n8
mov rdx, "110001"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n8:
strcmp comp, notM
jnz .n9
mov rdx, "110001"
mov [op], rdx
mov byte [opcode + 3], "1"
jmp finishcomp
.n9:
strcmp comp, minusD
jnz .n10
mov rdx, "001111"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n10:
strcmp comp, minusA
jnz .n11
mov rdx, "110011"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n11:
strcmp comp, minusM
jnz .n12
mov rdx, "110011"
mov [op], rdx
mov byte [opcode + 3], "1"
jmp finishcomp
.n12:
strcmp comp, dplus1
jnz .n13
mov rdx, "011111"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n13:
strcmp comp, aplus1
jnz .n14
mov rdx, "110111"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n14:
strcmp comp, mplus1
jnz .n15
mov rdx, "110111"
mov [op], rdx
mov byte [opcode + 3], "1"
jmp finishcomp
.n15:
strcmp comp, dminus1
jnz .n16
mov rdx, "001110"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n16:
strcmp comp, aminus1
jnz .n17
mov rdx, "110010"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n17:
strcmp comp, mminus1
jnz .n18
mov rdx, "110010"
mov [op], rdx
mov byte [opcode + 3], "1"
jmp finishcomp
.n18:
strcmp comp, dplusa
jnz .n19
mov rdx, "000010"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n19:
strcmp comp, dplusm
jnz .n20
mov rdx, "000010"
mov [op], rdx
mov byte [opcode + 3], "1"
jmp finishcomp
.n20:
strcmp comp, dminusa
jnz .n21
mov rdx, "010011"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n21:
strcmp comp, dminusm
jnz .n22
mov rdx, "010011"
mov [op], rdx
mov byte [opcode + 3], "1"
jmp finishcomp
.n22:
strcmp comp, aminusd
jnz .n23
mov rdx, "000111"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n23:
strcmp comp, mminusd
jnz .n24
mov rdx, "000111"
mov [op], rdx
mov byte [opcode + 3], "1"
jmp finishcomp
.n24:
strcmp comp, danda
jnz .n25
mov rdx, "000000"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n25:
strcmp comp, dandm
jnz .n26
mov rdx, "000000"
mov [op], rdx
mov byte [opcode + 3], "1"
jmp finishcomp
.n26:
strcmp comp, dora
jnz .n27
mov rdx, "010101"
mov [op], rdx
mov byte [opcode + 3], "0"
jmp finishcomp
.n27:
strcmp comp, dorm
jnz finishcomp
mov rdx, "010101"
mov [op], rdx
mov byte [opcode + 3], "1"
finishcomp:
ret
; OPCODE FOR DESTINATION
destop:
strcmp dest, null
jnz n1
mov dword [op], "000"
jmp finish
n1:
strcmp dest, D
jnz n2
mov dword [op], "010"
jmp finish
n2:
strcmp dest, M
jnz n3
mov dword [op], "001"
jmp finish
n3:
strcmp dest, A
jnz n4
mov dword [op], "100"
jmp finish
n4:
strcmp dest, DM
jnz n5
mov dword [op], "011"
jmp finish
n5:
strcmp dest, AM
jnz n6
mov dword [op], "101"
jmp finish
n6:
strcmp dest, AD
jnz n7
mov dword [op], "110"
jmp finish
n7:
strcmp dest, ADM
jnz n8
mov dword [op], "111"
n8:
strcmp dest, MD
jnz finish
mov dword [op], "011"
finish:
ret
; OPCODE FOR JUMP FIELD
jumpop:
mov byte [jump + 3], 0
strcmp jump, jJGE
jnz .n1
mov dword [op], "011"
jmp endjump
.n1:
strcmp jump, null
jnz .n2
mov dword [op], "000"
jmp endjump
.n2:
strcmp jump, jJGT
jnz .n3
mov dword [op], "001"
jmp endjump
.n3:
strcmp jump, jJLT
jnz .n4
mov dword [op], "100"
jmp endjump
.n4:
strcmp jump, jJNE
jnz .n5
mov dword [op], "101"
jmp endjump
.n5:
strcmp jump, jJLE
jnz .n6
mov dword [op], "110"
jmp endjump
.n6:
strcmp jump, jJMP
jnz endjump
mov dword [op], "111"
endjump:
ret```
-
\$\begingroup\$ See codereview.stackexchange.com/questions/277664/… for an alternative version. \$\endgroup\$Edward– Edward2022年06月27日 16:10:20 +00:00Commented Jun 27, 2022 at 16:10
-
\$\begingroup\$ @Edward in your version you used registers to pass parameters, is that best practice instead of using the stack to pass parameters to procedures? \$\endgroup\$elonma1234– elonma12342022年06月27日 17:06:06 +00:00Commented Jun 27, 2022 at 17:06
-
\$\begingroup\$ It's generally faster to use registers vs. stack. It's one reason all of the Linux syscalls use registers to pass parameters. \$\endgroup\$Edward– Edward2022年06月27日 17:22:43 +00:00Commented Jun 27, 2022 at 17:22
-
\$\begingroup\$ why does C code that compiles into assembly use the stack to pass parameters, why not the registers if they are faster? \$\endgroup\$elonma1234– elonma12342022年06月27日 18:27:32 +00:00Commented Jun 27, 2022 at 18:27
-
\$\begingroup\$ It doesn't: godbolt.org/z/bvqGEn3az \$\endgroup\$Edward– Edward2022年06月27日 19:31:57 +00:00Commented Jun 27, 2022 at 19:31
1 Answer 1
Here are some things that may help you improve your program.
Make sure your code is complete
If you're going to ask for a code review, it should be your complete, best code to be respectful of the time reviewers spend looking it over. This code was incomplete and was missing a definition for exit
. I can and did guess what it was supposed to be, but it's better if your code is actually complete.
Avoid the use of "magic numbers"
This code is littered with "magic numbers," that is, unnamed constants such as 0777, 30, 18, etc. Generally it's better to avoid that and give such constants meaningful names. That way, if anything ever needs to be changed, you won't have to go hunting through the code for all instances of "30" and then trying to determine if this particular 30 means the length of the size of the chunks read by readfile
or some other constant that happens to have the same value.
Add comments
There are very few comments in this program and lots of "magic numbers" making it much harder to read than it should be. The use of macros helps, however, and is a good practice I would encourage you to continue.
Fix the bugs
There are at least two bugs in this program. The first is that if a line begins with leading whitespace, the program fails (in my case, it segfaulted and crashed).
Add error handling
If the hardcoded filename is a nonexistent file (or doesn't have read permissions, etc.) the file open call will fail and an error number will be returned in rax. You should check the return values generally to see if they have failed and handle the error appropriately.
Think carefully about jumps
A jump in an x86_64 processor is a relatively costly operation; it messes with cacheing and speculative execution that modern processes rely on. So figuring out how to minimize jumps by rearranging code is often worthwhile. Here are two examples:
In the getStringLength
macro, we have these lines:
%%loop:
; some instructions
cmp bl, 10
je %%exit
jmp %%loop
%%exit:
I'd rewrite that like this:
%%loop:
; some instructions
cmp bl, 10
jne %%loop
%%exit:
A little more complex case is this code:
whitespace:
cmp byte [esi], 10
je remove
jmp move
remove:
inc esi
inc byte [bytenum]
jmp whitespace
move:
As above, whenever you have a conditional jump followed by an unconditional jump, something is probably not quite right, especially when all that the conditional jump does is jump over the unconditional jump! So the first rewrite might look like this:
whitespace:
cmp byte [esi], 10
jne move
inc esi
inc byte [bytenum]
jmp whitespace
move:
However, since this is a tight loop, I'd be inclined to go one step further:
jmp whitespace
looptop:
inc esi
inc byte [bytenum]
whitespace:
cmp byte [esi], 10
je looptop
move:
In this case, we now only execute the unconditional jmp once, and the single conditional jump only as many times as we iterate through the loop.
Document register use
The best assembly language programmers keep close track of register usage to maximize their use and to minimize needless moves, pushes and pops. Comments will help you; I often write comment blocks above labeled lines (jump targets) that tell what I'm expecting to be in each register and on the stack. It really helps debugging code like this. Ideally, you'd assign some purpose to each register and then use them only for that purpose in the entire rest of the code.
Document macro parameters
In what I believe was a design error, nasm's macros use numbered rather than named parameters. For that reason, we have to refer to parameters as %1
and %2
rather than, say, buffer
and bufferlength
. To make up for this deficit, documenting macro parameters in the form of comments is vital for others to be able to understand your program.
Use efficient instructions
The code currently includes this routine:
binaryStringn: ; convert A instruction to binary version
mov eax, edi
mov ebx, 2
mov ecx, 15
mov edi, opcode + 15
binary:
cdq
div ebx
add dl, 48
mov byte [edi], dl
dec edi
loop binary
ret
The div
instruction is relatively slow, but it can easily be avoided. In this case, we're converting to binary, so we could just shift the bits instead, which is much more efficient:
; convert a 16-bit number in edi to an ASCII representation of binary
; and store it at opcode
;
; edi - input number
; trashed: eax, ecx, edx
binaryString: ; convert A instruction to binary version
mov edx, edi
mov ecx, 15
binary:
xor eax,eax
shr rdx,1
adc al, '0'
mov byte [opcode + ecx], al
loop binary
ret
As a further enhancement, this could easily be made into a more general purpose function by allowing the user to pass in both the destination address and the desired number of bits to print.
Don't hardcode file names
Insisting that the input file is named "file.asm" is not very user friendly. Instead, I'd recommend adding code to fetch the file name from the command line.
Use a data-driven structure
Right now the printable codes and their constituent parts are scattered in multiple different places. I would suggest that defining a struc
would make it easier to keep such things together, making it much easier to write, understand, debug and expand. Instead of an unmaintainable long if
statement, you could simply have short code that searches a table of structures for a value. Even better, you could also use the same table for disassembly, should you decide to write one.
Rethink the structure
A traditional assembler typically has a tokenizer that turns the input symbols into tokens, and then a parser to evaluate the tokens according to some predefined acceptable grammar. Using a structure like that would make your code shorter and easier to understand.
Consider unit tests
I would recommend refactoring this into separate modules which could then be linked to test code (or combined into your assembler). By providing not only modules but also the code by which they can be tested, the program becomes much more durable and easier to maintain. You could write the tests in either assembly language or C or C++.
Think of the user
If there is a problem with the input file, you'd probably prefer that the assembler prints some kind of diagnostic message rather than just unceremoniously and silently crashing. Add such diagnostic messages will save you a lot of time in the long run, both as you expand this program, but also as you use it.
-
\$\begingroup\$ how would you search for the correct opcode tho? A linear search would be slow \$\endgroup\$elonma1234– elonma12342022年06月23日 20:05:42 +00:00Commented Jun 23, 2022 at 20:05
-
\$\begingroup\$ I would propose making three tables: one each for the destination, operation and jump. Since they're all short, linear search would not be that slow. Or one could sort the entries and easily convert to a binary search. \$\endgroup\$Edward– Edward2022年06月23日 20:23:21 +00:00Commented Jun 23, 2022 at 20:23
-
\$\begingroup\$ I don't understand how this could be implemented with a binary search. Could you please explain? \$\endgroup\$elonma1234– elonma12342022年06月23日 20:27:26 +00:00Commented Jun 23, 2022 at 20:27
-
\$\begingroup\$ If you were manually sorting through a list of names in an alphabetically sorted list, you might do this: 1) set the beginning to the first entry, 2) set the ending to the last entry, 3) middle = (last-first)/2, 4) if the middle entry matches, you're done 5) if your entry is higher, set beginning to middle+1, else set ending to middle-1 6) go to step 3 \$\endgroup\$Edward– Edward2022年06月23日 20:33:48 +00:00Commented Jun 23, 2022 at 20:33
-
\$\begingroup\$ The computer can do the exact same thing. \$\endgroup\$Edward– Edward2022年06月23日 20:34:00 +00:00Commented Jun 23, 2022 at 20:34