I have written a program in x86 assembly (Intel syntax/MASM) that interprets brainfuck code that is fed to it via an interactive console and prints the final stack to stdout. Note that it does not include an implementation for the ,
command, but everything else has been implemented. The idea is that it presents the user with a prompt where they can enter their code; when the user hits Enter, it evaluates it and then dumps the resultant cell state to the console. While the cell state is maintained between prompt entries, the pointer position is not.
It looks like this:
$++++++++[>++++[>++>+++>+++>+<<<<-]>+>+>->>+[<]<-]>>.>---.+++++++..+++.>>.<-.<.+++.------.--------.>>+.>++.
Hello World!
0 0 72 100 87 33 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
--------------------------------------------------
$>>[.>]
HdW!
0 0 72 100 87 33 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
--------------------------------------------------
$++++++++[>++++++++<-]>+.
A
0 65 72 100 87 33 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
--------------------------------------------------
This is my first major foray into assembly, so I'm mainly looking for general tips, such as:
- Which registers I should be using in particular cases
- When I should be using RAM and when I should be using registers
- Ways that the code could be simplified
.386
.model flat,stdcall
.stack 4096
include \masm32\include\masm32.inc
includelib \masm32\lib\masm32.lib
ExitProcess proto,dwExitCode:dword
.data
bfsrc BYTE 200 dup(0) ; buffer to store source code
bfcells BYTE 100 dup(0) ; 100-byte data array size for now
loopStack DD 5 dup(0) ; stores the position of the first instruction in the current loop. Maximum of 5 nested loops.
charBuf BYTE 5 dup(0) ; buffer for when we are dumping numbers
newline BYTE 10,0 ; ASCII 10 is \n
prompt BYTE "$",0 ; input prompt string
hr BYTE 50 dup('-'),0 ; fake horizontal rule
space BYTE ' ',0
.code
EvalBf proc
start:
; print the prompt and then read input into the source array
invoke StdOut, addr prompt
invoke StdIn, addr bfsrc,200
; exit if input is empty
cmp bfsrc,0
je exit
mov eax,0 ; eax is BF data pointer
mov ebx,0 ; ebx is BF source pointer
mov ecx,0 ; ecx is loop depth
processInstruction:
; jump according to current source char
cmp BYTE PTR bfsrc[ebx], '+'
je plus
cmp BYTE PTR bfsrc[ebx], '-'
je minus
cmp BYTE PTR bfsrc[ebx], '>'
je fwd
cmp BYTE PTR bfsrc[ebx], '<'
je back
cmp BYTE PTR bfsrc[ebx], '['
je open
cmp BYTE PTR bfsrc[ebx], ']'
je close
cmp BYTE PTR bfsrc[ebx], '.'
je dot
; By default, skip instruction if we haven't caught it
jmp processNextInstruction
plus:
inc BYTE PTR bfcells[eax]
jmp processNextInstruction
minus:
dec BYTE PTR bfcells[eax]
jmp processNextInstruction
fwd:
inc eax
jmp processNextInstruction
back:
dec eax
jmp processNextInstruction
open:
; push the current source position
; onto the loop stack
mov loopStack[ecx*4],ebx
inc ecx
jmp processNextInstruction
close:
dec ecx
cmp BYTE PTR bfcells[eax], 0
; break out of loop if data cell is 0
je processNextInstruction
; pop the innermost loop position and
; set it as the next instruction
mov ebx,loopStack[ecx*4]
inc ecx
jmp processNextInstruction
dot:
; transfer current cell value into char buffer through dl
mov dl, BYTE PTR bfcells[eax]
mov BYTE PTR charBuf[0], dl
; follow the character with null to terminate the string
mov BYTE PTR charBuf[1],0
; save the registers we need to maintain so that stdout doesn't break anything
push eax
push ecx
; print generated string
invoke StdOut, addr charBuf
pop ecx
pop eax
jmp processNextInstruction
processNextInstruction:
inc ebx
; we're finished if we have hit the end of the input
cmp BYTE PTR bfsrc[ebx], 0
je done
jmp processInstruction
done:
; loop through every value in the BF data array and print it
invoke StdOut, addr newline
mov eax, 0
printNext:
; the data array is 100 cells long, so stop looping when we hit cell 100
cmp eax, 100
jge reset
; save value in eax onto the stack
push eax
; convert cell value to string and store it in the character buffer
invoke dwtoa, BYTE PTR bfcells[eax], addr charBuf
; print the buffer, followed by a space
invoke StdOut, addr charBuf
invoke StdOut, addr space
; restore and increment value of eax
pop eax
inc eax
jmp printNext
; when processing is complete, go back to the beginning and take new input
reset:
invoke StdOut, addr newline
invoke StdOut, addr hr
invoke StdOut, addr newline
jmp start
exit:
invoke ExitProcess,0
EvalBf endp
end EvalBf
2 Answers 2
mov eax,0 ; eax is BF data pointer mov ebx,0 ; ebx is BF source pointer mov ecx,0 ; ecx is loop depth
If you used the EDI
register for the BF data pointer and the ESI
register for the BF source pointer, not only would this be a more natural choice, you could also dismiss the push eax
and pop eax
around invoke StdOut, addr charBuf
.
By further using EBX
for the loop depth you can also eliminate the need for push ecx
and pop ecx
around the same invoke StdOut, addr charBuf
.
Do note that clearing a register is better done trough an xor
instruction:
xor edi, edi ; EDI is BF data pointer
xor esi, esi ; ESI is BF source pointer
xor ebx, ebx ; EBX is loop depth
cmp BYTE PTR bfsrc[ebx], 0 je done jmp processInstruction done:
Here's a clear opportunity to optimize the code. Instead of conditionally jumping to done you can use the opposite conditional jump and fall through. This saves an instruction:
cmp BYTE PTR bfsrc[ebx], 0
jne processInstruction
done:
mov eax, 0 printNext: cmp eax, 100 jge reset push eax ... pop eax inc eax jmp printNext
You've used a WHILE-loop here. A REPEAT-loop would have been more optimal. Also a better way to zero a register is by xor
-ing it with itself. Furthermore by using the EDI
register you eliminate the need for the push eax
and pop eax
:
xor edi, edi
printNext:
...
inc edi
cmp edi, 100
jl printNext
reset:
mov dl, BYTE PTR bfcells[eax] mov BYTE PTR charBuf[0], dl ; follow the character with null to terminate the string mov BYTE PTR charBuf[1],0
Use the movzx
here and shave off an instruction:
movzx edx, BYTE PTR bfcells[eax]
mov WORD PTR charBuf[0], dx ; follow the character with null to terminate the string
-
\$\begingroup\$ Great, thanks for the review! I've seen that "
xor
to clear" pattern used fairly widely; why is that preferred? Is it faster, or just clearer to those who are experienced in Assembly? Also, how do I choose between using the general-purpose registers and usingEDI
andESI
(how do you know that those ones are safe)? Finally, it looks like you indented the code in your revised snippets; is that a preferred practice in assembly, even if there isn't a inherent place to do it in the language (e.g. there are no braces)? \$\endgroup\$Wasabi Fan– Wasabi Fan2016年05月29日 23:30:18 +00:00Commented May 29, 2016 at 23:30 -
2\$\begingroup\$
xor reg,reg
is the preferred way to zero a register because it has a shorter encoding (2 bytes against 5 for themov reg,0
) and because the manufacturer (Intel) made it extra fast precisely for doing that. \$\endgroup\$Sep Roland– Sep Roland2016年05月30日 15:18:48 +00:00Commented May 30, 2016 at 15:18 -
\$\begingroup\$ Registers like
EDI
,ESI
andEBX
are non-volatile, meaning that the code that uses these must preserve them. This implies e.g. that the code that performsStdOut
will not change these registers. This allows you to have certain values in them without having topush
/pop
them around aninvoke StdOut ...
. \$\endgroup\$Sep Roland– Sep Roland2016年05月30日 15:23:07 +00:00Commented May 30, 2016 at 15:23 -
2\$\begingroup\$ Indenting the code is strongly adviced because it makes the code much more readable. How you indent is a matter of personal taste. At the very least I think the labels should stand out from the rest of the code. \$\endgroup\$Sep Roland– Sep Roland2016年05月30日 15:25:43 +00:00Commented May 30, 2016 at 15:25
Here are a number of things that may help you improve your code.
Eliminate "magic numbers"
There are a few numbers in the code, such as 200
and 100
that have a specific meaning in their particular context. By using named constants such as PROGRAM_MAX_LEN
or DATA_MAX_LEN
, the program becomes easier to read and maintain.
Avoid obsolete tools and habits
These days, it's unlikely your computer is actually running with a 80386 processor, so it probably doesn't make much sense to use the .386
directive at the top of the program. Similarly, it appears that you're using "masm32" which based around a 32-bit architecture. It probably makes more sense to use more modern tools and techniques such as using Microsoft's ml64
or the open source NASM
rather than the rather archaic masm32
with its late 1990s (and probably not licensed) version of Microsoft's MASM. If you're going to take the time to learn assembly language, it makes sense not to start with something that's already obsolete.
Examine the listing file
When you write assembly language code, it's often very useful to look at the resulting listing file which shows exactly how the assembler has encoded your instructions. For example, the code includes the following lines:
mov eax,0 ; eax is BF data pointer
mov ebx,0 ; ebx is BF source pointer
mov ecx,0 ; ecx is loop depth
Here's how they are encoded:
0000002C B8 00000000 mov eax,0 ; eax is BF data pointer
00000031 BB 00000000 mov ebx,0 ; ebx is BF source pointer
00000036 B9 00000000 mov ecx,0 ; ecx is loop depth
As you can see, each of those instructions is 5 bytes long for a total of 15 bytes for these three instructions. This helps to understand why some things are done the way they are, as in the following suggestion:
Prefer shorter instructions
The common idiom in x86 assembly language programming (and in some other assembly languages as well) is to use the xor
instruction to clear a register. So the three lines mentioned above could be written like this:
xor eax,eax ; eax is BF data pointer
xor ebx,ebx ; ebx is BF source pointer
xor ecx,ecx ; ecx is loop depth
The resulting listing looks like this:
0000002C 33 C0 xor eax,eax ; eax is BF data pointer
0000002E 33 DB xor ebx,ebx ; ebx is BF source pointer
00000030 33 C9 xor ecx,ecx ; ecx is loop depth
These are encoded as 2 bytes each for a total of 6 bytes saving 3 bytes per instruction or 9 bytes total. This should make it clear why xor
is preferred and why looking at the listing files is useful.
Avoid unecessary branching
At the bottom of the loop, we have this
jmp printNext
; when processing is complete, go back to the beginning and take new input
reset:
invoke printf, addr newline
invoke printf, addr hr
invoke printf, addr newline
jmp start
exit:
invoke ExitProcess,0
Note that the reset
code always jumps to start
and that the only way to get to reset
is via a branching instruction (because the unconditional jmp
preceeding it means that the code will never "fall through" to reset
).
We can rearrange this to eliminate the jmp start
by putting this code just above the start
label:
reset:
invoke printf, addr newline
invoke printf, addr hr
invoke printf, addr newline
;jmp start NO LONGER NEEDED
start::
Now the jmp
is eliminated since start
is the next location anyway. Note that start
now has two colons after it, making it a public symbol. This is necessary because the END
instruction must now be adjusted to be END start
(or alternatively, use the /entry:start
command line option). Also, I have used the external C function printf
instead of the masm32 things and started the labels at the left margin which is the most common style to format assembly language code.
In a similar vein, instead of this:
je done
jmp processInstruction
done:
write this:
jne processInstruction
done:
Think carefully about data ordering
In the lines quoted above, the hr
data is printed, surrounded by newline
. Further, the only time hr
is used, it's printed this way. This suggests a small optimization:
hr BYTE 10, 50 dup('-') ; fake horizontal rule (ends with newline below)
newline BYTE 10,0 ; ASCII 10 is \n
invoke printf, addr hr ; print newline, hr, newline
Now the hr
line contains the leading newline (which is only a single additional byte) but eliminates the need for a terminating 0
because it actually gets terminated with the newline
immediately below it. Thus the data portion of the code actually remains the same size but the code is reduced from three calls to one. If we look at the listing, it's a non-trivial savings. Here's how it was originally:
invoke printf, addr newline
00000117 68 0000014F R * push OFFSET newline
0000011C E8 00000000 E * call printf
00000121 83 C4 04 * add esp, 000000004h
invoke printf, addr hr
00000124 68 00000153 R * push OFFSET hr
00000129 E8 00000000 E * call printf
0000012E 83 C4 04 * add esp, 000000004h
invoke printf, addr newline
00000131 68 0000014F R * push OFFSET newline
00000136 E8 00000000 E * call printf
0000013B 83 C4 04 * add esp, 000000004h
0000013E E9 FFFFFEBD jmp start
The revised version (including the suggestion above) now looks like this:
invoke printf, addr hr
00000000 68 00000151 R * push OFFSET hr
00000005 E8 00000000 E * call printf
0000000A 83 C4 04 * add esp, 000000004h
;jmp start
It's important to realize that although the INVOKE
makes thing look small, there are actuallly several generated lines that result. The combination of these two suggestions saves 31 bytes plus the runtime overhead of two function calls.
Prefer registers to memory access
Register access is generally faster than memory access. For that reason, for fast, small and efficient assembly language code, we generally think very carefully about register use. One easy and obvious savings suggests itself when we look at the listing for the various cmp
instructions:
0000003B 80 BB 0000000A R cmp BYTE PTR bfsrc[ebx], '+'
2B
00000042 74 38 je plus
As you can see, each of the cmp
instructions both accesses memory and is 7 bytes long. We can do better:
mov al, BYTE PTR bfsrc[ebx]
cmp al, '+'
je plus
Here is what that looks like in the listing:
0000003F 8A 83 0000000A R mov al, BYTE PTR bfsrc[ebx]
00000045 3C 2B cmp al, '+'
00000047 74 1A je plus
Now we simply load the instruction into the al
register once and then compare it with the potential match. This saves 29 bytes of code. Also, I changed the code to use the edi
register rather than the eax
register for the BF data pointer.
Consider data-driven versus code-driven code
The essential job of the interpreter is to read an instruction and then execute the corresponding code for it. This suggests a table structure:
inst STRUC
token BYTE ?
codeptr DD ?
inst ENDS
bf inst {'+', plus}
inst {'-', minus}
inst {'>', fwd}
inst {'<', back}
inst {'[', open}
inst {']', close}
inst {'.', dot}
inst { 0, dump}
inst { 0, 0 } ; end of list
Here's the code to use that structure:
processInstruction:
; get next token
mov al, BYTE PTR bfsrc[ebx]
; reset the table pointer
lea esi, bf
; push return location
push processNextInstruction
nextinst:
cmp inst.token[esi], al
jne advance
jmp inst.codeptr[esi]
advance:
add esi, SIZEOF inst
cmp inst.codeptr[esi], 0
jne nextinst
pop esi ; throw away return value
processNextInstruction:
inc ebx
jmp processInstruction
EvalBf endp
Now each of the routines is made into a subroutine along these lines:
fwd PROC
inc edi
ret
fwd endp
open PROC
; push the current source position
; onto the loop stack
mov loopStack[ecx*4],ebx
inc ecx
ret
open ENDP
It works by first pushing the location of processNextInstruction
, then stepping through the table until it finds a matching token. When the matching token is found, such as fwd
above, then the code performs a jmp
to the subroutine and the ret
at the end of the subroutine causes the code to end up at processNextInstruction
.
Putting it all together
With most of those changes made (and a few more optimizations), here is what I get as a result:
.686
.model flat,stdcall
.stack 4096
includelib msvcrt
ExitProcess proto,dwExitCode:DWORD
printf proto C:VARARG
putchar proto C, char:DWORD
getchar proto C
scanf proto C:VARARG
PROGRAM_MAX_LEN equ 200
DATA_MAX_LEN equ 100
MAX_NESTED_LOOPS equ 5
.data
bfsrc BYTE PROGRAM_MAX_LEN dup(0) ; buffer to store source code
bfcells BYTE DATA_MAX_LEN dup(0) ; data array
; stores the position of the first instruction in the current loop.
loopStack DD MAX_NESTED_LOOPS dup(0)
.const
decfmt BYTE "%d ",0
scanfmt BYTE "%200s",0
prompt BYTE "$",0 ; input prompt string
hr BYTE 10, 50 dup('-') ; fake horizontal rule (ends with newline below)
newline BYTE 10,0 ; ASCII 10 is \n
inst STRUC
token BYTE ?
codeptr DD ?
inst ENDS
bf inst {'+', plus}
inst {'-', minus}
inst {'>', fwd}
inst {'<', back}
inst {'[', open}
inst {']', close}
inst {'.', dot}
inst {',', comma}
inst { 0, dump}
inst { 0, 0 } ; end of list
.code
EvalBf proc
; print the prompt and then read input into the source array
invoke printf, addr prompt
lea ebx,bfsrc ; ebx is BF source pointer
invoke scanf, addr scanfmt, ebx
lea edi,bfcells ; edi is BF data pointer
xor ecx,ecx ; ecx is loop depth
dec ebx
processNextInstruction:
inc ebx
processInstruction:
; jump according to current source char
mov al, BYTE PTR [ebx]
lea esi, bf
; push return location
push processNextInstruction
nextinst:
cmp inst.token[esi], al
jne advance
jmp inst.codeptr[esi]
advance:
add esi, SIZEOF inst
cmp inst.codeptr[esi], 0
jne nextinst
ret ; actually goes to `processNextInstruction`
EvalBf endp
dump PROC
pop esi ; throw away return value
push EvalBf ; redirect to reset
; loop through every value in the BF data array and print it
invoke printf, addr newline
lea edi, bfcells
printNext:
; print the buffer, followed by a space
movzx edx, BYTE PTR [edi]
invoke printf, addr decfmt, edx
; advance the cell pointer
inc edi
; the data array is 100 cells long, so stop looping when we hit cell 100
cmp edi, OFFSET bfcells + DATA_MAX_LEN
jb printNext
; when processing is complete, go back to the beginning and take new input
invoke printf, addr hr
ret
dump ENDP
exit PROC
invoke ExitProcess,0
exit ENDP
plus PROC
inc BYTE PTR [edi]
ret
plus ENDP
minus PROC
dec BYTE PTR [edi]
ret
minus ENDP
fwd PROC
inc edi
ret
fwd endp
back PROC
dec edi
ret
back ENDP
open PROC
; push the current source position
; onto the loop stack
mov loopStack[ecx*4],ebx
inc ecx
ret
open ENDP
close PROC
dec ecx
cmp BYTE PTR [edi], 0
; break out of loop if data cell is 0
je endloop
; pop the innermost loop position and
; set it as the next instruction
mov ebx,loopStack[ecx*4]
inc ecx
endloop:
ret
close ENDP
dot PROC
movzx eax, BYTE PTR [edi]
invoke putchar, eax
ret
dot ENDP
comma PROC
invoke getchar
mov BYTE PTR [edi], al
ret
comma ENDP
end EvalBf
There are more things that could be improved, such as maximizing register use, but this should get you started.
For reference, I used Visual Studio 12.0. After setting the environment variables using vcvarsall.bat
as supplied by Microsoft, the command line to assemble the code was:
ml /Fm /Fl /Sa bf.asm /link /subsystem:console /defaultlib:kernel32.lib /defaultlib:user32.lib
For comparison, the segment sizess of both the original code (modified minimally to run without masm32) and the modified code above were:
old code new code CONST 0 115 _DATA 392 320 CONST+_DATA 392 435 _TEXT 330 200 total 722 635
So the result is that the new code is 87 bytes shorter (about 10%) and actually includes code for all 8 of the instruction tokens, so it's actually both shorter and more functional than the original.
-
\$\begingroup\$ I had used the MASM32 libs because I couldn't persuade Visual Studio to build my code when using
printf
and the other C runtime functions. Are you building from within Visual Studio or just editing it there and then assembling from the command line? \$\endgroup\$Wasabi Fan– Wasabi Fan2016年05月30日 18:58:01 +00:00Commented May 30, 2016 at 18:58 -
1