.globl inet_addr_asm
inet_addr_asm:
movq 0ドルxff00000000000000, %r8
movq 0ドルx3000000000000000, %r9 # load ASCII '0' into register nine
movq 10,ドル %r10
movq 100,ドル %r11
movq 8(%rdi), %rsi # load eight byes from string address
shlq 8,ドル %rsi # get rid of prefixed '.' (if any)
movq %rsi, %rcx # load string into %rcx
andq %r8, %rcx # get first byte of string in %rcx
addq %rcx, %rcx
jnz found_nonzero
shlq 8,ドル %rsi
movq %rsi, %rcx
andq %r8, %rcx
addq %rcx, %rcx
jnz found_nonzero
shlq 8,ドル %rsi
movq %rsi, %rcx
andq %r8, %rcx
addq %rcx, %rcx
jnz found_nonzero
shlq 8,ドル %rsi
movq %rsi, %rcx
andq %r8, %rcx
addq %rcx, %rcx
jnz found_nonzero
shlq 8,ドル %rsi
movq %rsi, %rcx
andq %r8, %rcx
addq %rcx, %rcx
jnz found_nonzero
shlq 8,ドル %rsi
movq %rsi, %rcx
andq %r8, %rcx
addq %rcx, %rcx
jnz found_nonzero
shlq 8,ドル %rsi
movq %rsi, %rcx
andq %r8, %rcx
addq %rcx, %rcx
jnz found_nonzero
jmp all_zeros
found_nonzero:
slr0:
movq %rsi, %rcx
shlq 8,ドル %rsi
andq %r8, %rcx
subq %r9, %rcx
shrq 32,ドル %rcx
movq %rsi, %rdx
addq %rdx, %rdx
jnz slr0_nlr1
movq (%rdi), %rsi
jmp nlr0_1
slr0_nlr1:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb slr1
mulq %r10
shrq 32,ドル %rax
addq %rax, %rcx
movq %rsi, %rdx
addq %rdx, %rdx
jnz slr0_nlr2
movq (%rdi), %rsi
jmp nlr0_2
slr0_nlr2:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb slr1
mulq %r11
shrq 32,ドル %rax
addq %rax, %rcx
shlq 8,ドル %rsi
slr1:
movq %rsi, %rdx
addq %rdx, %rdx
jnz slr1_nlr0
movq (%rdi), %rsi
jmp nlr1_0
slr1_nlr0:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb slr2
shrq 40,ドル %rax
addq %rax, %rcx
movq %rsi, %rdx
addq %rdx, %rdx
jnz slr1_nlr1
movq (%rdi), %rsi
jmp nlr0_1
slr1_nlr1:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb slr2
mulq %r10
shrq 40,ドル %rax
addq %rax, %rcx
movq %rsi, %rdx
addq %rdx, %rdx
jnz slr1_nlr2
movq (%rdi), %rsi
jmp nlr1_2
slr1_nlr2:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb slr2
mulq %r11
shrq 40,ドル %rax
addq %rax, %rcx
shlq 8,ドル %rsi
slr2:
movq %rsi, %rdx
addq %rdx, %rdx
jnz slr2_nlr0
movq (%rdi), %rsi
jmp nlr2_0
slr2_nlr0:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb nlr3
shrq 48,ドル %rax
addq %rax, %rcx
jmp nlr2_1
nlr0_1:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb nlr1
mulq %r10
shrq 32,ドル %rax
addq %rax, %rcx
nlr0_2:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb nlr1
mulq %r11
shrq 32,ドル %rax
addq %rax, %rcx
shlq 8,ドル %rsi
nlr1:
nlr1_0:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb nlr2
shrq 40,ドル %rax
addq %rax, %rcx
nlr1_1:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb nlr2
mulq %r10
shrq 40,ドル %rax
addq %rax, %rcx
nlr1_2:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb nlr2
mulq %r11
shrq 40,ドル %rax
addq %rax, %rcx
shlq 8,ドル %rsi
nlr2:
nlr2_0:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb nlr2
shrq 48,ドル %rax
addq %rax, %rcx
nlr2_1:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb nlr3
mulq %r10
shrq 48,ドル %rax
addq %rax, %rcx
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
jb nlr3
mulq %r11
shrq 48,ドル %rax
addq %rax, %rcx
shlq 8,ドル %rsi
nlr3:
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
shrq 56,ドル %rax
addq %rax, %rcx
movq %rsi, %rdx
addq %rdx, %rdx
jz nlr_end
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
mulq %r10
shrq 56,ドル %rax
addq %rax, %rcx
movq %rsi, %rdx
addq %rdx, %rdx
jz nlr_end
movq %rsi, %rax
shlq 8,ドル %rsi
andq %r8, %rax
subq %r9, %rax
mulq %r11
shrq 56,ドル %rax
addq %rax, %rcx
nlr_end:
movq %rcx, %rax
ret
all_zeros:
movq (%rdi), %rsi
movq 0ドルx0030003000300030, %r9
subq %r9, %rsi
shrq 8,ドル %r8
movq %rsi, %rax
andq %r8, %rax
shrq 24,ドル %rax
movq %rsi, %rcx
shrq 16,ドル %r8
andq %r8, %rcx # 0x3000300030003000
shrq 16,ドル %rcx
orq %rcx, %rax
movq %rsi, %rcx
shrq 16,ドル %r8
andq %r8, %rcx
shrq 8,ドル %rcx
orq %rcx, %rax
shrq 16,ドル %r8
andq %r8, %rsi
orq %rsi, %rax
ret
Accepts one argument, the pointer to the string. The string is assumed to be zero-padded to 16 bytes. The code is in amd64 assembly, uses no vendor specific processor things AFAIK. Although I did not extensively test it it seems to be working. Any advice is appreciated.
-
\$\begingroup\$ You don't say which platform or ABI. Is this for Windows? What's the presumed content of registers when calling this? \$\endgroup\$Edward– Edward2016年03月02日 13:47:36 +00:00Commented Mar 2, 2016 at 13:47
-
\$\begingroup\$ %rdi contains the pointer to the IP address string. ABI is Linux. Content of registers are irrelevant because I overwrite them before using them anyway. \$\endgroup\$user69874– user698742016年03月02日 15:24:30 +00:00Commented Mar 2, 2016 at 15:24
1 Answer 1
I see a number of things that may help you improve your program
Comment your code
Only five comments in an over 300-line assembly language routine makes understanding this code much much harder than it should be. Add comments to your code to explain what the code is doing and why.
Use better labels
Labels like nlr1_1
don't mean much to me. Either a comment (mentioned above) or a better label name would help a great deal.
Optimize jumps
The code currently contains this code:
jnz found_nonzero
jmp all_zeros
found_nonzero:
This could easily be replaced instead with this:
jz all_zeros
found_nonzero:
Know your instruction set
The code contains multiple repetitions of this sequence:
shlq 8,ドル %rsi
movq %rsi, %rcx
andq %r8, %rcx
addq %rcx, %rcx
jnz found_nonzero
However, the andq
already sets the flags register (include the Z flag) so the addq
instruction is never needed. That's good, but we can do still better. All this really does is set the Z flag based on the high byte of %rsi
. We don't really need or use the resulting value in %rcx
, so we can use the test
instruction to set the flags without altering any registers:
shlq 8,ドル %rsi
testq %rsi, %r8
jnz found_nonzero
Don't use a special case
The program has a special case for all_zeros
that seems to be an attempt to handle the case in which the trailing 8 bytes of the ASCII string are all zeroes, but it's not really necessary and it introduces a subtle bug. If you call the routine with the string "8.7.4.140円0円0円0円0円0円0円0円"
the program incorrectly returns a value of 0x01040708
(it should instead be 0x0e040708
). It would be better to simply avoid the special case which adds both more code and a bug in this code, and process all strings identically. Instead, the last few lines above the found_nonzero
label could be this:
jnz found_nonzero
movq (%rdi), %rsi
testq %rsi, %r8
jnz found_nonzero
shlq 8,ドル %rsi
found_nonzero:
And all of the code from all_zeros
to the end of the program can simply be deleted.
Use conventional indenting
The convention for assembly language programs is to have the program labels non-indented and to have instructions indented. This makes it much easier to see labels in the program.
Consider an alternative algorithm
Right now there is a lot of duplication and nearly identical code. The code could be greatly simplified by a change in algorithm. In C, it could be coded like this:
unsigned inet_addr(const char *str)
{
unsigned accum = 0;
unsigned shiftval = 0;
unsigned num = 0;
do {
unsigned val = *str++;
if (val == '.' || val == 0) {
accum |= (num << shiftval);
shiftval += 8;
num = 0;
} else {
num = 10*num + (val - '0');
}
} while (shiftval < 32);
return accum;
}
I trust that you can reliably turn this into the corresponding assembly language code. Here's how I did it:
.globl inet_addr_asm2
# converts a C-string containing an ASCII representation of an
# IPv4 address to a network-order 32-bit number.
#
# INPUTS:
# rdi = pointer to C string such as "192.168.100.3"
# OUTPUTS:
# rax = corresponding 32-bit value, such as 0x0364a8c0
# TRASHES:
# rbx, cl, rdx, rdi, rsi, r8
#
# callable from C under x64 Linux ABI as with prototype:
# unsigned inet_addr_asm2(const char *addr);
#
inet_addr_asm2:
xorq %rsi, %rsi # rsi = accum = 0
xorb %cl, %cl # cl = shiftval = 0
xorq %rax, %rax # rax = num = 0
movq 10,ドル %r8 # r8 = constant 10
loop_top:
xorq %rbx, %rbx # clear high bits of rbx
movb (%rdi), %bl # load just one byte
inc %rdi # increment pointer
subq $'0', %rbx # subtract '0'
jb new_digit # jump if it's < '0' (e.g. '.' or NUL)
mul %r8 # rdx:rax = 10 * num
addq %rbx, %rax # num = 10 * num + (val - '0')
jmp loop_top # keep fetching chars
new_digit:
shlq %cl, %rax # num <<= shiftval
orq %rax, %rsi # accum |= (num << shiftval);
xorq %rax, %rax # num = 0
add 8,ドル %cl # shiftval += 8
loop_bottom:
cmp 32,ドル %cl # keep going while shiftval < 32
jb loop_top
mov %rsi, %rax # return accum
ret
This code is 58 bytes. That's less than 10% of the size of the original and it doesn't include the "special case" bug mentioned above.
-
\$\begingroup\$ I apologize if my questions seem trivial. I think I heard somewhere that loading eight bytes into registers at a time is more efficient than loading a single byte. What is the rationale behind loading a single byte at a time? Also, since the loop is a fixed length loop, is it better(in terms of performance) to simply use constants and repeat the code four times? \$\endgroup\$user69874– user698742016年03月02日 20:32:49 +00:00Commented Mar 2, 2016 at 20:32
-
1\$\begingroup\$ In general, I tend to try to write code that is correct and easy to understand (and maintain). Only when I have that do I consider optimizing for speed or space. Then if I do decide to optimize for either, I measure the results. You could learn a lot about real performance by measuring the speed on your own machine and trying experiments to see which things actually make it faster. \$\endgroup\$Edward– Edward2016年03月02日 20:40:33 +00:00Commented Mar 2, 2016 at 20:40
Explore related questions
See similar questions with these tags.