Parsing a ASCII dotted-decimal IPv4 IP into a network byte order integer

Question 1

.globl inet_addr_asm
inet_addr_asm:
 movq 0ドルxff00000000000000, %r8
 movq 0ドルx3000000000000000, %r9 # load ASCII '0' into register nine
 movq 10,ドル %r10
 movq 100,ドル %r11
 movq 8(%rdi), %rsi # load eight byes from string address
 shlq 8,ドル %rsi # get rid of prefixed '.' (if any)
 movq %rsi, %rcx # load string into %rcx
 andq %r8, %rcx # get first byte of string in %rcx
 addq %rcx, %rcx
 jnz found_nonzero
 shlq 8,ドル %rsi
 movq %rsi, %rcx
 andq %r8, %rcx
 addq %rcx, %rcx
 jnz found_nonzero
 shlq 8,ドル %rsi
 movq %rsi, %rcx
 andq %r8, %rcx
 addq %rcx, %rcx
 jnz found_nonzero
 shlq 8,ドル %rsi
 movq %rsi, %rcx
 andq %r8, %rcx
 addq %rcx, %rcx
 jnz found_nonzero
 shlq 8,ドル %rsi
 movq %rsi, %rcx
 andq %r8, %rcx
 addq %rcx, %rcx
 jnz found_nonzero
 shlq 8,ドル %rsi
 movq %rsi, %rcx
 andq %r8, %rcx
 addq %rcx, %rcx
 jnz found_nonzero
 shlq 8,ドル %rsi
 movq %rsi, %rcx
 andq %r8, %rcx
 addq %rcx, %rcx
 jnz found_nonzero
 jmp all_zeros
 found_nonzero:
 slr0:
 movq %rsi, %rcx
 shlq 8,ドル %rsi
 andq %r8, %rcx
 subq %r9, %rcx
 shrq 32,ドル %rcx
 movq %rsi, %rdx
 addq %rdx, %rdx
 jnz slr0_nlr1
 movq (%rdi), %rsi
 jmp nlr0_1
 slr0_nlr1:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb slr1
 mulq %r10
 shrq 32,ドル %rax
 addq %rax, %rcx
 movq %rsi, %rdx
 addq %rdx, %rdx
 jnz slr0_nlr2
 movq (%rdi), %rsi
 jmp nlr0_2
 slr0_nlr2:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb slr1
 mulq %r11
 shrq 32,ドル %rax
 addq %rax, %rcx
 shlq 8,ドル %rsi
 slr1:
 movq %rsi, %rdx
 addq %rdx, %rdx
 jnz slr1_nlr0
 movq (%rdi), %rsi
 jmp nlr1_0
 slr1_nlr0:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb slr2
 shrq 40,ドル %rax
 addq %rax, %rcx
 movq %rsi, %rdx
 addq %rdx, %rdx
 jnz slr1_nlr1
 movq (%rdi), %rsi
 jmp nlr0_1
 slr1_nlr1:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb slr2
 mulq %r10
 shrq 40,ドル %rax
 addq %rax, %rcx
 movq %rsi, %rdx
 addq %rdx, %rdx
 jnz slr1_nlr2
 movq (%rdi), %rsi
 jmp nlr1_2
 slr1_nlr2:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb slr2
 mulq %r11
 shrq 40,ドル %rax
 addq %rax, %rcx
 shlq 8,ドル %rsi
 slr2:
 movq %rsi, %rdx
 addq %rdx, %rdx
 jnz slr2_nlr0
 movq (%rdi), %rsi
 jmp nlr2_0
 slr2_nlr0:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb nlr3
 shrq 48,ドル %rax
 addq %rax, %rcx
 jmp nlr2_1
 nlr0_1:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb nlr1
 mulq %r10
 shrq 32,ドル %rax
 addq %rax, %rcx
 nlr0_2:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb nlr1
 mulq %r11
 shrq 32,ドル %rax
 addq %rax, %rcx
 shlq 8,ドル %rsi
 nlr1:
 nlr1_0:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb nlr2
 shrq 40,ドル %rax
 addq %rax, %rcx
 nlr1_1:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb nlr2
 mulq %r10
 shrq 40,ドル %rax
 addq %rax, %rcx
 nlr1_2:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb nlr2
 mulq %r11
 shrq 40,ドル %rax
 addq %rax, %rcx
 shlq 8,ドル %rsi
 nlr2:
 nlr2_0:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb nlr2
 shrq 48,ドル %rax
 addq %rax, %rcx
 nlr2_1:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb nlr3
 mulq %r10
 shrq 48,ドル %rax
 addq %rax, %rcx
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 jb nlr3
 mulq %r11
 shrq 48,ドル %rax
 addq %rax, %rcx
 shlq 8,ドル %rsi
 nlr3:
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 shrq 56,ドル %rax
 addq %rax, %rcx
 movq %rsi, %rdx
 addq %rdx, %rdx
 jz nlr_end
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 mulq %r10
 shrq 56,ドル %rax
 addq %rax, %rcx
 movq %rsi, %rdx
 addq %rdx, %rdx
 jz nlr_end
 movq %rsi, %rax
 shlq 8,ドル %rsi
 andq %r8, %rax
 subq %r9, %rax
 mulq %r11
 shrq 56,ドル %rax
 addq %rax, %rcx
 nlr_end:
 movq %rcx, %rax
 ret
 all_zeros:
 movq (%rdi), %rsi
 movq 0ドルx0030003000300030, %r9
 subq %r9, %rsi
 shrq 8,ドル %r8
 movq %rsi, %rax
 andq %r8, %rax
 shrq 24,ドル %rax
 movq %rsi, %rcx
 shrq 16,ドル %r8
 andq %r8, %rcx # 0x3000300030003000
 shrq 16,ドル %rcx
 orq %rcx, %rax
 movq %rsi, %rcx
 shrq 16,ドル %r8
 andq %r8, %rcx
 shrq 8,ドル %rcx
 orq %rcx, %rax
 shrq 16,ドル %r8
 andq %r8, %rsi
 orq %rsi, %rax
 ret

Accepts one argument, the pointer to the string. The string is assumed to be zero-padded to 16 bytes. The code is in amd64 assembly, uses no vendor specific processor things AFAIK. Although I did not extensively test it it seems to be working. Any advice is appreciated.

Question 2

You don't say which platform or ABI. Is this for Windows? What's the presumed content of registers when calling this?

Question 3

%rdi contains the pointer to the IP address string. ABI is Linux. Content of registers are irrelevant because I overwrite them before using them anyway.

Question 4

I see a number of things that may help you improve your program

Comment your code

Only five comments in an over 300-line assembly language routine makes understanding this code much much harder than it should be. Add comments to your code to explain what the code is doing and why.

Use better labels

Labels like nlr1_1 don't mean much to me. Either a comment (mentioned above) or a better label name would help a great deal.

Optimize jumps

The code currently contains this code:

 jnz found_nonzero
 jmp all_zeros
found_nonzero:

This could easily be replaced instead with this:

 jz all_zeros
found_nonzero:

Know your instruction set

The code contains multiple repetitions of this sequence:

shlq 8,ドル %rsi
movq %rsi, %rcx
andq %r8, %rcx 
addq %rcx, %rcx
jnz found_nonzero

However, the andq already sets the flags register (include the Z flag) so the addq instruction is never needed. That's good, but we can do still better. All this really does is set the Z flag based on the high byte of %rsi. We don't really need or use the resulting value in %rcx, so we can use the test instruction to set the flags without altering any registers:

shlq 8,ドル %rsi
testq %rsi, %r8
jnz found_nonzero

Don't use a special case

The program has a special case for all_zeros that seems to be an attempt to handle the case in which the trailing 8 bytes of the ASCII string are all zeroes, but it's not really necessary and it introduces a subtle bug. If you call the routine with the string "8.7.4.140円0円0円0円0円0円0円0円" the program incorrectly returns a value of 0x01040708 (it should instead be 0x0e040708). It would be better to simply avoid the special case which adds both more code and a bug in this code, and process all strings identically. Instead, the last few lines above the found_nonzero label could be this:

 jnz found_nonzero
 movq (%rdi), %rsi
 testq %rsi, %r8
 jnz found_nonzero
 shlq 8,ドル %rsi
found_nonzero:

And all of the code from all_zeros to the end of the program can simply be deleted.

Use conventional indenting

The convention for assembly language programs is to have the program labels non-indented and to have instructions indented. This makes it much easier to see labels in the program.

Consider an alternative algorithm

Right now there is a lot of duplication and nearly identical code. The code could be greatly simplified by a change in algorithm. In C, it could be coded like this:

unsigned inet_addr(const char *str)
{
 unsigned accum = 0;
 unsigned shiftval = 0;
 unsigned num = 0;
 do {
 unsigned val = *str++;
 if (val == '.' || val == 0) {
 accum |= (num << shiftval);
 shiftval += 8;
 num = 0;
 } else {
 num = 10*num + (val - '0');
 }
 } while (shiftval < 32);
 return accum;
}

I trust that you can reliably turn this into the corresponding assembly language code. Here's how I did it:

.globl inet_addr_asm2
# converts a C-string containing an ASCII representation of an 
# IPv4 address to a network-order 32-bit number. 
# 
# INPUTS:
# rdi = pointer to C string such as "192.168.100.3"
# OUTPUTS:
# rax = corresponding 32-bit value, such as 0x0364a8c0
# TRASHES:
# rbx, cl, rdx, rdi, rsi, r8
# 
# callable from C under x64 Linux ABI as with prototype:
# unsigned inet_addr_asm2(const char *addr);
#
inet_addr_asm2:
 xorq %rsi, %rsi # rsi = accum = 0
 xorb %cl, %cl # cl = shiftval = 0
 xorq %rax, %rax # rax = num = 0
 movq 10,ドル %r8 # r8 = constant 10
loop_top:
 xorq %rbx, %rbx # clear high bits of rbx
 movb (%rdi), %bl # load just one byte
 inc %rdi # increment pointer
 subq $'0', %rbx # subtract '0'
 jb new_digit # jump if it's < '0' (e.g. '.' or NUL)
 mul %r8 # rdx:rax = 10 * num
 addq %rbx, %rax # num = 10 * num + (val - '0')
 jmp loop_top # keep fetching chars
new_digit:
 shlq %cl, %rax # num <<= shiftval 
 orq %rax, %rsi # accum |= (num << shiftval);
 xorq %rax, %rax # num = 0
 add 8,ドル %cl # shiftval += 8 
loop_bottom:
 cmp 32,ドル %cl # keep going while shiftval < 32
 jb loop_top
 mov %rsi, %rax # return accum
 ret

This code is 58 bytes. That's less than 10% of the size of the original and it doesn't include the "special case" bug mentioned above.

Question 5

I apologize if my questions seem trivial. I think I heard somewhere that loading eight bytes into registers at a time is more efficient than loading a single byte. What is the rationale behind loading a single byte at a time? Also, since the loop is a fixed length loop, is it better(in terms of performance) to simply use constants and repeat the code four times?

Question 6

In general, I tend to try to write code that is correct and easy to understand (and maintain). Only when I have that do I consider optimizing for speed or space. Then if I do decide to optimize for either, I measure the results. You could learn a lot about real performance by measuring the speed on your own machine and trying experiments to see which things actually make it faster.

Edward Edward 67.2k4 gold badges120 silver badges284 bronze badges · Accepted Answer · 2016-03-02 17:10:32Z

I see a number of things that may help you improve your program

Comment your code

Only five comments in an over 300-line assembly language routine makes understanding this code much much harder than it should be. Add comments to your code to explain what the code is doing and why.

Use better labels

Labels like nlr1_1 don't mean much to me. Either a comment (mentioned above) or a better label name would help a great deal.

Optimize jumps

The code currently contains this code:

 jnz found_nonzero
 jmp all_zeros
found_nonzero:

This could easily be replaced instead with this:

 jz all_zeros
found_nonzero:

Know your instruction set

The code contains multiple repetitions of this sequence:

shlq 8,ドル %rsi
movq %rsi, %rcx
andq %r8, %rcx 
addq %rcx, %rcx
jnz found_nonzero

However, the andq already sets the flags register (include the Z flag) so the addq instruction is never needed. That's good, but we can do still better. All this really does is set the Z flag based on the high byte of %rsi. We don't really need or use the resulting value in %rcx, so we can use the test instruction to set the flags without altering any registers:

shlq 8,ドル %rsi
testq %rsi, %r8
jnz found_nonzero

Don't use a special case

The program has a special case for all_zeros that seems to be an attempt to handle the case in which the trailing 8 bytes of the ASCII string are all zeroes, but it's not really necessary and it introduces a subtle bug. If you call the routine with the string "8.7.4.140円0円0円0円0円0円0円0円" the program incorrectly returns a value of 0x01040708 (it should instead be 0x0e040708). It would be better to simply avoid the special case which adds both more code and a bug in this code, and process all strings identically. Instead, the last few lines above the found_nonzero label could be this:

 jnz found_nonzero
 movq (%rdi), %rsi
 testq %rsi, %r8
 jnz found_nonzero
 shlq 8,ドル %rsi
found_nonzero:

And all of the code from all_zeros to the end of the program can simply be deleted.

Use conventional indenting

The convention for assembly language programs is to have the program labels non-indented and to have instructions indented. This makes it much easier to see labels in the program.

Consider an alternative algorithm

Right now there is a lot of duplication and nearly identical code. The code could be greatly simplified by a change in algorithm. In C, it could be coded like this:

unsigned inet_addr(const char *str)
{
 unsigned accum = 0;
 unsigned shiftval = 0;
 unsigned num = 0;
 do {
 unsigned val = *str++;
 if (val == '.' || val == 0) {
 accum |= (num << shiftval);
 shiftval += 8;
 num = 0;
 } else {
 num = 10*num + (val - '0');
 }
 } while (shiftval < 32);
 return accum;
}

I trust that you can reliably turn this into the corresponding assembly language code. Here's how I did it:

.globl inet_addr_asm2
# converts a C-string containing an ASCII representation of an 
# IPv4 address to a network-order 32-bit number. 
# 
# INPUTS:
# rdi = pointer to C string such as "192.168.100.3"
# OUTPUTS:
# rax = corresponding 32-bit value, such as 0x0364a8c0
# TRASHES:
# rbx, cl, rdx, rdi, rsi, r8
# 
# callable from C under x64 Linux ABI as with prototype:
# unsigned inet_addr_asm2(const char *addr);
#
inet_addr_asm2:
 xorq %rsi, %rsi # rsi = accum = 0
 xorb %cl, %cl # cl = shiftval = 0
 xorq %rax, %rax # rax = num = 0
 movq 10,ドル %r8 # r8 = constant 10
loop_top:
 xorq %rbx, %rbx # clear high bits of rbx
 movb (%rdi), %bl # load just one byte
 inc %rdi # increment pointer
 subq $'0', %rbx # subtract '0'
 jb new_digit # jump if it's < '0' (e.g. '.' or NUL)
 mul %r8 # rdx:rax = 10 * num
 addq %rbx, %rax # num = 10 * num + (val - '0')
 jmp loop_top # keep fetching chars
new_digit:
 shlq %cl, %rax # num <<= shiftval 
 orq %rax, %rsi # accum |= (num << shiftval);
 xorq %rax, %rax # num = 0
 add 8,ドル %cl # shiftval += 8 
loop_bottom:
 cmp 32,ドル %cl # keep going while shiftval < 32
 jb loop_top
 mov %rsi, %rax # return accum
 ret

This code is 58 bytes. That's less than 10% of the size of the original and it doesn't include the "special case" bug mentioned above.

I apologize if my questions seem trivial. I think I heard somewhere that loading eight bytes into registers at a time is more efficient than loading a single byte. What is the rationale behind loading a single byte at a time? Also, since the loop is a fixed length loop, is it better(in terms of performance) to simply use constants and repeat the code four times?
In general, I tend to try to write code that is correct and easy to understand (and maintain). Only when I have that do I consider optimizing for speed or space. Then if I do decide to optimize for either, I measure the results. You could learn a lot about real performance by measuring the speed on your own machine and trying experiments to see which things actually make it faster.

Stack Exchange Network

Parsing a ASCII dotted-decimal IPv4 IP into a network byte order integer

1 Answer 1

Comment your code

Use better labels

Optimize jumps

Know your instruction set

Don't use a special case

Use conventional indenting

Consider an alternative algorithm

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Parsing a ASCII dotted-decimal IPv4 IP into a network byte order integer

1 Answer 1

Comment your code

Use better labels

Optimize jumps

Know your instruction set

Don't use a special case

Use conventional indenting

Consider an alternative algorithm

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions