Re: problems with memory allocation and the alignment check
From: Michael J. Baars
Date: Wed Feb 24 2021 - 02:07:44 EST
On Mon, 2021-02-22 at 01:41 -0800, Andrew Pinski wrote:
>
On Mon, Feb 22, 2021 at 1:37 AM Michael J. Baars
>
<mjbaars1977.gcc@xxxxxxxxxxxxx> wrote:
>
> On Mon, 2021-02-22 at 01:29 -0800, Andrew Pinski wrote:
>
> > On Mon, Feb 22, 2021 at 1:17 AM Michael J. Baars
>
> > <mjbaars1977.gcc@xxxxxxxxxxxxx> wrote:
>
> > > Hi,
>
> > >
>
> > > I just wrote this little program to demonstrate a possible flaw in both malloc and calloc.
>
> > >
>
> > > If I allocate the simplest memory region from main(), one out of three optimization flags fails.
>
> > > If I allocate the same region from a function, three out of three optimization flags fail.
>
> > >
>
> > > Does someone know if this really is a flaw, and if so, is it a gcc or a kernel flaw?
>
> >
>
> > There is no flaw. GCC (kernel, glibc) all assume unaligned accesses
>
> > on x86 will not cause an exception.
>
>
>
> Is this just an assumption or more like a fact? I agree with you that byte aligned is more or less the same as unaligned.
>
>
It is an assumption that is even made inside GCC. You can modify GCC
>
not to assume that but you need to recompile all libraries and even
>
check the assembly code that is included with most programs.
>
Why are you enabling the alignment access check anyways? What are you
>
trying to do?
>
If you are looking into a performance issue with unaligned accesses,
>
may I suggest you look into perf to see if you can see unaligned
>
accesses?
Next to performance and correctness, I always try to keep in mind that every clock cycle will eventually end up on the energy bill, to avoid that computers cost
ten times more on the energy bill than they do in the store.
If you look at the power consumption of the PlayStation 1 vs. that of the PlayStation 3, for example, you will see that the PlayStation 1 uses 10 W (10 W / 240 V
= 0.041666667 A max), while the PlayStation 3 consumes 240 V * 1.7 A = 408 W. More than 40 times as much energy!!!
Code and style always go hand in hand. Try to keep your code as sleek as possible and you will see that even an old computer can do a lot more than you ever
thought possible :)
Thanks,
Mischa.
>
Thanks,
>
Andrew
>
>
> > Thanks,
>
> > Andrew
>
> >
>
> > > Regards,
>
> > > Mischa.
#include <stdint.h>
#include "compression.h"
/*
 * Identity byte table: element k holds the value k, for k = 0 .. 255.
 * Laid out sixteen values per row, so row r covers 0xr0 .. 0xrf.
 */
uint8_t data_s[256] = {
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
/*
0000000000000000 <compression_encode_prepare1>:
0: 48 89 f9 mov %rdi,%rcx
3: 31 d2 xor %edx,%edx
5: b8 00 00 00 01 mov $0x1000000,%eax
a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
10: 48 83 e8 01 sub $0x1,%rax
14: 75 fa jne 10 <compression_encode_prepare1+0x10>
16: 88 11 mov %dl,(%rcx)
18: 48 83 c2 01 add $0x1,%rdx
1c: 48 83 c1 01 add $0x1,%rcx
20: 48 81 fa 00 01 00 00 cmp $0x100,%rdx
27: 75 dc jne 5 <compression_encode_prepare1+0x5>
29: c3 retq
2a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
*/
/*
 * Fill c->data_t with the identity mapping (data_t[k] = k for k = 0 .. 255).
 * The fill is repeated 2^24 times; every pass writes the same values.
 */
void compression_encode_prepare1 (struct compression* c)
{
	uint64_t passes_left = (uint64_t) 1 << 24;

	while (passes_left-- > 0)
	{
		uint8_t *slot = c->data_t;

		for (uint64_t value = 0; value < 256; value++)
		{
			/* value is always < 256, so the narrowing store is lossless */
			*slot++ = (uint8_t) value;
		}
	}
}
/*
 * Copy the 256-byte table data_s into c->data_t using `rep movsq`,
 * repeating the copy 2^24 times.
 *
 * x86-64 only, GNU extended inline assembly (AT&T syntax):
 *   - RDI <- address of c->data_t (destination)
 *   - RSI <- address of data_s    (source)
 *   - RCX <- 0x20 = 32 quadwords  (32 * 8 = 256 bytes)
 * RCX, RSI and RDI are listed as clobbers because `rep movsq` modifies them.
 *
 * NOTE(review): `rep movsq` copies forward only while the direction flag is
 * clear; this relies on the calling convention leaving DF = 0 -- confirm for
 * the target ABI.
 */
void compression_encode_prepare2 (struct compression* c)
{
	for (uint64_t j = 0; j < (1 << 24); j++)
	{
		asm volatile
		(
			"	lea	%0    , %%rdi	\n"
			"	lea	%1    , %%rsi	\n"
			/* AT&T immediates are spelled with a leading '$' */
			"	mov	$0x20 , %%rcx	\n"
			"	rep	movsq		\n"
			: "=m" (c->data_t)	/* whole array is written */
			: "m"  (  data_s)	/* whole array is read    */
			: "%rcx", "%rsi", "%rdi"
		);
	}
}
/*
 * Include guard. Identifiers containing a double underscore are reserved for
 * the implementation, so the guard uses a plain name.
 */
#ifndef COMPRESSION_H
#define COMPRESSION_H
#include <stdint.h>
/* State filled in by the compression_encode_prepare*() routines. */
struct compression
{
	uint8_t data_t[256]; // compression tree indices
};
/*
 * Fill c->data_t with the identity mapping (data_t[k] = k), written with a
 * plain C loop. The fill is repeated 2^24 times.
 */
extern void compression_encode_prepare1 (struct compression* c);
/*
 * Copy the 256-byte table data_s into c->data_t using x86-64 inline assembly
 * (`rep movsq`). The copy is repeated 2^24 times.
 */
extern void compression_encode_prepare2 (struct compression* c);
#endif /* COMPRESSION_H */
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include "compression.h"
int main()
{
clock_t tic, toc;
struct compression c;
tic = clock();
compression_encode_prepare1 (&c);
toc = clock();
for (uint64_t i = 0; i < 256; i++) printf("%02hhX ", c.data_t[i]); printf("\n");
printf("elapsed compression & encryption: %fs\n", (double) (toc - tic) / (double) CLOCKS_PER_SEC);
tic = clock();
compression_encode_prepare2 (&c);
toc = clock();
for (uint64_t i = 0; i < 256; i++) printf("%02hhX ", c.data_t[i]); printf("\n");
printf("elapsed compression & encryption: %fs\n", (double) (toc - tic) / (double) CLOCKS_PER_SEC);
}
# Build the benchmark: compile both translation units, then link them.
# Every recipe line must begin with a TAB character, otherwise make stops
# with "missing separator".
.PHONY: all
all:
	gcc -Ofast -c -g -o compression.o compression.c
	gcc -Ofast -c -g -o main.o main.c
	gcc -Ofast -g -o main main.o compression.o