Bit-twiddling for a custom image format

Question 1

The following code is mainly concerned with the conversion from byte/bit stream to numericals such as int, unsigned int, short int, unsigned short int, float, and etc. These functions are heavily used in my image data decoding program. I feel an insufferably slow when dealing with bigger image. Would please help me improve my code? Thank you!

const unsigned int POW2[32] = 
{1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,
1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728,268435456,536870912,
1073741824,2147483648 };
// Number of bits per word
enum BitCount
{ 
BC01=1, BC02=2, BC03=3, BC04=4, BC05=5, BC06=6, BC07=7, BC08=8, BC09=9, BC10=10,
BC11=11,BC12=12,BC13=13,BC14=14,BC15=15,BC16=16,BC17=17,BC18=18,BC19=19,BC20=20,
BC21=21,BC22=22,BC23=23,BC24=24,BC25=25,BC26=26,BC27=27,BC28=28,BC29=29,BC30=30,
BC31=31,BC32=32 };
// Bit position in one byte (8bit)
enum BitPosition
{ 
BP0=0, BP1=1, BP2=2, BP3=3, BP4=4, BP5=5, BP6=6, BP7=7 
};
unsigned int GetNbitsWordBE(unsigned int indexWord, const unsigned char * p, BitCount bitCount)
{ 
// Subscript numbering from 0
unsigned int offsetBits = indexWord * bitCount;
unsigned int beginByte = offsetBits / 8;
BitPosition beginBit = static_cast<BitPosition>(offsetBits % 8); 
return GetNbitsWordBE(beginByte, beginBit, p, bitCount);
}
unsigned int GetNbitsWordBE(unsigned int beginByte, BitPosition beginBit, const unsigned char * p, BitCount bitCount)
{ 
unsigned int result = 0;
const unsigned char * data = p + beginByte;
// Branch #1: special case (take up the integral number of bytes)
if (beginBit==BP0)
{ 
 if (bitCount == BC08)
 return static_cast<unsigned int>(data[0]);
 else if (bitCount == BC16)
 return static_cast<unsigned int>(data[1]) + static_cast<unsigned int>(data[0]) * POW2[8];
 else if (bitCount == BC24)
 return static_cast<unsigned int>(data[2]) + static_cast<unsigned int>(data[1]) * POW2[8] + static_cast<unsigned int>(data[0]) * POW2[16];
 else if (bitCount == BC32)
 return static_cast<unsigned int>(data[3]) + static_cast<unsigned int>(data[2]) * POW2[8] + static_cast<unsigned int>(data[1]) * POW2[16] + static_cast<unsigned int>(data[0]) * POW2[24];
 else
 ; // Do nothing
}
// Compute as big as 32-bit number.
// 32-bit number can occupy as much as 5 bytes because it's highly possible 
// that the first bit is not the first bit of the first byte.
unsigned int arRadio[5] = {0,0,0,0,0};
unsigned int arValue[5] = {0,0,0,0,0};
unsigned int occupiedBytesCount = static_cast<unsigned int>( ceil(static_cast<double>(beginBit + bitCount) / 8.0) );
BitPosition beginPos = beginBit;
int idxEnd = (beginBit + bitCount) % 8 - 1; 
if (idxEnd<0) 
 idxEnd=7;
BitPosition endPos = static_cast<BitPosition>(idxEnd);
switch(occupiedBytesCount)
{ 
 case 1:
 // 1st byte
 result = GetNbitsWordBE(beginPos,endPos,data[0]);
 break;
 case 2:
 // 2nd byte
 arValue[1] = GetNbitsWordBE(BP0,endPos,data[1]);
 arRadio[1] = 1; 
 // 1st byte
 arValue[0] = GetNbitsWordBE(beginPos,BP7,data[0]);
 arRadio[0] = POW2[endPos+1];
 result = arValue[0]*arRadio[0] + arValue[1]*arRadio[1];
 break;
 case 3:
 // 3rd byte
 arValue[2] = GetNbitsWordBE(BP0,endPos,data[2]);
 arRadio[2] = 1;
 // 2nd byte
 arValue[1] = data[1];
 arRadio[1] = POW2[endPos+1];
 // 1st byte
 arValue[0] = GetNbitsWordBE(beginPos,BP7,data[0]);
 arRadio[0] = POW2[endPos+1+8];
 result = arValue[0]*arRadio[0] + arValue[1]*arRadio[1] + arValue[2]*arRadio[2];
 break;
 case 4:
 // 4th byte
 arValue[3] = GetNbitsWordBE(BP0,endPos,data[3]);
 arRadio[3] = 1;
 // 3rd byte
 arValue[2] = data[2];
 arRadio[2] = POW2[endPos+1];
 // 2nd byte
 arValue[1] = data[1];
 arRadio[1] = POW2[endPos+1+8]; 
 // 1st byte
 arValue[0] = GetNbitsWordBE(beginPos,BP7,data[0]);
 arRadio[0] = POW2[endPos+1+8+8];
 result = arValue[0]*arRadio[0] + arValue[1]*arRadio[1] + arValue[2]*arRadio[2] + arValue[3]*arRadio[3];
 break;
 case 5:
 // 5th byte
 arValue[4] = GetNbitsWordBE(BP0,endPos,data[4]);
 arRadio[4] = 1;
 // 4th byte
 arValue[3] = data[3];
 arRadio[3] = POW2[endPos+1]; 
 // 3rd byte
 arValue[2] = data[2];
 arRadio[2] = POW2[endPos+1+8];
 // 2nd byte
 arValue[1] = data[1];
 arRadio[1] = POW2[endPos+1+8+8];
 // 1st byte
 arValue[0] = GetNbitsWordBE(beginPos,BP7,data[0]);
 arRadio[0] = POW2[endPos+1+8+8+8];
 result = arValue[0]*arRadio[0] + arValue[1]*arRadio[1] + arValue[2]*arRadio[2] + arValue[3]*arRadio[3] + arValue[4]*arRadio[4];
 break;
 default:
 break;
}
return result;
}
unsigned char GetNbitsWordBE(BitPosition beginPosition, BitPosition endPosition, const unsigned char & data)
{ 
if (beginPosition==BP0 && endPosition==BP7) 
 return data;
unsigned char result,ucTmp0,ucTmp1;
ucTmp0 = data;
ucTmp1 = static_cast<unsigned char>(ucTmp0<<beginPosition);
result = static_cast<unsigned char>(ucTmp1>>(beginPosition+(7-endPosition)));
return result;
}

Dear Victor T
Thank you for having taken time to do benchmarking job for my code. I use last-given functions in a function of my program. I post the consumer function as you requested for your reference as follows.

#define IR_5KM_SCANLINE_PIXELS 2291
#define VIS_1KM_SCANLINE_PIXELS 9164
#define SVS_IR_SCANLINE_SIZE 4582
#define SVS_VIS_SCANLINE_SIZE 9164
#define SVS_SCANLINE_SIZE 57301 
#define SVS_IR1_INDEX 2297 
#define SVS_IR2_INDEX 4592 
#define SVS_IR3_INDEX 6887 
#define SVS_IR4_INDEX 9182 
#define SVS_VIS1_INDEX 13766
#define SVS_VIS2_INDEX 22932
#define SVS_VIS3_INDEX 32098
#define SVS_VIS4_INDEX 41264
#define CSV_LINE_NO_SIZE 2
#define CSV_QC_ID_SIZE 1
#define CSV_DOC_ID_SIZE 2
#define CSV_DOC_SIZE 2291
#define CSV_IR_ID_SIZE 2
#define CSV_IR_SCANLINE_SIZE 2864
#define CSV_SIZE_VS_ID 2
#define CSV_VIS_SCANLINE_SIZE 6873
#define CSV_SCANLINE_SIZE 41260
#define CSV_DOC_INDEX 5 
#define CSV_IR1_INDEX 2298 
#define CSV_IR2_INDEX 5164 
#define CSV_IR3_INDEX 8030 
#define CSV_IR4_INDEX 10896
#define CSV_VIS1_INDEX 13762
#define CSV_VIS2_INDEX 20637
#define CSV_VIS3_INDEX 27512
#define CSV_VIS4_INDEX 34387
void ReadFileCSV(const char * strFile)
{ 
 m_pHeaderInfo = new CFileHeader();
 // Data buffers
 unsigned char* pSVSLine = new unsigned char [SVS_SCANLINE_SIZE];
 unsigned char* pCSVLine = new unsigned char [CSV_SCANLINE_SIZE];
 // Line counter
 unsigned short lineCount;
 FILE * fp = fopen(strFile, "rb");
 if (fp == NULL) 
 return;
 fseek(fp, 0, SEEK_END);
 long pos = ftell(fp);
 // Subtract the header info line (1 line)
 lineCount = static_cast<unsigned short>(pos / CSV_SCANLINE_SIZE) - 1;
 fseek(fp, 0, SEEK_SET);
 // Read the header info line
 fread(&m_pHeaderInfo->header, sizeof(CFileHeader::HEADER_INFO), 1, fp);
 char cArTmp0[128];
 strncpy(cArTmp0, m_pHeaderInfo->header.strRecordNum, 4);
 cArTmp0[4] = 0;
 int iTmp0;
 sscanf(cArTmp0, "%d", &iTmp0);
 if (iTmp0 < lineCount)
 lineCount = static_cast<unsigned short>(iTmp0);
 // Create 2D unsigned char array
 m_rawDoc.Create(lineCount, SVS_DOC_SIZE);
 // Create four 2D unsigned short arrays
 m_usIR1DataBuffer.Create(lineCount , IR_5KM_SCANLINE_PIXELS);
 m_usIR2DataBuffer.Create(lineCount , IR_5KM_SCANLINE_PIXELS);
 m_usIR3DataBuffer.Create(lineCount , IR_5KM_SCANLINE_PIXELS);
 m_usIR4DataBuffer.Create(lineCount , IR_5KM_SCANLINE_PIXELS);
 // Create a 2D unsigned char array
 m_ucVISDataBuffer.Create(lineCount * 4, VIS_1KM_SCANLINE_PIXELS);
 // Start to reading image data line
 for (unsigned int y = 0; y < lineCount; y++)
 { 
 fread(pCSVLine, 1, CSV_SCANLINE_SIZE, fp);
 memcpy(pSVSLine, pCSVLine + CSV_LINE_NO_SIZE + CSV_QC_ID_SIZE, CSV_DOC_ID_SIZE + CSV_DOC_SIZE);
 unsigned short *pUS1, *pUS2, *pUS3, *pUS4;
 pUS1 = reinterpret_cast<unsigned short *>(pSVSLine + SVS_IR1_INDEX);
 pUS2 = reinterpret_cast<unsigned short *>(pSVSLine + SVS_IR2_INDEX);
 pUS3 = reinterpret_cast<unsigned short *>(pSVSLine + SVS_IR3_INDEX);
 pUS4 = reinterpret_cast<unsigned short *>(pSVSLine + SVS_IR4_INDEX);
 for (int x = 0; x < IR_5KM_SCANLINE_PIXELS; x++)
 { 
 pUS1[x] = static_cast<short>(GetNbitsWordBE(x, pCSVLine + CSV_IR1_INDEX, BC10));
 pUS2[x] = static_cast<short>(GetNbitsWordBE(x, pCSVLine + CSV_IR2_INDEX, BC10));
 pUS3[x] = static_cast<short>(GetNbitsWordBE(x, pCSVLine + CSV_IR3_INDEX, BC10));
 pUS4[x] = static_cast<short>(GetNbitsWordBE(x, pCSVLine + CSV_IR4_INDEX, BC10));
 }
 unsigned char *pUC1, *pUC2, *pUC3, *pUC4;
 pUC1 = pSVSLine + SVS_VIS1_INDEX;
 pUC2 = pSVSLine + SVS_VIS2_INDEX;
 pUC3 = pSVSLine + SVS_VIS3_INDEX;
 pUC4 = pSVSLine + SVS_VIS4_INDEX;
 for (int x = 0; x < VIS_1KM_SCANLINE_PIXELS; x++)
 { 
 pUC1[x] = static_cast<unsigned char>(GetNbitsWordBE(x, pCSVLine + CSV_VIS1_INDEX, BC06));
 pUC2[x] = static_cast<unsigned char>(GetNbitsWordBE(x, pCSVLine + CSV_VIS2_INDEX, BC06));
 pUC3[x] = static_cast<unsigned char>(GetNbitsWordBE(x, pCSVLine + CSV_VIS3_INDEX, BC06));
 pUC4[x] = static_cast<unsigned char>(GetNbitsWordBE(x, pCSVLine + CSV_VIS4_INDEX, BC06));
 }
 memcpy(m_rawDoc.Ptr(y,0), pSVSLine + SVS_DOC_INDEX, SVS_DOC_SIZE);
 memcpy(m_usIR1DataBuffer.Ptr(y,0), pSVSLine + SVS_IR1_INDEX, SVS_IR_SCANLINE_SIZE);
 memcpy(m_usIR2DataBuffer.Ptr(y,0), pSVSLine + SVS_IR2_INDEX, SVS_IR_SCANLINE_SIZE);
 memcpy(m_usIR3DataBuffer.Ptr(y,0), pSVSLine + SVS_IR3_INDEX, SVS_IR_SCANLINE_SIZE);
 memcpy(m_usIR4DataBuffer.Ptr(y,0), pSVSLine + SVS_IR4_INDEX, SVS_IR_SCANLINE_SIZE);
 memcpy(m_ucVISDataBuffer.Ptr(4*y+0,0), pSVSLine + SVS_VIS1_INDEX, SVS_VIS_SCANLINE_SIZE);
 memcpy(m_ucVISDataBuffer.Ptr(4*y+1,0), pSVSLine + SVS_VIS2_INDEX, SVS_VIS_SCANLINE_SIZE);
 memcpy(m_ucVISDataBuffer.Ptr(4*y+2,0), pSVSLine + SVS_VIS3_INDEX, SVS_VIS_SCANLINE_SIZE);
 memcpy(m_ucVISDataBuffer.Ptr(4*y+3,0), pSVSLine + SVS_VIS4_INDEX, SVS_VIS_SCANLINE_SIZE);
 }
 fclose(fp);
}

Dear Mr. Martin,
Thank you for having taken time to read my code snippets. Please let me try to answer your questions to my code:
(1) Background explanation behind coding
The remotely sensed raw data image (for example satellite image) captured by the ground station are always packed in the form of 6-bit, 10-bit, or 12-bit streams. These data are stored scanline by scanline.

As you see from my attached code, ReadFileCSV is used to read this kind of scanline one by one form an opend binary file, and stored to an unsigned char source buffer for a while. For 6-bit stream, I must scale up to 8-bit (one byte), and for 10-bit or 12-bit stream, I will make them to be scaled unsigned short integer. Finally, I should get an unsigned char (for 6-bit) or unsigned short destination buffer.

(2) POW2 is the 2 raised to the power of n, where n can be set to any value from 0 to 32.


const unsigned int POW2[32] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288, 1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728,268435456,536870912, 1073741824,2147483648 };

As you may see, const unsigned int POW2[32] is just like a look-up table.

(3) I agree with your suggestion "avoid multiplying instead of shifting".

(4) I worry about the performance of three overloading functions (GetNbitsWordBE(....)) too. I'm not good at bit operations (for example, shifting, |, ^, ~, and etc.). I think bit-big-player would not use such a complicated and clumsy method to convert PC unfavorable bits such as 6-bit, 10-bit, 12-bit, to its nearest favorable bits such as 8-bit (unsigned char), 16-bit (unsigned short).

(5) Mr. Mark on SO suggest that I should use the following code snippets to scale my 6-bit and 10-bit values to their respective 8-bit (unsigned char) and 16-bit (unsigned short) values. As I cited here (BTW, I can't make it work):

The usual way is to left shift by the difference in the number of bits between the input and the output, then add in the high order bits from the input to fill out the lower bits.


unsigned char eightbit = (sixbit> 4); 
unsigned short sixteenbit = (tenbit> 4); 
unsigned short sixteenbit = (twelvebit> 8);

There's an alternate approach for the lower bits that I haven't seen very often - fill them with noise. This masks some of the quantization error in the original sample.


unsigned char eightbit = (sixbit> 14); 
unsigned short sixteenbit = (tenbit> 10); 
unsigned short sixteenbit = (twelvebit> 12);

(6) 10-bit stream illustartion

10101010 01110101 00011001
| |
-----------
 10-bit
10101010 01110101 00011001
 | |
 -----------
 10-bit
10101010 01110101 01100110
 | |
 ------------ 
 10-bit

Question 2

@Lee How does the rest of your program use these functions? Have you profiled your application and the measurements point to these functions to be the bottleneck? I ask because I benched your above code with 10million+ iterations and it seems fast enough. Everything completed < 1 second -- even faster if it was a power of two.

Question 3

Your formatting is broken on the second code sample, the #define lines are being interpreted as headings. Probably you need more indentation.

Question 4

Sorry if I'm misunderstanding the code, but here are some ideas of how it can be improved, focusing on performance. Those are the things that immediately came to my mind. I've probably missed things, and there may be higher level optimizations that can be done that I didn't see because I just took a quick look at the code.

What's the point of the POW2 array? It doesn't make the code any clearer than using the shift operator, and may make it harder for the compiler to find optimizations.

So instead of doing

something * POW2[16]

just do

something << 16

In general, avoid multiplying instead of shifting if you're working on the bit level anyway. Take for example case 4 which can be written simpler (and likely faster depending on how clever the compiler is) as:

case 4:
 // 4th byte
 arValue3 = GetNbitsWordBE(BP0, endPos, data[3]);
 arValue2 = data[2];
 arValue1 = data[1];
 arValue0 = GetNbitsWordBE(beginPos, BP7, data[0]);
 result = (arValue3) |
 (arValue2 << (endPos + 1)) |
 (arValue1 << (endPos + 1 + 8)) |
 (arValue0 << (endPos + 1 + 8 + 8);

I've made arValue into local variables instead of an array because it's easier to type. There may be a performance difference in some direction that you may want to profile.

I'd also worry about whether the GetNbitsWordBE function is inlined and optimized as well as you'd want it to. Oh, wait, there are three of it! Now I'm confused. I'd prefer not overloading it like that, but that's a style issue, not a performance problem.

Question 5

You're just trying to convert a lot of packed 10 bit integers into the typical C types?

Have you considered an approach like "round to the nearest 32-bit quantity, dereference that, and strip out the 'extra' bits"? Something like this:

uint16_t
get_packed_word(void *buf, int starting_bit, int nbits)
{
 uint32_t r;
 /* Copy 32 bits... (Via memcpy since otherwise we'd do unaligned access.) */
 memcpy(&r, ((char*)buf) + starting_bit/8, sizeof(r));
 /* Using htonl because I assume you want big endian... */
 r = htonl(r);
 /* Shift right to discard extra low bits... */
 r >>= ((32 - nbits) - (starting_bit % 8));
 /* Discard extra high bits... */
 r &= ((1<<nbits) - 1);
 return r;
}

This seems to work for your ASCII art test data with nbits = 10... No idea how it performs with large quantities of data [though bit shifting and AND is cheap, right?], but it's a lot simpler than what you have. I got this approach from the way some FAT drivers handle FAT12 (treat it like FAT16 with 16-bit dereferences, but truncate the extra fuzz).

Update: Looking at this again, my original code didn't handle some edge cases, such as starting_bit = 7... So I switched to 32-bit quantities rather than 16.

Martin Vilcans Martin Vilcans 3061 silver badge5 bronze badges · Answer 1 · 2011-06-17 22:59:25Z

Sorry if I'm misunderstanding the code, but here are some ideas of how it can be improved, focusing on performance. Those are the things that immediately came to my mind. I've probably missed things, and there may be higher level optimizations that can be done that I didn't see because I just took a quick look at the code.

What's the point of the POW2 array? It doesn't make the code any clearer than using the shift operator, and may make it harder for the compiler to find optimizations.

So instead of doing

something * POW2[16]

just do

something << 16

In general, avoid multiplying instead of shifting if you're working on the bit level anyway. Take for example case 4 which can be written simpler (and likely faster depending on how clever the compiler is) as:

case 4:
 // 4th byte
 arValue3 = GetNbitsWordBE(BP0, endPos, data[3]);
 arValue2 = data[2];
 arValue1 = data[1];
 arValue0 = GetNbitsWordBE(beginPos, BP7, data[0]);
 result = (arValue3) |
 (arValue2 << (endPos + 1)) |
 (arValue1 << (endPos + 1 + 8)) |
 (arValue0 << (endPos + 1 + 8 + 8);

I've made arValue into local variables instead of an array because it's easier to type. There may be a performance difference in some direction that you may want to profile.

I'd also worry about whether the GetNbitsWordBE function is inlined and optimized as well as you'd want it to. Oh, wait, there are three of it! Now I'm confused. I'd prefer not overloading it like that, but that's a style issue, not a performance problem.

asveikau asveikau 1,8029 silver badges8 bronze badges · Answer 2 · 2011-06-19 05:11:29Z

You're just trying to convert a lot of packed 10 bit integers into the typical C types?

Have you considered an approach like "round to the nearest 32-bit quantity, dereference that, and strip out the 'extra' bits"? Something like this:

uint16_t
get_packed_word(void *buf, int starting_bit, int nbits)
{
 uint32_t r;
 /* Copy 32 bits... (Via memcpy since otherwise we'd do unaligned access.) */
 memcpy(&r, ((char*)buf) + starting_bit/8, sizeof(r));
 /* Using htonl because I assume you want big endian... */
 r = htonl(r);
 /* Shift right to discard extra low bits... */
 r >>= ((32 - nbits) - (starting_bit % 8));
 /* Discard extra high bits... */
 r &= ((1<<nbits) - 1);
 return r;
}

This seems to work for your ASCII art test data with nbits = 10... No idea how it performs with large quantities of data [though bit shifting and AND is cheap, right?], but it's a lot simpler than what you have. I got this approach from the way some FAT drivers handle FAT12 (treat it like FAT16 with 16-bit dereferences, but truncate the extra fuzz).

Update: Looking at this again, my original code didn't handle some edge cases, such as starting_bit = 7... So I switched to 32-bit quantities rather than 16.

Stack Exchange Network

Bit-twiddling for a custom image format

2 Answers 2

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Bit-twiddling for a custom image format

2 Answers 2

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions