Converting Between Bit Depth (ASM gurus click here)

Converting Between Bit Depth (ASM gurus click here)

ImLeftFooted

Member #3,935

October 2003

I wrote a routine that will convert between any whole integer bit depth from 1 to 24. The routine works without flaw...

But it is slow. I'm looking for help from a guru.

The target is the iPhone ARM processor.

#SelectExpand
   1void convertImage(int srcBpp, unsigned char *src, int srcSize,
           int destBpp, unsigned char *dest, int destSize,
           int lineStart, int destLineWidth, int linePadding)
   4{
PIXEL_PACK_TABLE(Src, srcBpp)
PIXEL_PACK_TABLE(Dest, destBpp)

if(srcSize < 1 || destSize < 1)
  return;

if(destLineWidth < 1)
  destLineWidth = 1;

if(srcBpp < 6 && destBpp > 5) {
  
  convertImageBW2COL(srcBpp, src, srcSize, destBpp, dest, destSize, lineStart, destLineWidth, linePadding);
  return;
}
else if(srcBpp > 5 && destBpp < 6) {
  
  //convertImageCOL2BW(srcBpp, src, srcSize, destBpp, dest, destSize, lineStart, destLineWidth, linePadding);
  //return;
}
else if(srcBpp < 6 && destBpp < 6) {
  
  //convertImageBW2BW(srcBpp, src, srcSize, destBpp, dest, destSize, lineStart, destLineWidth, linePadding);
  //return;
}

// Max unsigned char number for convience.
const unsigned char max = 0xff;

// current color index
// 0 is red, 1 is green, 2 is blue
int col = 0;

// number of bits from src and dest.
int srcOffset = 0;
int destOffset = 0;

// dest index
int destI = 0;

unsigned char byte = 0;
int srcCol;

// srcI is src index
for(int srcI = 0; ; ) {
  
  // TODO fix all the bit shifts to cast back to unsigned char correctly.
  // do the same on the server.
  
  srcCol =
  // Put the color component against the left wall
  (unsigned char)( ( (unsigned char)((src[srcI] << srcOffset) & 0xffu)
            
            // Now move it over to the right wall
            >> (8 - sizesSrc[col])
            ) & 0xffu );
  
  srcCol |=
  // Move bits that were cut off before to the right edge
  // and binary or them with srcCol.
  (unsigned char)( (src[srcI + 1] >> (8 - ((sizesSrc[col] + srcOffset) - 8))) & 0xffu
          );
  /*
   if(sizesSrc[col] + srcOffset >= 8)
   ++srcI;
   */
  srcI += (((sizesSrc[col] + srcOffset) & 8) >> 3);
  
  if(srcI >= srcSize)
    break;
  
  byte |=
  // We multiply color by new max value and divide by old max value.
  // This converts src[srcI] to the new bpp
  (unsigned char)( ( (unsigned char)( ( (unsigned char)(int(srcCol) * MAX_FOR_COLOR(sizesDest[col]) / MAX_FOR_COLOR(sizesSrc[col]))
                     // We push the data to the left of the byte, making room for
                     // the next pixel inside the same byte.
                     << (8 - sizesDest[col])
                     )  & 0xffu )
            // Now we shift _right_ in case another pixel is already using this part of this byte.
            >> destOffset
            ) & 0xffu );
  
  // If this color component fills up the byte...
  if(sizesDest[col] + destOffset >= 8) {
    
    if(destI >= destSize)
      break;
    
    dest[destI] = byte;
    byte = 0;
    
    // Move to the next byte
    destI++;
    
    if(0 == (((destI + linePadding + lineStart)) % (destLineWidth))) {
      
      destI += linePadding;
    }
    
    // Here we do the same as before, except...
    byte |=
    (unsigned char)( ( (unsigned char)( ( (unsigned char)(int(src[srcI]) * MAX_FOR_COLOR(sizesDest[col]) / MAX_FOR_COLOR(sizesSrc[col]))
                       << (8 - sizesDest[col])
                       )  & 0xffu )
              // ...now we move _left_ to get only the piece that was cut off in the first operation
              // the piece will be stuffed in the left part of the byte.
              << (8 - destOffset)
              ) & 0xffu );
  }
  
  // Increment offset by how many bits we've inserted
  // Decrement offset if offset >= 8
  destOffset += sizesDest[col];
  destOffset = destOffset % 8;
  
  // Increment offset by how many bits we've read
  // Increment srcI if we've gone past our 8 bits
  // Decrement offset if offset >= 8
  srcOffset += sizesSrc[col];
  srcOffset = srcOffset % 8;
  
  // We move on to the next color component, wrapping back to 
  // the first one if needed.
  col = ++col % 3;
}
 130}

Here is the macro definition.

#SelectExpand
  // The biggest number a color component of `bpp` bits can contain
  // (0 bits -> 0, 5 bits -> 0x1f, 8 bits -> 0xff).
  // The macro is self-contained: it does not need a local variable named
  // `max` to be in scope where it is used.
  #define MAX_FOR_COLOR(bpp) ((unsigned char)(~(0xffu << (bpp)) & 0xffu))

  /* Begin pixel packing code */
  // Our variable encoding scheme accepts all whole integers from
  // 24 bpp to 1 bpp.  PIXEL_PACK_TABLE(name, bpp) declares, in the current
  // scope, int sizes<name>[3] holding the red, green and blue widths for
  // that depth, plus the references rSize<name>, gSize<name>, bSize<name>.
  //
  // The bits are split evenly; when bpp is not a multiple of 3 the first
  // spare bit goes to blue and the second to green (16 -> 5/5/6,
  // 17 -> 5/6/6, 2 -> 0/1/1).  Depths outside 1..24 yield 0/0/0 rather than
  // leaving the sizes uninitialized.
  //
  // Use it as a statement without a trailing semicolon.  `bpp` is evaluated
  // exactly once.
  #define PIXEL_PACK_TABLE(name, bpp) \
  int sizes##name[3] = { 0, 0, 0 }; \
  int &rSize##name = sizes##name[0]; \
  int &gSize##name = sizes##name[1]; \
  int &bSize##name = sizes##name[2]; \
  const int packBpp##name = (bpp); \
  if(packBpp##name >= 1 && packBpp##name <= 24) { \
      rSize##name = packBpp##name / 3; \
      gSize##name = packBpp##name / 3 + (packBpp##name % 3 == 2 ? 1 : 0); \
      bSize##name = packBpp##name / 3 + (packBpp##name % 3 != 0 ? 1 : 0); \
  }

edit

Here is the final solution.

#SelectExpand
   1int convertImage(int srcBpp, unsigned char *src, int srcSize,
       int destBpp, unsigned char *dest, int destSize,
       int lineStart, int destLineWidth, int linePadding)
   4{
PIXEL_PACK_TABLE(Src, srcBpp)
PIXEL_PACK_TABLE(Dest, destBpp)

// current color index
// 0 is red, 1 is green, 2 is blue
int col = 0;

// number of bits from src and dest.
int srcOffset = 0;
int destOffset = 0;

// dest index
int destI = 0;

unsigned char byte = 0;
int srcCol;

int destLineWidthIncremented = destLineWidth;
int srcI = 0;

int diff1[3] = 
{
  sizesDest[0] - sizesSrc[0],
  sizesDest[1] - sizesSrc[1],
  sizesDest[2] - sizesSrc[2]
};

int diff2[3] =
{
  8 - (sizesDest[0] - sizesSrc[0]),
  8 - (sizesDest[1] - sizesSrc[1]),
  8 - (sizesDest[2] - sizesSrc[2]),
};

for(int col = 0; col < 3; col++) {
  
  if(sizesSrc[col] > sizesDest[col]) {
    
    swap(diff1[col], diff2[col]);
    
    diff1[col] = -diff1[col];
    diff2[col] = -diff2[col];
  }
}

// srcI is src index
for( ;; ) {
  
  // TODO fix all the bit shifts to cast back to unsigned char correctly.
  // do the same on the server.
  
  srcCol = 
  
  // Put the color component against the left wall
  (unsigned char)( ( (unsigned char)((src[srcI] << srcOffset) & 0xffu)
            
            // Now move it over to the right wall
            >> (8 - sizesSrc[col])
            ) & 0xffu );
  
  srcCol |=
  // Move bits that were cut off before to the right edge
  // and binary or them with srcCol.
  (unsigned char)( (src[srcI + 1] >> (8 - ((sizesSrc[col] + srcOffset) - 8))) & 0xffu
          );
  /*
   if(sizesSrc[col] + srcOffset >= 8)
   ++srcI;
   */
  srcI += (((sizesSrc[col] + srcOffset) & 8) >> 3);
  
  if(srcI >= srcSize || destI >= destSize)
    break;
  
  byte |=
  // We multiply color by new max value and divide by old max value.
  // This converts src[srcI] to the new bpp
  (unsigned char)( ( (unsigned char)( ( (unsigned char)( ((srcCol << (diff1[col])) & 0xffu) | ((srcCol >> (diff2[col])) & 0xffu ) )
                     // We push the data to the left of the byte, making room for
                     // the next pixel inside the same byte.
                     << (8 - sizesDest[col])
                     )  & 0xffu )
            // Now we shift _right_ in case another pixel is already using this part of this byte.
            >> destOffset
            ) & 0xffu );
  
  // If this color component fills up the byte...
  if(sizesDest[col] + destOffset >= 8) {
    
    dest[destI] = byte;
    byte = 0;
    
    // Move to the next byte
    destI++;
    
    if(destI + linePadding + lineStart >= destLineWidthIncremented) {
      
      destI += linePadding;
      destLineWidthIncremented += destLineWidth;
    }
    
    // Here we do the same as before, except...
    byte |=
    (unsigned char)( ( (unsigned char)( ( (unsigned char)( ((srcCol << (diff1[col])) & 0xffu) | ((srcCol >> (diff2[col])) & 0xffu ) )
                       << (8 - sizesDest[col])
                       )  & 0xffu )
              // ...now we move _left_ to get only the piece that was cut off in the first operation
              // the piece will be stuffed in the left part of the byte.
              << (8 - destOffset)
              ) & 0xffu );
  }
  
  // Increment offset by how many bits we've inserted
  // Decrement offset if offset >= 8
  destOffset += sizesDest[col];
  destOffset = (destOffset < 8 ? destOffset : destOffset - 8);
  
  // Increment offset by how many bits we've read
  // Increment srcI if we've gone past our 8 bits
  // Decrement offset if offset >= 8
  srcOffset += sizesSrc[col];
  srcOffset = (srcOffset < 8 ? srcOffset : srcOffset - 8);
  
  // We move on to the next color component, wrapping back to 
  // the first one if needed.
  col = (col+1 < 3 ? col+1 : 0);
}

return min(destSize, destI);
 134}

How is my posting?

Audric

Member #907

January 2001

Can you compare the performances in a "trivial" case where you would hard-code the shifts to work just for this case (ex: 888 to 666)
If the improvement is not noticeable, it means the code inside this function is as good as can be.

Bob

Free Market Evangelist

September 2000

Those divides can't be good for performance. You can replace the multiply-then-divide sequences with bitwise ANDs, since you know that color components always occupy an integer number of bits.

There are also quality issues with this converter. For example, converting pure white from 565 into 888 will not result in pure white, but an "almost white" shade of gray.

You'll want to write something like:

    new_color = (old_color << shift) | (old_color >> (8 - shift)).

Or something like that.

Finally, you should change the routine to perform the conversion on 8, 16 or 32-bit types natively instead of always working on 1 byte at a time. That will get rid of most of the if statements.

--
- Bob
[ -- All my signature links are 404 -- ]

ImLeftFooted

Member #3,935

October 2003

#SelectExpand
   1int convertImage(int srcBpp, unsigned char *src, int srcSize,
       int destBpp, unsigned char *dest, int destSize,
       int lineStart, int destLineWidth, int linePadding)
   4{
PIXEL_PACK_TABLE(Src, srcBpp)
PIXEL_PACK_TABLE(Dest, destBpp)

if(srcBpp < 6 && destBpp > 5) {
  
  convertImageBW2COL(srcBpp, src, srcSize, destBpp, dest, destSize, lineStart, destLineWidth, linePadding);
  return 0;
}
else if(srcBpp > 5 && destBpp < 6) {
  
  //convertImageCOL2BW(srcBpp, src, srcSize, destBpp, dest, destSize, lineStart, destLineWidth, linePadding);
  //return;
}
else if(srcBpp < 6 && destBpp < 6) {
  
  //convertImageBW2BW(srcBpp, src, srcSize, destBpp, dest, destSize, lineStart, destLineWidth, linePadding);
  //return;
}

// Max unsigned char number for convience.
const unsigned char max = 0xff;

// current color index
// 0 is red, 1 is green, 2 is blue
int col = 0;

// number of bits from src and dest.
int srcOffset = 0;
int destOffset = 0;

// dest index
int destI = 0;

unsigned char byte = 0;
int srcCol;

//[03:10] <Planck_> One way to avoid division might be to do a fixed-point multiply
//[03:12] <Planck_> Instead of x * A / B, precalculate (say) C = 256*A/B and compute (x*C) >> 8.
//[03:14] <Planck_> Since it looks like there are are 3 values of C you need here.

//[03:24] <Planck_> To minimize the rounding problem, you can shift C upward 8 bits.  So it's like a floating point number with 8 bits of precision
//[03:24] <Planck_> You just need to make sure to shift it down again after you multiply
//[03:24] <ddustin> hmm
//[03:25] <Planck_> So when you calculate  (srcCol * C) >> 8,  it's really  srcCol * (256 * A / B) / 256.
//[03:26] <Planck_> So the maximum error is 1/256th
//[03:26] <Planck_> 1/256th of srcCol, I mean.

int C[3] =
{
  0xff * MAX_FOR_COLOR(sizesDest[0]) / MAX_FOR_COLOR(sizesSrc[0]),
  0xff * MAX_FOR_COLOR(sizesDest[1]) / MAX_FOR_COLOR(sizesSrc[1]),
  0xff * MAX_FOR_COLOR(sizesDest[2]) / MAX_FOR_COLOR(sizesSrc[2])
};

int destLineWidthIncremented = destLineWidth;
int srcI = 0;

// srcI is src index
for( ;; ) {
  
  // TODO fix all the bit shifts to cast back to unsigned char correctly.
  // do the same on the server.
  
  srcCol =
  // Put the color component against the left wall
  (unsigned char)( ( (unsigned char)((src[srcI] << srcOffset) & 0xffu)
            
            // Now move it over to the right wall
            >> (8 - sizesSrc[col])
            ) & 0xffu );
  
  srcCol |=
  // Move bits that were cut off before to the right edge
  // and binary or them with srcCol.
  (unsigned char)( (src[srcI + 1] >> (8 - ((sizesSrc[col] + srcOffset) - 8))) & 0xffu
          );
  /*
   if(sizesSrc[col] + srcOffset >= 8)
   ++srcI;
   */
  srcI += (((sizesSrc[col] + srcOffset) & 8) >> 3);
  
  if(srcI >= srcSize)
    break;
  
  byte |=
  // We multiply color by new max value and divide by old max value.
  // This converts src[srcI] to the new bpp
  (unsigned char)( ( (unsigned char)( ( (unsigned char)((srcCol * C[col]) >> 8)
                     // We push the data to the left of the byte, making room for
                     // the next pixel inside the same byte.
                     << (8 - sizesDest[col])
                     )  & 0xffu )
            // Now we shift _right_ in case another pixel is already using this part of this byte.
            >> destOffset
            ) & 0xffu );
  
  // If this color component fills up the byte...
  if(sizesDest[col] + destOffset >= 8) {
    
    if(destI >= destSize)
      break;
    
    dest[destI] = byte;
    byte = 0;
    
    // Move to the next byte
    destI++;
    
    if(destI + linePadding + lineStart >= destLineWidthIncremented) {
      
      destI += linePadding;
      destLineWidthIncremented += destLineWidth;
    }
    
    // Here we do the same as before, except...
    byte |=
    (unsigned char)( ( (unsigned char)( ( (unsigned char)((srcCol * C[col]) >> 8)
                       << (8 - sizesDest[col])
                       )  & 0xffu )
              // ...now we move _left_ to get only the piece that was cut off in the first operation
              // the piece will be stuffed in the left part of the byte.
              << (8 - destOffset)
              ) & 0xffu );
  }
  
  // Increment offset by how many bits we've inserted
  // Decrement offset if offset >= 8
  destOffset += sizesDest[col];
  destOffset = (destOffset < 8 ? destOffset : destOffset - 8);
  
  // Increment offset by how many bits we've read
  // Increment srcI if we've gone past our 8 bits
  // Decrement offset if offset >= 8
  srcOffset += sizesSrc[col];
  srcOffset = (srcOffset < 8 ? srcOffset : srcOffset - 8);
  
  // We move on to the next color component, wrapping back to 
  // the first one if needed.
  col = (col+1 < 3 ? col+1 : 0);
}

return min(destSize, destI);
 148}

Here is the implementation I'm using now.

Is this faster than new_color = (old_color << shift) | (old_color >> (8 - shift))?

Quote:

Finally, you should change the routine to perform the conversion on 8, 16 or 32-bit types natively instead of always working on 1 byte at a time.

The way the function is used it doesn't get a large advantage from that.

Quote:

That will get rid of most of the if statements.

Apparently ARM has 'conditional instructions' which leads me to believe if statements with a single instruction inside are optimized. I noted replacing col = ++col % 3 with a branch had a decent improvement in performance.

edit created destLineWidthIncremented variable to remove a multiplication.
edit What should 'shift' be?

I can't quite work out how that works.

10000001 should become 0111. We're binary ORing only twice from two digits and must produce three digits.

If shift is 3 we get

10000001 << 3 | 10000001 >> 5
00001000      | 00000100

00001100

If shift is 2 we get

10000001 << 2 | 10000001 >> 6
00000100      | 00000010

00000110

Hm, lets try with gray

10000000 << 4 | 10000000 >> 4
00001000      | 00000000

00001000

Okay this is correct but why use '4'?

Maybe this is oldBpp - newBpp?

Lets try 7 -> 5 bpp

1000001 << 2 | 1000001 >> 6
0000100      | 0000001

0000101

~~Holy crap that works! I don't really get why though...~~ Ah spoke too soon.

Lets try reversing it

1000001 >> 2 | 1000001 << 6
0010000      | 1000000

1010000 & 0011111 =
0010000

This is close but doesn't feel right. Lets make it a bit more white.

1000010 >> 2 | 1000010 << 6
0010000      | 0000000

0010000

Hm....

1000100 >> 2 | 1000100 << 6
0010001      | 0000000

0010001

Yes this would appear correct...
I don't understand the theory behind this though.

edit
I can't get it to work.
Here is how I modified the code. The result looks like it's just black.

    // Attempt at the shift-based conversion, kept as posted (this is the
    // variant reported above as producing a black image).
    // NOTE(review): the shift counts are (sizesSrc - sizesDest) and
    // 8 - (sizesSrc - sizesDest).  When the destination channel is wider than
    // the source the first count is negative and the second exceeds 8;
    // shifting by a negative count is undefined behaviour, which is
    // presumably why the output came out black.
    byte |=
    // Convert srcCol to the destination width by OR-ing a left-shifted and a
    // right-shifted copy of it.
    (unsigned char)( ( (unsigned char)( ( (unsigned char)( ((srcCol << (sizesSrc[col] - sizesDest[col])) & 0xffu) | ((srcCol >> (8 - (sizesSrc[col] - sizesDest[col]))) & 0xffu ) )
                       // We push the data to the left of the byte, making room for
                       // the next pixel inside the same byte.
                       << (8 - sizesDest[col])
                       )  & 0xffu )
              // Now we shift _right_ in case another pixel is already using this part of this byte.
              >> destOffset
              ) & 0xffu );

edit, I just flipped shift directions a bunch until it started working. Now it works and has nothing but bitwise and addition! Woo!

I still wouldn't mind an explanation of this trick. Here is the code as it is now.

#SelectExpand
   1int convertImage(int srcBpp, unsigned char *src, int srcSize,
       int destBpp, unsigned char *dest, int destSize,
       int lineStart, int destLineWidth, int linePadding)
   4{
PIXEL_PACK_TABLE(Src, srcBpp)
PIXEL_PACK_TABLE(Dest, destBpp)

if(srcBpp < 6 && destBpp > 5) {
  
  convertImageBW2COL(srcBpp, src, srcSize, destBpp, dest, destSize, lineStart, destLineWidth, linePadding);
  return 0;
}
else if(srcBpp > 5 && destBpp < 6) {
  
  //convertImageCOL2BW(srcBpp, src, srcSize, destBpp, dest, destSize, lineStart, destLineWidth, linePadding);
  //return;
}
else if(srcBpp < 6 && destBpp < 6) {
  
  //convertImageBW2BW(srcBpp, src, srcSize, destBpp, dest, destSize, lineStart, destLineWidth, linePadding);
  //return;
}

// Max unsigned char number for convience.
const unsigned char max = 0xff;

(void)max;

// current color index
// 0 is red, 1 is green, 2 is blue
int col = 0;

// number of bits from src and dest.
int srcOffset = 0;
int destOffset = 0;

// dest index
int destI = 0;

unsigned char byte = 0;
int srcCol;

int destLineWidthIncremented = destLineWidth;
int srcI = 0;

int diff1[3] = 
{
  sizesDest[0] - sizesSrc[0],
  sizesDest[1] - sizesSrc[1],
  sizesDest[2] - sizesSrc[2]
};

int diff2[3] =
{
  8 - (sizesDest[0] - sizesSrc[0]),
  8 - (sizesDest[1] - sizesSrc[1]),
  8 - (sizesDest[2] - sizesSrc[2]),
};

for(int col = 0; col < 3; col++) {
  
  if(sizesSrc[col] > sizesDest[col]) {
    
    swap(diff1[col], diff2[col]);
    
    diff1[col] = -diff1[col];
    diff2[col] = -diff2[col];
  }
}

// srcI is src index
for( ;; ) {
  
  // TODO fix all the bit shifts to cast back to unsigned char correctly.
  // do the same on the server.
  
  srcCol = 
  
  // Put the color component against the left wall
  (unsigned char)( ( (unsigned char)((src[srcI] << srcOffset) & 0xffu)
            
            // Now move it over to the right wall
            >> (8 - sizesSrc[col])
            ) & 0xffu );
  
  srcCol |=
  // Move bits that were cut off before to the right edge
  // and binary or them with srcCol.
  (unsigned char)( (src[srcI + 1] >> (8 - ((sizesSrc[col] + srcOffset) - 8))) & 0xffu
          );
  /*
   if(sizesSrc[col] + srcOffset >= 8)
   ++srcI;
   */
  srcI += (((sizesSrc[col] + srcOffset) & 8) >> 3);
  
  if(srcI >= srcSize || destI >= destSize)
    break;
  
  byte |=
  // We multiply color by new max value and divide by old max value.
  // This converts src[srcI] to the new bpp
  (unsigned char)( ( (unsigned char)( ( (unsigned char)( ((srcCol << (diff1[col])) & 0xffu) | ((srcCol >> (diff2[col])) & 0xffu ) )
                     // We push the data to the left of the byte, making room for
                     // the next pixel inside the same byte.
                     << (8 - sizesDest[col])
                     )  & 0xffu )
            // Now we shift _right_ in case another pixel is already using this part of this byte.
            >> destOffset
            ) & 0xffu );
  
  // If this color component fills up the byte...
  if(sizesDest[col] + destOffset >= 8) {
    
    dest[destI] = byte;
    byte = 0;
    
    // Move to the next byte
    destI++;
    
    if(destI + linePadding + lineStart >= destLineWidthIncremented) {
      
      destI += linePadding;
      destLineWidthIncremented += destLineWidth;
    }
    
    // Here we do the same as before, except...
    byte |=
    (unsigned char)( ( (unsigned char)( ( (unsigned char)( ((srcCol << (diff1[col])) & 0xffu) | ((srcCol >> (diff2[col])) & 0xffu ) )
                       << (8 - sizesDest[col])
                       )  & 0xffu )
              // ...now we move _left_ to get only the piece that was cut off in the first operation
              // the piece will be stuffed in the left part of the byte.
              << (8 - destOffset)
              ) & 0xffu );
  }
  
  // Increment offset by how many bits we've inserted
  // Decrement offset if offset >= 8
  destOffset += sizesDest[col];
  destOffset = (destOffset < 8 ? destOffset : destOffset - 8);
  
  // Increment offset by how many bits we've read
  // Increment srcI if we've gone past our 8 bits
  // Decrement offset if offset >= 8
  srcOffset += sizesSrc[col];
  srcOffset = (srcOffset < 8 ? srcOffset : srcOffset - 8);
  
  // We move on to the next color component, wrapping back to 
  // the first one if needed.
  col = (col+1 < 3 ? col+1 : 0);
}

return min(destSize, destI);
 155}

edit The code only worked for converting 'up' in bpp (i.e. 16 -> 24); now the code works both ways.

How is my posting?

GullRaDriel

Member #3,861

September 2003

Is it working faster now ?

"Code is like shit - it only smells if it is not yours"
Allegro Wiki, full of examples and articles !!

ImLeftFooted

Member #3,935

October 2003

Yes much faster. I would be happy to make it even faster though -- if anyone knows of some ASM tricks I could pull.

How is my posting?

GullRaDriel

Member #3,861

September 2003

Why do you keep empty if statements, and a call to (void)max; ?

"Code is like shit - it only smells if it is not yours"
Allegro Wiki, full of examples and articles !!

Vanneto

Member #8,643

May 2007

(void)max; prevents the compiler from spitting out an "unused variable" warning. I'm almost certain that gets optimized down to nothing - the same with empty if-statements.

In capitalist America bank robs you.

ImLeftFooted

Member #3,935

October 2003

Also they only happen once per function call. The real need-for-speed is inside the loop.

How is my posting?