fast memcpy
gering

Hi, I have here a faster version of memcpy:

typedef struct { unsigned char dummy[32]; } DT;

void fastcpy(unsigned char* dst, unsigned char* src, size_t s)
{
    unsigned char* sa = src + s;
    DT *d1 = (DT*)dst - 1;
    DT *s1 = (DT*)src - 1;
    size_t si = s / sizeof(DT);

    si = (si + 7) / 8;
    switch (si % 8)
    {
    case 0: do { *++d1 = *++s1;
    case 7:      *++d1 = *++s1;
    case 6:      *++d1 = *++s1;
    case 5:      *++d1 = *++s1;
    case 4:      *++d1 = *++s1;
    case 3:      *++d1 = *++s1;
    case 2:      *++d1 = *++s1;
    case 1:      *++d1 = *++s1;
            } while (--si > 0);
    }

    dst = (unsigned char*)d1;
    src = (unsigned char*)s1;
    while (src < sa)
    {
        *++dst = *++src;
    }
}

What do you think about this code? Can you see any bugs or things I could change to make it faster?

miran

Uhm, if you're going to post something like that, the least you could do is make a speed comparison chart.

Tobi Vollebregt

does it compile? EDIT: it does :o

I've never seen a weird construct like that, with different parts of a loop in different case labels.

gering
Quote:

Uhm, if you're going to post something like that, the least you could do is make a speed comparison chart.

I know, I made a benchmark, but forgot to take it home from work. Results were about 15%-30% faster on Linux, MacOSX and WindowsXP, compiled with gcc. Also tested with 32-bit and 64-bit (AMD) CPUs. I still have to try msvc, too. :P

EDIT:
the MacOS version was the only one that was not slower than this one.

kazzmir
Quote:

I've never seen a weird contruct like that with different parts of a loop in different case labels.

It's called Duff's device.
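
For reference, here is a minimal sketch of the canonical form (duff_copy is just an illustrative name, and it assumes count > 0): the switch is taken on the original count modulo 8, and the rounded-up count is used only as the loop counter.

#include <stddef.h>

/* Canonical Duff's device, memory-to-memory variant. Assumes count > 0. */
void duff_copy(unsigned char *to, const unsigned char *from, size_t count)
{
    size_t n = (count + 7) / 8;   /* number of do-while iterations */

    switch (count % 8) {          /* jump into the middle of the unrolled loop */
    case 0: do { *to++ = *from++;
    case 7:      *to++ = *from++;
    case 6:      *to++ = *from++;
    case 5:      *to++ = *from++;
    case 4:      *to++ = *from++;
    case 3:      *to++ = *from++;
    case 2:      *to++ = *from++;
    case 1:      *to++ = *from++;
            } while (--n > 0);
    }
}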

ReyBrujo

That is useful for masked blitting. As for your memcpy, try compiling with -O2 -ffast-math -fomit-frame-pointer -funroll-loops for maximum generic performance. I guess you can get it faster with assembler instructions, especially with MMX registers.

HoHo

Copying around the cache would probably give faster results. The only problem is that it needs SSE2, IIRC. Memory alignment and other tricks can get memcpy speed up to somewhere around 80% of the FSB's theoretical maximum throughput.

[edit]

Also try copying between 32/64-bit ints instead of chars. Speed should go up considerably. Using SIMD intrinsic variables (128-bit) would probably be even faster.
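
For example, a rough sketch of a cache-bypassing copy with SSE2 intrinsics (sse2_copy is a made-up name; it assumes dst and src are 16-byte aligned and n is a multiple of 64):

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stddef.h>

void sse2_copy(void *dst, const void *src, size_t n)
{
    __m128i       *d = (__m128i *)dst;
    const __m128i *s = (const __m128i *)src;
    size_t i;

    for (i = 0; i < n / 64; i++) {
        __m128i a = _mm_load_si128(s + 0);
        __m128i b = _mm_load_si128(s + 1);
        __m128i c = _mm_load_si128(s + 2);
        __m128i e = _mm_load_si128(s + 3);
        _mm_stream_si128(d + 0, a);   /* non-temporal store, bypasses the cache */
        _mm_stream_si128(d + 1, b);
        _mm_stream_si128(d + 2, c);
        _mm_stream_si128(d + 3, e);
        s += 4;
        d += 4;
    }
    _mm_sfence();   /* order the streaming stores before returning */
}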

ReyBrujo

Here is my old blit code. The MMX version is very fast, but assumes a lot (alignment, size, pointers, etc.).

Evert

Fascinating. I wouldn't expect it to be faster than libc's memcpy.

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/types.h>

#define CPUTIME (getrusage(RUSAGE_SELF,&ruse),\
        ruse.ru_utime.tv_sec + ruse.ru_stime.tv_sec + \
        1e-6 * (ruse.ru_utime.tv_usec + ruse.ru_stime.tv_usec))

struct rusage ruse;

extern int getrusage();

typedef struct { unsigned char dummy[32]; } DT;

void fastcpy(unsigned char* dst, unsigned char* src, size_t s)
{
    unsigned char* sa = src + s;
    DT *d1 = (DT*)dst - 1;
    DT *s1 = (DT*)src - 1;
    size_t si = s / sizeof(DT);

    si = (si + 7) / 8;
    switch (si % 8)
    {
    case 0: do { *++d1 = *++s1;
    case 7:      *++d1 = *++s1;
    case 6:      *++d1 = *++s1;
    case 5:      *++d1 = *++s1;
    case 4:      *++d1 = *++s1;
    case 3:      *++d1 = *++s1;
    case 2:      *++d1 = *++s1;
    case 1:      *++d1 = *++s1;
            } while (--si > 0);
    }

    dst = (unsigned char*)d1;
    src = (unsigned char*)s1;
    while (src < sa)
    {
        *++dst = *++src;
    }
}

int main(void)
{
    char *s1, *s2, *s3;
    unsigned int size;
    double t0, t1;

    size = 256*1024*1024;
    s1 = malloc(size);
    s2 = malloc(size);
    s3 = malloc(size);

    s1[size-1] = 1;
    s2[size-1] = 2;
    s3[size-1] = 3;

    t0 = CPUTIME;
    memcpy(s1, s2, size);
    t1 = CPUTIME;
    printf("CPU time = %f secs.\n", t1-t0);
    t0 = CPUTIME;
    fastcpy(s1, s3, size);
    t1 = CPUTIME;
    printf("CPU time = %f secs.\n", t1-t0);
}

Result:

Quote:

CPU time = 0.444931 secs.
CPU time = 0.278958 secs.

That's a factor of two (more or less). Maybe my test is flawed somehow? Or are computers really fast enough to copy around 512MB of RAM in a fraction of a second?

kazzmir

Evert, I don't see such improvements.

~/tmp $ !gcc
gcc s.c -o s -O3 -ffast-math -funroll-loops
s.c:11:49: warning: backslash and newline separated by space
~/tmp $ ./s
CPU time = 1.739736 secs.
CPU time = 1.604756 secs.
~/tmp $ ./s
CPU time = 1.719738 secs.
CPU time = 1.560763 secs.
~/tmp $

[edit]
In fact the following code shows memcpy is faster:

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/types.h>

#define MAX 32
#define TIMES 100

#define CPUTIME (getrusage(RUSAGE_SELF,&ruse),\
        ruse.ru_utime.tv_sec + ruse.ru_stime.tv_sec + \
        1e-6 * (ruse.ru_utime.tv_usec + ruse.ru_stime.tv_usec))

struct rusage ruse;

extern int getrusage();

typedef struct { unsigned char dummy[32]; } DT;

void fastcpy(unsigned char* dst, unsigned char* src, size_t s)
{
    unsigned char* sa = src + s;
    DT *d1 = (DT*)dst - 1;
    DT *s1 = (DT*)src - 1;
    size_t si = s / sizeof(DT);

    si = (si + 7) / 8;
    switch (si % 8)
    {
    case 0: do { *++d1 = *++s1;
    case 7:      *++d1 = *++s1;
    case 6:      *++d1 = *++s1;
    case 5:      *++d1 = *++s1;
    case 4:      *++d1 = *++s1;
    case 3:      *++d1 = *++s1;
    case 2:      *++d1 = *++s1;
    case 1:      *++d1 = *++s1;
            } while (--si > 0);
    }

    dst = (unsigned char*)d1;
    src = (unsigned char*)s1;
    while (src < sa)
    {
        *++dst = *++src;
    }
}

int main(void)
{
    char *s1, *s2, *s3;
    unsigned int size;
    double t0, t1;
    unsigned int i;

    size = MAX*1024*1024;
    s1 = malloc(size);
    s2 = malloc(size);
    // s3 = malloc(size);

    s1[size-1] = 1;
    s2[size-1] = 2;
    // s3[size-1] = 3;

    t0 = CPUTIME;
    for ( i = 0; i < TIMES; i++ ){
        memcpy(s1, s2, size);
    }
    t1 = CPUTIME;
    printf("CPU time = %f secs.\n", t1-t0);
    t0 = CPUTIME;
    for ( i = 0; i < TIMES; i++ ){
        fastcpy(s1, s2, size);
    }
    t1 = CPUTIME;
    printf("CPU time = %f secs.\n", t1-t0);
}

~/tmp $ !gcc
gcc s.c -o s -O3 -ffast-math -funroll-loops
s.c:12:49: warning: backslash and newline separated by space
~/tmp $ ./s
CPU time = 18.241228 secs.
CPU time = 19.291067 secs.

ReyBrujo
reybrujo@ansalon:~$ ./test
CPU time = 0.646902 secs.
CPU time = 0.415936 secs.

With -O3 there is a slight optimization, going down to 0.39xxxx.

For kazzmir's test, I get

reybrujo@ansalon:~$ ./test2
CPU time = 4.733280 secs.
CPU time = 4.358337 secs.

HoHo
Quote:

Or are computers really fast enough to copy around 512MB or RAM in a fraction of a second?

If the FSB is theoretically able to pass through ~6.4GiB/s of data, I see no problem with copying 0.5G in a fraction of a second. Although I think 0.2s is a bit slow. But that's to be expected from non-optimized code :P
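
For reference: 256MiB in roughly 0.28s works out to about 0.9GiB/s of copy throughput, and since every copied byte is both read and written, that is roughly 1.8GiB/s of bus traffic, still comfortably below a 6.4GiB/s peak.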

[edit]
Using code from here [vik.cc] and combining it with Evert's test I got this result:
CPU time = 0.532032 secs. // memcpy
CPU time = 0.232016 secs. // fastcpy
CPU time = 0.168011 secs. // code from url

[edit2]
A little more statistics :)
CPU time = 0.496030 secs, 516.097817 MiB/s.
CPU time = 0.272017 secs, 941.117651 MiB/s.
CPU time = 0.188012 secs, 1361.615216 MiB/s.

kazzmir

This seems to be slightly faster than the parent's 'fastcpy', although admittedly it is less portable to 64-bit machines. It is very readable though.

#define copy1 \
    *dest = *source;\
    dest++;\
    source++;

#define copy2 \
    *(unsigned short *)dest = *(unsigned short *)source;\
    dest += 2;\
    source += 2;

#define copy4 \
    *(unsigned int *)dest = *(unsigned int *)source;\
    dest += 4;\
    source += 4;

#define copy8 \
    *(unsigned long *)dest = *(unsigned long *)source;\
    dest += 8;\
    source += 8;

void mycopy( unsigned char * dest, unsigned char * source, size_t s ){
    size_t bytes = 0;
    while ( s ){
        switch( s ){
            case 1 : {
                copy1;
                s--;
                break;
            }
            case 2 : {
                copy2;
                s -= 2;
                break;
            }
            case 3 : {
                copy2;
                copy1;
                s -= 3;
                break;
            }
            case 4 : {
                copy4;
                s -= 4;
                break;
            }
            case 5 : {
                copy4;
                copy1;
                s -= 5;
                break;
            }
            case 6 : {
                copy4;
                copy2;
                s -= 6;
                break;
            }
            case 7 : {
                copy4;
                copy2;
                copy1;
                s -= 7;
                break;
            }
            default : {
                copy8;
                s -= 8;
            }
        }
    }
}

When run with 32 megs + 3 bytes, 100 times each, I get the following.

memcpy - CPU time = 9.766516 secs.
fastcpy - CPU time = 9.682528 secs.
mycopy - CPU time = 9.649533 secs.

Peter Wang

I only tried Evert's test quickly, but I found that the results depended on the order in which the functions were called. Whichever of fastcpy() or memcpy() was called second was faster. If I then repeated the calls, they came out pretty much the same. I haven't investigated, but I assume it's due to cache effects.

HoHo
Quote:

I haven't investigated, but I assume it's due to cache effects.

When moving 256M of data around, 0.5-2M of cache usually doesn't make much of a difference :)

[edit]
Though, you are correct:
CPU time = 0.500031 secs, 511.968258 MiB/s. //memcpy
CPU time = 0.156010 secs, 1640.920454 MiB/s. //memcpy
CPU time = 0.216013 secs, 1185.113859 MiB/s. //fastcpy
CPU time = 0.156010 secs, 1640.920454 MiB/s. //fastcpy
CPU time = 0.160010 secs, 1599.900006 MiB/s. //thingie from net
CPU time = 0.156010 secs, 1640.920454 MiB/s. //thingie from net

It might be about cache, but probably not the data cache. Perhaps the code gets cached and some jumps get predicted better in subsequent runs. Also, walking through the arrays beforehand might help for some reason.
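
If it really is first-touch cost rather than the data cache, one way to take it out of the measurement is to walk both buffers once before starting the timer. A sketch (touch_pages is a made-up helper and 4096 is an assumed page size):

#include <stddef.h>

/* Touch every page of the buffer once so that page faults are paid before timing. */
static void touch_pages(volatile unsigned char *p, size_t n)
{
    size_t i;
    for (i = 0; i < n; i += 4096)
        p[i] = 0;
}

/* usage, before the first CPUTIME reading:
   touch_pages((volatile unsigned char *)s1, size);
   touch_pages((volatile unsigned char *)s2, size); */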

Thomas Fjellstrom
#define copy8 \
    *(unsigned long *)dest = *(unsigned long *)source;\
    dest += 8;\
    source += 8;

That doesn't actually copy 8 bytes, you know. At least not on ia32. You need uint64_t or unsigned long long.

edit: fixed typo.

HoHo
Quote:

You need uint32_t or unsigned long long.

No, you would need either uint64_t or unsigned long long :P
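
That is, something like this (a sketch, assuming <stdint.h> is available):

#include <stdint.h>

#define copy8 \
    *(uint64_t *)dest = *(uint64_t *)source;\
    dest += 8;\
    source += 8;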

Too bad I don't have the DDJ journal with me right now. They had code in there that could achieve performance of about 5.5GiB/s when nullifying 10MiB of memory and ~12GiB/s when nulling 128kiB.
Their test machine was a 3GHz P4 with an 800MHz FSB (theoretical maximum throughput 6.4GiB/s).

Thomas Fjellstrom
Quote:

No, you would need either uint64_t or unsigned long long :P

Ooops, typo :o

Karadoc ~~

I haven't tried it. But if this behaves the same as memcpy except faster, then you should send it to the gcc people.

A J

has anyone written patches for blit using any of these techniques?

gering

@HoHo

Quote:

[edit]
Using code from here [vik.cc] and combining it with Evert's test I got this result:
CPU time = 0.532032 secs. // memcpy
CPU time = 0.232016 secs. // fastcpy
CPU time = 0.168011 secs. // code from url

[edit2]
A little more statistics :)
CPU time = 0.496030 secs, 516.097817 MiB/s.
CPU time = 0.272017 secs, 941.117651 MiB/s.
CPU time = 0.188012 secs, 1361.615216 MiB/s.

I know this url, too. I have also run tests with this version of memcpy... but it was always the same speed as fastcopy on 32-bit machines. On 64-bit I got a segmentation fault with the "code from url". :o

@Karadoc

Quote:

I haven't tried it. But if this behaves the same as memcpy except faster, then you should send it to the gcc people.

8-)
does someone have the gcc implementation?

@A J

Quote:

has anyone written patches for blit using any of these techniques?

I have written a <vector>, <list> and <string>... they come out a lot faster than the STL.

Here is the memmove:

void* fastmove(void* dst, void* src, unsigned long count)
{
    unsigned char* dst8 = (unsigned char*)dst;
    unsigned char* src8 = (unsigned char*)src;

    if (dst8 <= src8 || src8 + count < dst8)
    {
        fastcopy(dst8, src8, count);
    }
    else
    {
        long delta = (src8 + count) - dst8;

        fastcopy(dst8 + delta, src8 + delta, delta);
        fastcopy(dst8, src8, count - delta);
    }

    return dst;
}

[edit]
Can someone test if the fastcopy copies the data correctly? :-/ plssss
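
One way to check it (a rough sketch; it assumes the fastcpy from the first post is linked in, and uses guard bytes to catch writes past the end of the destination):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define N     4096
#define GUARD 64

void fastcpy(unsigned char* dst, unsigned char* src, size_t s);

int main(void)
{
    unsigned char *src = malloc(N + GUARD);
    unsigned char *a   = malloc(N + GUARD);
    unsigned char *b   = malloc(N + GUARD);
    size_t s, i;

    for (i = 0; i < N + GUARD; i++)
        src[i] = (unsigned char)(i * 31 + 7);      /* arbitrary pattern */

    for (s = 1; s <= N; s++) {
        memset(a, 0xAA, N + GUARD);
        memset(b, 0xAA, N + GUARD);
        memcpy(a, src, s);
        fastcpy(b, src, s);
        if (memcmp(a, b, N + GUARD) != 0) {        /* also compares the guard bytes */
            printf("mismatch at size %lu\n", (unsigned long)s);
            return 1;
        }
    }
    printf("all sizes up to %d OK\n", N);
    return 0;
}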

Arthur Kalliokoski

I've tried using 64-bit MMX registers in asm to blit; it's about 8% faster (on my machine) than a slightly unrolled loop using the 32-bit general registers. The Pentium 3 (or was it 4?) can use SSE registers with 128 bits at a time. You can't use FPU registers because some bit combinations will cause exceptions, slowing things down. I don't have SSE stuff so I can't test that. The 64-bit AMDs I have no idea about.

The AMD K6-2 does a string set (rep stosd) about as fast as an unrolled loop with 32-bit regs; Intel is faster with the 32-bit regs. Copying with regs is always faster than the string instructions on all my computers.

So I always use MMX when I'm trying for max FPS or whatever.

A J

How about using the 64-bit general purpose registers of the AMD64? That's got to be quicker than the 64-bit MMX registers, or is the limit still the FSB?

gering
void* zMem::Copy(void* dst, const void* src, unsigned long cnt)
{
    b32_t* dst32 = (b32_t*)dst;    // 256 bit
    b32_t* src32 = (b32_t*)src;

    unsigned long cnt32 = cnt / sizeof(b32_t);
    unsigned long rst32 = cnt % sizeof(b32_t);

    while (cnt32-- > 0) *dst32++ = *src32++;

    unsigned char* dst1 = (unsigned char*)dst32;
    unsigned char* src1 = (unsigned char*)src32;

    while (rst32-- > 0) *dst1++ = *src1++;

    return dst;
}

This has the same speed ... and should work correctly...
The main speed optimization over libc is the use of
typedef struct { unsigned char byte[32]; } b32_t;
so no Duff's Device is needed.

A J

I have read in some of the P4/AMD64/ICC optimization guides that array indexing is preferred over pointers, as this helps the compiler with aliasing (or whatever it's called). With pointers the compiler cannot guarantee that src and dst are not the same, or do not overlap, and therefore cannot make many optimizations; with array indexing, the compiler can figure out that they don't overlap and can perform pipeline/SSE/SIMD/cache optimizations.
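
For what it's worth, C99 also lets you state the no-overlap guarantee explicitly with the restrict qualifier. A sketch (copy_words is just an illustrative name; behaviour is undefined if the buffers actually overlap):

#include <stddef.h>

/* restrict promises the compiler that dst and src never alias,
   so it is free to unroll and vectorize the loop. */
void copy_words(unsigned int * restrict dst,
                const unsigned int * restrict src, size_t n)
{
    size_t i;
    for (i = 0; i < n; i++)
        dst[i] = src[i];
}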

Arthur Kalliokoski

The aliasing issue is supposed to be that if you wind up with two pointers pointing to the same thing, one pointer could alter the "true" variable, and if this variable was also in a register somewhere else, the register would have the wrong value, like a volatile variable. There's supposed to be a switch to turn off aliasing (i.e. you tell the compiler you're smart enough not to do that), but from what I've seen it doesn't always work.
