Allegro.cc - Online Community

Allegro.cc Forums » Programming Questions » Hash function shootout!

This thread is locked; no one can reply to it. rss feed Print
 1   2 
Hash function shootout!
Paul Pridham
Member #250
April 2000
avatar

Lately I have been looking into hash functions, as I wanted something that would distribute evenly over a power-of-two-sized table, using ASCII key values, and would be fast.

Some of the common and popular hash functions I tried performed poorly, so I decided to try writing my own based on a hunch. And whaddya know, it seems to kick some butt!

Below is some test code which runs a variety of tests on a bunch of hash functions and spits out some statistics and such. The functions I wrote are the hash_rand*() ones, and in particular, hash_rand2() seems to be about the best one to use. Also included are hash functions PJW, CRC, One At A Time, Super Fast Hash, and the Allegro hash function!

[code]
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>

#define TEST_COUNT 100000000

#define TABLE_SIZE 4096
#define KEY_COUNT (TABLE_SIZE*5)
#define KEY_LEN 6
#define MASK (TABLE_SIZE-1)

char test[KEY_COUNT*KEY_LEN];
int key_hits[TABLE_SIZE];


unsigned int hash_rand(char *key, unsigned int len)
{
unsigned int i, seed=0;

for(i=0; i<len; i++)
{
seed=seed<<2;
seed+=key[i];
}

seed = seed * 1103515245 + 12345;
return (unsigned int)(seed>>16) & 65535;
}


// ** WINNER! **
unsigned int hash_rand2(char *key, unsigned int len)
{
unsigned int i, seed=0;

for(i=0; i<len; i++)
{
seed=seed<<2;
seed+=key[i];
}

return seed * 1103515245 + 12345;
}


unsigned int hash_rand3(char *key, unsigned int len)
{
unsigned int i, seed=0;

for(i=0; i<len; i++)
{
seed=seed<<2;
seed+=key[i];
}

return (seed+1) * 1103515245 + 12345;
}


unsigned int hash_rand4(char *key, unsigned int len)
{
unsigned int i, seed=0;

for(i=0; i<len; i++)
{
seed+=key[i];
}

return (seed+1) * 1103515245 + 12345;
}


unsigned int hash_crc(char *key, unsigned int len)
{
unsigned int hash=0, i, highorder;

for (i=0; i<len; ++i)
{
highorder = hash & 0xf8000000;
hash = hash<<8;
hash ^= highorder>>27;
hash ^= key[i];
}

return hash;
}


unsigned int hash_pjw(char *key, unsigned int len)
{
unsigned int h = 0, g, i;

for(i = 0; i<len; i++)
{
h = (h << 4) + key[i];
if((g = h & 0xf0000000))
{
h = h ^ (g >> 24);
h = h ^ g;
}
}

return h;
}


// "One At A Time" hash
unsigned int hash_oaat(char *key, unsigned int len)
{
unsigned int hash, i;

for (hash=0, i=0; i<len; ++i)
{
hash += key[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}

hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);

return hash;
}


#define HASH_ROTATION 3

unsigned int hash_allegro(char *key, unsigned int len)
{
unsigned int hash=0, i;

for (i = 0; i<len; i++)
{
hash = ((hash >> HASH_ROTATION) | ((hash << (16 - HASH_ROTATION)) &
0xffff)) ^ (int)key[i];
}

return hash;
}


typedef unsigned short uint16;

#define get16bits(d) (*((const uint16*) (d)))

unsigned int hash_superfast(char * data, unsigned int len)
{
unsigned int hash = 0, tmp;
int rem;

if (len <= 0 || data == NULL) return 0;

rem = len & 3;
len >>= 2;

/* Main loop */
for (;len > 0; len--)
{
hash += get16bits (data);
tmp = (get16bits (data+2) << 11) ^ hash;
hash = (hash << 16) ^ tmp;
data += 2*sizeof (uint16);
hash += hash >> 11;
}

/* Handle end cases */
switch (rem)
{
case 3: hash += get16bits (data);
hash ^= hash << 16;
hash ^= data[sizeof (uint16)] << 18;
hash += hash >> 11;
break;
case 2: hash += get16bits (data);
hash ^= hash << 11;
hash += hash >> 17;
break;
case 1: hash += *data;
hash ^= hash << 10;
hash += hash >> 1;
}

/* Force "avalanching" of final 127 bits */
hash ^= hash << 3;
hash += hash >> 5;
hash ^= hash << 2;
hash += hash >> 15;
hash ^= hash << 10;

return hash;
}


void clear_hits()
{
memset(key_hits, 0, TABLE_SIZE*sizeof(int));
}


int get_hits(int *chain, int *unused)
{
int i, collisions=0;

*chain=0;
*unused=0;

for(i=0; i<TABLE_SIZE; i++)
{
if(key_hits[i]>1)
{
collisions+=key_hits[i]-1;
}

if(key_hits[i]>*chain)
{
*chain=key_hits[i];
}

if(!key_hits[i])
{
(*unused)++;
}
}

return collisions;
}


void emit(char *ptr, int len)
{
int i;

for(i=0; i<len; i++)
{
putchar(*(ptr+i));
}
}


void show_distribution()
{
int i;
int sum=0, count=0;
int tenth=0;
int total=0;

for(i=0; i<TABLE_SIZE; i++)
{
sum+=key_hits[i];
total+=key_hits[i];

if(count>=TABLE_SIZE/10)
{
count=0;
tenth++;
printf("%2d: %2d (%.4f)\n", tenth, sum, ((float)sum/KEY_COUNT)*100);
sum=0;
}
count++;
}
}


float get_variance()
{
int i;
int sum=0, count=0;
int tenth=0;
int total=0;
float mean;
float variance = 0;

for(i=0; i<TABLE_SIZE; i++)
{
total+=key_hits[i];
count++;
}

mean = (float)total / (float)count;

for(i=0; i<TABLE_SIZE; i++)
{
float d = key_hits[i] - mean;
variance += d * d;
}

variance /= (float)count;

return variance;
}


void test_hash(char *name, unsigned int (*func)(char*, unsigned int))
{
unsigned int hash, i, longest, unused;

clear_hits();

for(i=0; i<KEY_COUNT; i++)
{
hash=func(&test[i*KEY_LEN], KEY_LEN);
key_hits[hash&MASK]++;
}

printf("%s distribution:\n", name);
show_distribution();

printf("%s collisions: %d\n", name, get_hits(&longest, &unused));
printf("%s unused: %d\n", name, unused);
printf("%s longest chain: %d\n", name, longest);
printf("%s variance: %f\n", name, get_variance());
}


void time_hash(char *name, unsigned int (*func)(char*, unsigned int))
{
unsigned int dur, i;

dur=clock();

for(i=0; i<TEST_COUNT; i++)
{
func(test, KEY_LEN);
}

dur=clock()-dur;

printf("%s %d: %fs\n\n", name, TEST_COUNT, (double)dur/CLOCKS_PER_SEC);
}


int main()
{
int i;

srand(time(NULL));

for(i=0; i<KEY_COUNT*KEY_LEN; i++)
{

// printable ASCII
// test[i]=32+rand()%94;

// lowercase alphabet - PJW does worse here
test[i]=97+rand()%26;
}

printf("* Table size: %d, keys: %d\n\n", TABLE_SIZE, KEY_COUNT);


test_hash("crc", hash_crc);
time_hash("crc", hash_crc);

test_hash("pjw", hash_pjw);
time_hash("pjw", hash_pjw);

test_hash("OAAT", hash_oaat);
time_hash("OAAT", hash_oaat);

test_hash("superfast", hash_superfast);
time_hash("superfast", hash_superfast);

test_hash("allegro", hash_allegro);
time_hash("allegro", hash_allegro);

test_hash("rand", hash_rand);
time_hash("rand", hash_rand);

test_hash("rand2", hash_rand2);
time_hash("rand2", hash_rand2);

test_hash("rand3", hash_rand3);
time_hash("rand3", hash_rand3);

test_hash("rand4", hash_rand4);
time_hash("rand4", hash_rand4);

getchar();

return 0;
}
[/code]

And here are the results on my machine (AMD Duron), compiled with MSVC 6:

[code]
* Table size: 4096, keys: 20480

crc distribution:
1: 1532 (7.4805)
2: 790 (3.8574)
3: 2284 (11.1523)
4: 1683 (8.2178)
5: 2454 (11.9824)
6: 3143 (15.3467)
7: 1542 (7.5293)
8: 2929 (14.3018)
9: 1491 (7.2803)
10: 2632 (12.8516)
crc collisions: 20032
crc unused: 3648
crc longest chain: 82
crc variance: 224.945313
crc 100000000: 6.439000s

pjw distribution:
1: 2399 (11.7139)
2: 1514 (7.3926)
3: 1256 (6.1328)
4: 1227 (5.9912)
5: 1542 (7.5293)
6: 2325 (11.3525)
7: 2701 (13.1885)
8: 2427 (11.8506)
9: 2381 (11.6260)
10: 2683 (13.1006)
pjw collisions: 16555
pjw unused: 171
pjw longest chain: 19
pjw variance: 11.394043
pjw 100000000: 7.521000s

OAAT distribution:
1: 2081 (10.1611)
2: 2066 (10.0879)
3: 1986 (9.6973)
4: 1990 (9.7168)
5: 2055 (10.0342)
6: 2055 (10.0342)
7: 2098 (10.2441)
8: 1998 (9.7559)
9: 2077 (10.1416)
10: 2040 (9.9609)
OAAT collisions: 16411
OAAT unused: 27
OAAT longest chain: 15
OAAT variance: 4.978516
OAAT 100000000: 9.043000s

superfast distribution:
1: 2067 (10.0928)
2: 1996 (9.7461)
3: 2026 (9.8926)
4: 2063 (10.0732)
5: 1979 (9.6631)
6: 2092 (10.2148)
7: 2071 (10.1123)
8: 2031 (9.9170)
9: 1996 (9.7461)
10: 2136 (10.4297)
superfast collisions: 16407
superfast unused: 23
superfast longest chain: 16
superfast variance: 5.103516
superfast 100000000: 6.359000s

allegro distribution:
1: 1834 (8.9551)
2: 2145 (10.4736)
3: 2025 (9.8877)
4: 2034 (9.9316)
5: 2276 (11.1133)
6: 1994 (9.7363)
7: 2159 (10.5420)
8: 1983 (9.6826)
9: 1830 (8.9355)
10: 2177 (10.6299)
allegro collisions: 16423
allegro unused: 39
allegro longest chain: 14
allegro variance: 5.588867
allegro 100000000: 6.589000s

rand distribution:
1: 2012 (9.8242)
2: 2034 (9.9316)
3: 1970 (9.6191)
4: 2053 (10.0244)
5: 2132 (10.4102)
6: 2107 (10.2881)
7: 2014 (9.8340)
8: 2024 (9.8828)
9: 2041 (9.9658)
10: 2071 (10.1123)
rand collisions: 16416
rand unused: 32
rand longest chain: 15
rand variance: 5.151367
rand 100000000: 4.437000s

rand2 distribution:
1: 2044 (9.9805)
2: 2095 (10.2295)
3: 1966 (9.5996)
4: 2037 (9.9463)
5: 2070 (10.1074)
6: 1963 (9.5850)
7: 2080 (10.1563)
8: 2046 (9.9902)
9: 2114 (10.3223)
10: 2038 (9.9512)
rand2 collisions: 16428
rand2 unused: 44
rand2 longest chain: 16
rand2 variance: 5.331055
rand2 100000000: 4.446000s

rand3 distribution:
1: 2086 (10.1855)
2: 1957 (9.5557)
3: 2039 (9.9561)
4: 2079 (10.1514)
5: 1965 (9.5947)
6: 2074 (10.1270)
7: 2044 (9.9805)
8: 2114 (10.3223)
9: 2042 (9.9707)
10: 2048 (10.0000)
rand3 collisions: 16428
rand3 unused: 44
rand3 longest chain: 16
rand3 variance: 5.331055
rand3 100000000: 4.416000s

rand4 distribution:
1: 2038 (9.9512)
2: 1916 (9.3555)
3: 2008 (9.8047)
4: 2049 (10.0049)
5: 2076 (10.1367)
6: 2025 (9.8877)
7: 1948 (9.5117)
8: 2321 (11.3330)
9: 2043 (9.9756)
10: 2056 (10.0391)
rand4 collisions: 20357
rand4 unused: 3973
rand4 longest chain: 459
rand4 variance: 1523.432129
rand4 100000000: 4.437000s
[/code]

Feel free to run your own tests, pick it apart, and finally bow down to hash_rand2() as teh ultimaet hash and submit to using it in your own projects! :D ;)
Elias
Member #358
May 2000

Here's results from me, P4, compiled with -march=pentium4 -O3 with gcc 4.0, and with the key length changed from 6 to 32:

1* Table size: 4096, keys: 20480
2 
3crc distribution:
4 1: 1603 (7.8271)
5 2: 796 (3.8867)
6 3: 2123 (10.3662)
7 4: 1700 (8.3008)
8 5: 2407 (11.7529)
9 6: 3168 (15.4688)
10 7: 1572 (7.6758)
11 8: 2940 (14.3555)
12 9: 1431 (6.9873)
1310: 2740 (13.3789)
14crc collisions: 20032
15crc unused: 3648
16crc longest chain: 82
17crc variance: 225.203613
18crc 100000000: 6.330000s
19 
20pjw distribution:
21 1: 2050 (10.0098)
22 2: 2056 (10.0391)
23 3: 2032 (9.9219)
24 4: 2043 (9.9756)
25 5: 2022 (9.8730)
26 6: 2070 (10.1074)
27 7: 2056 (10.0391)
28 8: 2056 (10.0391)
29 9: 2033 (9.9268)
3010: 2039 (9.9561)
31pjw collisions: 16467
32pjw unused: 83
33pjw longest chain: 16
34pjw variance: 7.468262
35pjw 100000000: 10.420000s
36 
37OAAT distribution:
38 1: 2045 (9.9854)
39 2: 2076 (10.1367)
40 3: 2047 (9.9951)
41 4: 2033 (9.9268)
42 5: 2122 (10.3613)
43 6: 2063 (10.0732)
44 7: 2029 (9.9072)
45 8: 1994 (9.7363)
46 9: 2041 (9.9658)
4710: 2008 (9.8047)
48OAAT collisions: 16408
49OAAT unused: 24
50OAAT longest chain: 14
51OAAT variance: 4.930176
52OAAT 100000000: 9.040000s
53 
54superfast distribution:
55 1: 2110 (10.3027)
56 2: 2056 (10.0391)
57 3: 2051 (10.0146)
58 4: 2078 (10.1465)
59 5: 2027 (9.8975)
60 6: 1996 (9.7461)
61 7: 2039 (9.9561)
62 8: 2035 (9.9365)
63 9: 1998 (9.7559)
6410: 2066 (10.0879)
65superfast collisions: 16412
66superfast unused: 28
67superfast longest chain: 14
68superfast variance: 5.017090
69superfast 100000000: 3.380000s
70 
71allegro distribution:
72 1: 2019 (9.8584)
73 2: 2089 (10.2002)
74 3: 1991 (9.7217)
75 4: 2086 (10.1855)
76 5: 2014 (9.8340)
77 6: 2153 (10.5127)
78 7: 2009 (9.8096)
79 8: 2040 (9.9609)
80 9: 2007 (9.7998)
8110: 2052 (10.0195)
82allegro collisions: 16415
83allegro unused: 31
84allegro longest chain: 17
85allegro variance: 5.188477
86allegro 100000000: 6.980000s
87 
88rand distribution:
89 1: 2039 (9.9561)
90 2: 2022 (9.8730)
91 3: 2029 (9.9072)
92 4: 2084 (10.1758)
93 5: 2066 (10.0879)
94 6: 2007 (9.7998)
95 7: 2010 (9.8145)
96 8: 2088 (10.1953)
97 9: 1986 (9.6973)
9810: 2130 (10.4004)
99rand collisions: 16407
100rand unused: 23
101rand longest chain: 13
102rand variance: 4.880371
103rand 100000000: 5.050000s
104 
105rand2 distribution:
106 1: 2002 (9.7754)
107 2: 2039 (9.9561)
108 3: 2060 (10.0586)
109 4: 2087 (10.1904)
110 5: 2012 (9.8242)
111 6: 1983 (9.6826)
112 7: 2076 (10.1367)
113 8: 2064 (10.0781)
114 9: 2081 (10.1611)
11510: 2051 (10.0146)
116rand2 collisions: 16415
117rand2 unused: 31
118rand2 longest chain: 15
119rand2 variance: 5.338379
120rand2 100000000: 5.110000s
121 
122rand3 distribution:
123 1: 2046 (9.9902)
124 2: 2062 (10.0684)
125 3: 2073 (10.1221)
126 4: 2025 (9.8877)
127 5: 1987 (9.7021)
128 6: 2068 (10.0977)
129 7: 2063 (10.0732)
130 8: 2077 (10.1416)
131 9: 2068 (10.0977)
13210: 1988 (9.7070)
133rand3 collisions: 16415
134rand3 unused: 31
135rand3 longest chain: 15
136rand3 variance: 5.338379
137rand3 100000000: 5.170000s
138 
139rand4 distribution:
140 1: 2006 (9.7949)
141 2: 2020 (9.8633)
142 3: 1995 (9.7412)
143 4: 2003 (9.7803)
144 5: 2030 (9.9121)
145 6: 2078 (10.1465)
146 7: 1967 (9.6045)
147 8: 2019 (9.8584)
148 9: 2149 (10.4932)
14910: 2092 (10.2148)
150rand4 collisions: 20202
151rand4 unused: 3818
152rand4 longest chain: 223
153rand4 variance: 664.569824
154rand4 100000000: 5.670000s

superfast now outperforms rand2. As we talked on IRC, there may be a way to get around the byte by byte adding with rand2 though, which likely is where superfast gets an advantage with the 16bit at once adding.

--
"Either help out or stop whining" - Evert

Paul Pridham
Member #250
April 2000
avatar

As you know, Elias, (since we collaborated on the function ;)) we now have hash_rand5() which apparently blows all other competition out of the water. I'd like to see what the gotcha is, surely there must be something.

Here's the function:

1unsigned int hash_rand5(char *key, unsigned int len)
2{
3 unsigned int i, seed=0, rem;
4 
5 rem = len & 3;
6 
7 len>>=2;
8 
9 for(i=0; i<len; i++)
10 {
11 seed=seed<<2;
12 seed+=((unsigned int*)key)<i>;
13 }
14 
15 switch(rem)
16 {
17 case 1: seed=seed<<2; seed+=((unsigned char*)key)[i << 2];
18 break;
19 
20 case 2: seed=seed<<3; seed+=((unsigned short *)key)[i << 1];
21 break;
22 
23 case 3: seed=seed<<4; seed+=((unsigned short *)key)[i << 1] +
24 ((unsigned char*)key)[i << 2 + 2];
25 break;
26 }
27 
28 return seed * 1103515245 + 12345;
29}

And here are my results for a random lowercase alphabetic key of 63 characters:

1* Table size: 4096, keys: 20480
2 
3crc distribution:
4 1: 1604 (7.8320)
5 2: 786 (3.8379)
6 3: 2106 (10.2832)
7 4: 1772 (8.6523)
8 5: 2519 (12.2998)
9 6: 3025 (14.7705)
10 7: 1609 (7.8564)
11 8: 2917 (14.2432)
12 9: 1434 (7.0020)
1310: 2708 (13.2227)
14crc collisions: 20032
15crc unused: 3648
16crc longest chain: 78
17crc variance: 224.199707
18crc 5000000: 2.654000s
19 
20pjw distribution:
21 1: 2099 (10.2490)
22 2: 2066 (10.0879)
23 3: 1926 (9.4043)
24 4: 2101 (10.2588)
25 5: 2009 (9.8096)
26 6: 2099 (10.2490)
27 7: 2050 (10.0098)
28 8: 2036 (9.9414)
29 9: 2049 (10.0049)
3010: 2036 (9.9414)
31pjw collisions: 16467
32pjw unused: 83
33pjw longest chain: 18
34pjw variance: 7.376953
35pjw 5000000: 4.095000s
36 
37OAAT distribution:
38 1: 2040 (9.9609)
39 2: 2007 (9.7998)
40 3: 2085 (10.1807)
41 4: 2049 (10.0049)
42 5: 2076 (10.1367)
43 6: 2120 (10.3516)
44 7: 2033 (9.9268)
45 8: 2025 (9.8877)
46 9: 2056 (10.0391)
4710: 1966 (9.5996)
48OAAT collisions: 16424
49OAAT unused: 40
50OAAT longest chain: 14
51OAAT variance: 5.213379
52OAAT 5000000: 3.505000s
53 
54superfast distribution:
55 1: 1985 (9.6924)
56 2: 2043 (9.9756)
57 3: 2015 (9.8389)
58 4: 2061 (10.0635)
59 5: 2019 (9.8584)
60 6: 2086 (10.1855)
61 7: 2070 (10.1074)
62 8: 2000 (9.7656)
63 9: 2040 (9.9609)
6410: 2136 (10.4297)
65superfast collisions: 16411
66superfast unused: 27
67superfast longest chain: 13
68superfast variance: 5.139648
69superfast 5000000: 1.341000s
70 
71allegro distribution:
72 1: 2050 (10.0098)
73 2: 2007 (9.7998)
74 3: 2047 (9.9951)
75 4: 2041 (9.9658)
76 5: 2039 (9.9561)
77 6: 1999 (9.7607)
78 7: 2074 (10.1270)
79 8: 2026 (9.8926)
80 9: 2087 (10.1904)
8110: 2083 (10.1709)
82allegro collisions: 16403
83allegro unused: 19
84allegro longest chain: 14
85allegro variance: 4.908203
86allegro 5000000: 2.534000s
87 
88rand distribution:
89 1: 2069 (10.1025)
90 2: 2046 (9.9902)
91 3: 2041 (9.9658)
92 4: 2055 (10.0342)
93 5: 2059 (10.0537)
94 6: 1991 (9.7217)
95 7: 2010 (9.8145)
96 8: 2104 (10.2734)
97 9: 1997 (9.7510)
9810: 2080 (10.1563)
99rand collisions: 16407
100rand unused: 23
101rand longest chain: 13
102rand variance: 4.951660
103rand 5000000: 1.312000s
104 
105rand2 distribution:
106 1: 2040 (9.9609)
107 2: 2053 (10.0244)
108 3: 2070 (10.1074)
109 4: 2019 (9.8584)
110 5: 2092 (10.2148)
111 6: 2083 (10.1709)
112 7: 2083 (10.1709)
113 8: 2013 (9.8291)
114 9: 1970 (9.6191)
11510: 2026 (9.8926)
116rand2 collisions: 16423
117rand2 unused: 39
118rand2 longest chain: 16
119rand2 variance: 5.242188
120rand2 5000000: 1.292000s
121 
122rand3 distribution:
123 1: 2052 (10.0195)
124 2: 2070 (10.1074)
125 3: 2038 (9.9512)
126 4: 2078 (10.1465)
127 5: 2088 (10.1953)
128 6: 2073 (10.1221)
129 7: 2016 (9.8438)
130 8: 1974 (9.6387)
131 9: 2018 (9.8535)
13210: 2046 (9.9902)
133rand3 collisions: 16423
134rand3 unused: 39
135rand3 longest chain: 16
136rand3 variance: 5.242188
137rand3 5000000: 1.312000s
138 
139rand4 distribution:
140 1: 2129 (10.3955)
141 2: 2053 (10.0244)
142 3: 2096 (10.2344)
143 4: 2003 (9.7803)
144 5: 2027 (9.8975)
145 6: 2028 (9.9023)
146 7: 2019 (9.8584)
147 8: 1956 (9.5508)
148 9: 2003 (9.7803)
14910: 2042 (9.9707)
150rand4 collisions: 20081
151rand4 unused: 3697
152rand4 longest chain: 150
153rand4 variance: 461.498535
154rand4 5000000: 1.372000s
155 
156rand5 distribution:
157 1: 2031 (9.9170)
158 2: 2026 (9.8926)
159 3: 2004 (9.7852)
160 4: 2072 (10.1172)
161 5: 2089 (10.2002)
162 6: 2076 (10.1367)
163 7: 2064 (10.0781)
164 8: 2062 (10.0684)
165 9: 2008 (9.8047)
16610: 2014 (9.8340)
167rand5 collisions: 16405
168rand5 unused: 21
169rand5 longest chain: 15
170rand5 variance: 4.972168
171rand5 5000000: 0.581000s

orz
Member #565
August 2000

Those are not good hash functions.

hash_rand collides for strings like this pair:
"my really long test string is super great"
"his really long test string is super great"
The flaw exploited here is that it ignores early letters in long strings. It also has other flaws, like this:
"az"
"bv"
(consecutive letters can cancel each other out easily)

hash_rand2 and hash_rand3 collide on exactly the same things as hash_rand.

hash_rand4 is worse than the previous ones. any reordering of letters results in the same hash.
"ab"
"ba"

hash_rand5 has the same flaws as hash_rand 1-3, but it requires slightly longer strings:
"his really really really REALLY long test string is super super SUPER DUPER great (yes it really is)"
"her really really really REALLY long test string is super super SUPER DUPER great (yes it really is)"

or, here's an example using the other flaw mentioned for hash_rand 1-3: "a???z???"
"b???v???"

Also, for short strings, the latter bytes do not effect the lower bits of the results well. For instance, the difference between "abc " and "abc0" is only in the uppermost 4 bits of the hash.

I recommend you start by reading some on the subject...
Bob Jenkins has some okay stuff written on the subject: http://burtleburtle.net/bob/hash/index.html
Knuth also wrote some on the subject, in The Art of Computer Programming, volume 3 "Sorting and Searching", chapter 6.4.
There's probably lots more resources on the web.

Bob
Free Market Evangelist
September 2000
avatar

Once you get your string hash function going, you may want to modify my hash code to manage hash sets.

Link

--
- Bob
[ -- All my signature links are 404 -- ]

Paul Pridham
Member #250
April 2000
avatar

Hey, thanks for bursting my bubble! :'(

Heh, no really, that's exactly why I posted this. :) Orz, your analysis is quite helpful. I have seen Bob Jenkin's page, and in fact many of the hash functions I've tested alongside my rand* family came from there.

I figured there were some gotchas with my functions (other than the obvious ones that my testing revealed for rand4(), which was a test case in badness), especially since they seem to perform so well for the type of cases I'll use them for (and for which the tests are a little biased).

rand5() is definitely the best up to this point, and with a bit of tweaking I'm sure the early characters influence will persist. Even as it stands, I'm pretty happy with the results. Nevertheless, it looks like I'm starting on rand6() now... ;)

Bob, thanks for posting your hash map code! It looks pretty interesting, and I'll definitely give some serious perusal and tweaking (here I was about to write a chained hash table :P) once I get my function sorted out. :)

EDIT:

orz, I've confirmed the first two flaws stated for rand1-3 (early letters, cancelling), and the 2nd flaw mentioned for rand5 (a???z???) but have been unable to confirm the early letters flaw in rand5 using a longer string. I've tried up to a 200 character string with only the first letter different, and I'm still getting unique hashes. Can you give me a working example of this flaw?

orz
Member #565
August 2000

stupid smileys make it difficult to read the second example for hash_rand5...

Quote:

orz, I've confirmed the first two flaws stated for rand1-3 (early letters, cancelling), and the 2nd flaw mentioned for rand5 (a???z???) but have been unable to confirm the early letters flaw in rand5 using a longer string. I've tried up to a 200 character string with only the first letter different, and I'm still getting unique hashes. Can you give me a working example of this flaw?

Hm... strange. The two samples (100 characters long) I posted for the early-letters flaw in rand5 both produce a hash of 1675257485 in my test. I'm using exactly the code posted in your earlier post for hash_rand5, except I added a "const" keyword to prevent it from objecting to taking a literal string in C++ (I pasted the code into one of my C++ projects). This is my test code:

  const char *str1 = "his really really really REALLY long test string is super super SUPER DUPER great (yes it really is)";
  const char *str2 = "her really really really REALLY long test string is super super SUPER DUPER great (yes it really is)";
  int len1 = strlen(str1);
  int len2 = strlen(str2);
  int h1 = hash_rand5(str1, len1);
  int h2 = hash_rand5(str2, len2);
  printf("%11d len=%2d str=\"%s\"\n", h1, len1, str1);
  printf("%11d len=%2d str=\"%s\"\n", h2, len2, str2);

Looking at the code for hash_rand5, this should work for any two strings that have the same last 64 characters and the same (length % 4).

Here's a suggestion for testing:
Generate a random string of some length. Treat the string as a long binary number. Loop, adding 1 to the string each time. Hash it, use the low 16 bits of the hash as an index into one table, see if the table is lopsided after a few million calls. Repeat, but interpret the string as a backward number, adding 1 to the last character, and carrying to the second-to-last character.

Bob Jenkins covered most of this somewhere in his stuff, but you've already read some of his stuff and I've already linked him, so I'll try covering it myself:
In order to remove early-characters-losing-relevance, you must not progressively lose information. In your code, you have each iteration of the loop do a "seed=seed<<2;". This means that the highest 2 bits of seed are lost. Nothing uses them. Thus, nothing more than 16 loops ago (32 bit integers, divided by 2 bits per loop) can have any effect upon the value of seed. Here's a few alternatives:
1. "seed = seed ^ (seed << 2);"
This fixes the early-characters problem, but it does not cause any higher bits to effect lower bits. The addition that mixes input in also does not permit higher bits to effect lower bits. Thus, the lower bits of seed will end up sucking. In particular, the lowest bit will be simply the XOR of all input words lowest bits.

2. "seed = seed ^ (seed >> 2);"
This fixes the early-characters problem, and the low-bits problem, but it has serious issues if the input is mixed in with simmple addition. It means that the low bits will not spread out except through the addition, which is a very poor way to spread the influence of a bit. It's very easy for consecutive characters to cancel each other.

3. "seed = (seed << 1) ^ (seed >> 2);"
This fixes the early-characters problem, and the low-bit problem, and improves the bit-spreading issues. It's better than all of the previous examples, and while it still suffers from many problems, it probably does okay for many real world cases. Nearby letters can cancel each other, but it often takes relatively complex relationships to cancel. Far apart letters can cancel, but it usually takes complex relationships between such letters to cancel. Choosing the shift lengths is a bit of a balance. Large shift values mean that many bits in a single iteration are dependant upon only 1 bit in the previous iteration (poor spreading), but small shifts mean that spreading doesn't add up as well across multiple iterations. I believe that the best values pairs tend to involve a right-shift slightly larger than the left-shift if addition is used to mix in input. I know I made some unsupported statements in there, but it's to figure out how to talk about things... I don't know words to explain some of this very well or at all, and I'm not sure that such words even exist, and to do a decent job I'd probably need lots of diagrams...

4. "seed = (seed << 7) | (seed >> 25);"
A circular shift (the shifts add up to the word size). Faster on some hardware than the above examples. Each bit in the output depends upon only one bit in the input, so this isn't great. In particular, inserting 32 consecutive 0s in the input will not change the output, no matter where they're inserted. Still, this does okay for some real-world things. I think Knuth recommended something like this somewhere. It's imporant that the shifts be odd, not even.

5. "seed = seed * 1103515245;"
Mixes decently, except that the high bits of the input don't get spread out, the low bits of the output suck. Could work well with a little extra tweaking. Fast on modern desktops, but slow on some architectures.

Guidelines:
A. (very important guideline)
Don't lose information. Every bit in an intermediate state must have some effect on the next intermediate state, or you will have early-characters problems.

B. (important guideline)
Every input bit should be capable of effecting any bit in the final output, depending upon context (context = other input). In any particular context, swapping a single bit should change roughly half the output bits.

C. (important guideline)
Each intermediate state bit should effect multiple bits in the next intermediate state bit, or you run the chance of things canceling out.

D. (guideline)
If the state size is larger than the input chunk size, then consecutive input chunks will not be able to cancel each other (unles you screw up somehow...). This also may make it harder for even far apart chunks to cancel each other.

E. (my own observation/intuition/recommendation/whatever)
The previous two guidelines (C & D) are very important if the state is small, but not so important if the state is large.

F. (guideline)
Many algorithms are not effected by leading 0s at the begining of a string. That can be handled by using the length of the string as an input in the hash.

F. (my own observation/intuition/recommendation/whatever)
In general, a larger state is capable of doing a better job than a smaller state of comparable complexity. However, very large states may have speed problems, particularly when hashing short strings.

G. (my own observation/intuition/recommendation/whatever)
Multiplication is great on modern desktops & servers, but very slow in embedded contexts. Despite the nice job it does mixing bits, it still has some problems: the high bits of the result are good, but the low bits of the result aren't so good, because they are not effected by the high bits of the input. If multiplication is used as a major factor in the algorithm, you need something else to stir around the upper bits of the input, or they will suffer compared to the lower bits.
Addition mixes bits much slower (particularly if nothing else is stirring them up), but does mix them. It has the same issues as multiplication in that the low bits of the output are uneffected by the high bits of the input.

CGamesPlay
Member #2,559
July 2002
avatar

Quote:

Addition mixes bits much slower

Wait, you're saying that multiplcation is faster time-wise than addition? Or did you mean addition isn't as effective as multiplication?

--
Tomasu: Every time you read this: hugging!

Ryan Patterson - <http://cgamesplay.com/>

orz
Member #565
August 2000

Um, what I was trying to say is that, with multiplication, bits spread to the left very quickly on a per-multiply basis, whereas with addition, they spread to the left by maybe one bit per addition.
On a per operation basis, multiply is much more effective than addition.
On a per-time basis, multiply usually beats addition by a little bit on modern desktop CPUs, but is beaten by addition (by a larger margin) in some embedded contexts. Both operations must be mixed with other operations because they only spread bits leftwards, so the rightmost bits of output and the leftmost bits of input get negleected even if there's no rightshift or other operation that moves or spreads bits rightward.

To illustrate, here's a loop that is bad because of that:

for (int i = 0; i < length; i++) {
  seed += key<i>;
  //bad! both addition and multiplication only spread bits leftwards!
  seed = seed * 1103515245 + 12345;
}

compared to a good loop, or at least, a loop that doesn't have that flaw:

for (int i = 0; i < length; i++) {
  seed += key<i>;
  //okay: the multiply spreads bits leftwards and
  //  the circular shift moves the bits around
  seed = seed * 1103515245;
  seed = (seed >> 17) | (seed << 15);
}

For the bad loop, the low bit of the final output is dependant ONLY upon the low bits of the input. For the good loop, the low bit of the output can be effected by any bit of the input, though a few bits in key[length-1] may have only a minimal effect on it.

CGamesPlay
Member #2,559
July 2002
avatar

So how effective would something like this be?
seed = seed * input - (input << 1)

--
Tomasu: Every time you read this: hugging!

Ryan Patterson - <http://cgamesplay.com/>

Paul Pridham
Member #250
April 2000
avatar

Well, here's the latest incarnation, and I think I'm pretty happy with this. I haven't been able to break the distribution using any orz-like tricks, and it's still a touch faster than SuperFastHash. PLUS, it still uses the rand() functionality to avalanche at the end, thus keeping its classic rand appeal. ;)

1unsigned int hash_rand8(char *key, unsigned int len)
2{
3 unsigned int i, seed=0, rem;
4 
5 rem = len & 3;
6 
7 len>>=2;
8 
9 for(i=0; i<len; i++)
10 {
11 seed ^= seed << 13;
12 seed += ((unsigned int*)key)<i>;
13 seed ^= seed >> 7;
14 }
15 
16 switch(rem)
17 {
18 case 1:
19 seed ^= seed << 2;
20 seed += ((unsigned char*)key)[i << 2];
21 seed ^= seed >> 23;
22 break;
23 
24 case 2:
25 seed ^= seed << 14;
26 seed += ((unsigned short *)key)[i << 1];
27 seed ^= seed >> 9;
28 break;
29 
30 case 3:
31 seed ^= seed << 19;
32 seed += ((unsigned short *)key)[i << 1] +
33 ((unsigned char*)key)[i << 2 + 2];
34 seed ^= seed >> 7;
35 break;
36 }
37 
38 return seed * 1103515245 + 12345;
39}

orz
Member #565
August 2000

Paul Pridham:
The only issues that I see are in the "case 3:" code path, where you add input chunks without doing anything else to them, which would make these strings equivalent: "a b", "b a", except for the other issue: you omit parenthesis on the "i << 2 + 2", which is a no-no, because bitwise operators have counter-intuitive precedence in C, and that ends up meaning "i << 4".
Asside from that it looks like it ought to be a nice hash function for use in a project, but widespread deployment would be hampered by its lack of portability. The portability issues are endianness and alignment. Both could be solved by reading 1 character at a time, though that hurts performance. Notice how Bob Jenkins did it in his hash function, where he reads 1 character at a time, but combines them before hashing so that he can still hash them 32 bits at a time. Specifically, the endianness issue is that the result of casting an array of chars to an array of shorts or ints is dependant upon platform byte order (x86, ARM, and ALPHA will get one result, MIPS, Sparc, and 68k will get a different result, PPC will usually give the MIPS/Sparc/68k result but doesn't always do so), where on x86 (*(const short*)"ab") is ('a' + 256 * 'b'), and on MIPS it's ('a' * 256 + 'b'). The alignment issue is that on x86, if the key pointer value is not a multiple of sizeof(int) (or whatever the pointer is cast to), then there will be a moderate performance hit, on other platforms except ARM there will be a large performance hit, and on ARM it will run faster than x86 but produce bizarre incorrect values :). Alignment issues are avoid by reading data as chars, or by using #ifdefs to detect the platform at compile time and handle it appropriately.
You might consider finding or creating some better tests for hash functions, so that you can make an informed comparison to other high-speed hash functions. I can probably come up with a few tests by adapting some of my random number generation tests... I don't know if there's any kind of standard tests for hash functions though.

CGamesPlay:
That has problems. It has the leftward-only issue (high bits of seed don't spread out), and it adds loss-of-state issues when input is not an odd number.

Subtraction does not fix the leftward-only issue - it is basically the same as addition, a leftwards only operation, and it mixes at the same rate. ("a -= b;" is generally equivalent to "a += 1 + ~b;", and ~ neither moves nor mixes bits)

Really, the only operations I can think of that you can get rightward movement or mixing out of are right-shift, circular shift (implemented with right-shift in C some many assembly languages), carry-based-things (in C often implemented with rightshift, though in assembly often dedicated opcodes are used, like ADC (add-with-carry)), indirection / table lookups (if the word size is large or the array size is small this may need to get combined with a right-shift to get the high bits of the input to be mixed in), modulo (which is too slow, and should generally not be used with an even modulus), modulo-based things (which range from too slow to way way way too slow), high-word-of-multiplication (which, in C, is generally implemented with floating point values or with right-shifts).
In sum, if it doesn't have a right-shift in it, it has to do something weird to compensate, or else it suffers from leftwards-only bit mixing. Many random number generators, including the libc one and the one in allegro suffer from leftwards only bit mixing problems (even Bob Jenkins ISAAC generator has some limited leftward-mixing issues when compiled with very small array sizes).

Paul Pridham
Member #250
April 2000
avatar

orz, ahh... can't believe I overlooked that case. Well anyway, here are three different versions now. One, rand0, mixes every byte, rand8 is as before with the case 3 fix, and rand9 follows your (and Bob Jenkins') lead and assembles 4 bytes before mixing them all together.

<code>
unsigned int hash_rand8(char *key, unsigned int len)
{
unsigned int i, seed=0, rem;

rem = len & 3;

len>>=2;

for(i=0; i<len; i++)
{
seed ^= seed << 13;
seed += ((unsigned int*)key)<i>;
seed ^= seed >> 7;
}

switch(rem)
{
case 1:
seed ^= seed << 2;
seed += ((unsigned char*)key)[i << 2];
seed ^= seed >> 23;
break;

case 2:
seed ^= seed << 14;
seed += ((unsigned short *)key)[i << 1];
seed ^= seed >> 9;
break;

case 3:
seed ^= seed << 19;
seed += (((unsigned short *)key)[i << 1] << 8 ) +
((unsigned char*)key)[(i << 2) + 2];
seed ^= seed >> 7;
break;
}

return seed * 1103515245 + 12345;
}

unsigned int hash_rand9(char *key, unsigned int len)
{
unsigned int i, seed=0, rem;

rem = len & 3;

len>>=2;

for(i=0; i<len; i++)
{
seed ^= seed << 13;

seed += key[0];
seed += key[1]<<8;
seed += key[2]<<16;
seed += key[3]<<24;

seed ^= seed >> 7;

key+=4;
}

switch(rem)
{
case 1:
seed ^= seed << 2;
seed += key[0];
seed ^= seed >> 23;
break;

case 2:
seed ^= seed << 14;
seed += key[0];
seed += key[1]<<8;
seed ^= seed >> 9;
break;

case 3:
seed ^= seed << 19;
seed += key[0];
seed += key[1]<<8;
seed += key[2]<<16;
seed ^= seed >> 7;
break;
}

return seed * 1103515245 + 12345;
}

unsigned int hash_rand0(char *key, unsigned int len)
{
unsigned int i, seed=0;

for(i=0; i<len; i++)
{
seed ^= seed << 13;
seed += key<i>;
seed ^= seed >> 7;
}

return seed * 1103515245 + 12345;
}
<code>

orz
Member #565
August 2000

It all generally looks correct now.

The most incorrect looking things left are that they int is assumed to be a 32 bit type, and (in hash_rand9 only), and char is assumed to get promoted to int when leftshifted. Both of those assumptions are true on most CPUs & compilers these days, but in embedded contexts you can still find some that don't follow those assumptions. int size can be handled with a typedef or #define, promotion assumptions can be handled by explicit typecasting (as Bob Jenkins does in his hash function).

Also, you have the case set up so that hash_rand8 and hash_rand9 produce different results right now (in the switch statement at the end, on case 3). It might be preferable if they produced identical results (at least on little-endian machines), so that one could be a drop in replacement for the other or something.

Neil Walker
Member #210
April 2000
avatar

Have you a specific use for your hash functions? As choosing the fastest might not necessarily be the best.

Neil.

Neil.
MAME Cabinet Blog / AXL LIBRARY (a games framework) / AXL Documentation and Tutorial

wii:0356-1384-6687-2022, kart:3308-4806-6002. XBOX:chucklepie

Paul Pridham
Member #250
April 2000
avatar

orz, again some good insight -- you've been very helpful. :) I've made the recommended changes.

Neil, I have a few different use for my functions, and not all of them will need the runtime speed. Ie. loading bitmaps, samples etc. into named entries of a hash table for configuration purposes. The resources will then be used to fill in the various data structures that will be used at runtime, such as animation sequences, scripts, sound sequences. The speed isn't needed here because the resulting data structures will directly address the various resources. However, having named items accessible to scripts/configuration is nice.

For speedier applications, the hash functions will be used as part of my scripting system to speed up dictionary accesses. Also, they will provide a means to support a fast associative array scripting datatype, as in Lua or other prototyped OO languages.

As you can see, the latter can be used to support the former as well. There are a ton of uses I'll put the functions to work in, to be sure.

Anyway, the final results are in... hash_rand8 > superfasthash! The key sequence I used for this test was as follows:

XXXXXsuper really really really really really REALLY long test string is super super super super SUPER DUPER super SUPER DUPER super SUPER DUPER great (yes it really is!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!)

...where XXXXX is a hex count from 00000 to 50000. This key really trashed the lesser hash functions, as you can see from the results below. The previous tests used a much smaller (20480 entries) table of 61-byte-long, randomized, lowercase alphabetical keys which seemed to improve hashing ability all around. No longer. ;)

#SelectExpand
1* Table size: 65536, keys: 327680 2 3crc distribution: 4 1: 327680 (100.0000) 5 2: 0 (0.0000) 6 3: 0 (0.0000) 7 4: 0 (0.0000) 8 5: 0 (0.0000) 9 6: 0 (0.0000) 10 7: 0 (0.0000) 11 8: 0 (0.0000) 12 9: 0 (0.0000) 1310: 0 (0.0000) 14crc collisions: 327679 15crc unused: 65535 16crc longest chain: 327680 17crc variance: 1638375.000000 18crc 327680: 0.661000s 19 20pjw distribution: 21 1: 137280 (41.8945) 22 2: 126678 (38.6591) 23 3: 0 (0.0000) 24 4: 0 (0.0000) 25 5: 0 (0.0000) 26 6: 0 (0.0000) 27 7: 0 (0.0000) 28 8: 0 (0.0000) 29 9: 0 (0.0000) 3010: 63722 (19.4464) 31pjw collisions: 327531 32pjw unused: 65387 33pjw longest chain: 8176 34pjw variance: 21188.374908 35pjw 327680: 1.102000s 36 37OAAT distribution: 38 1: 32737 (9.9905) 39 2: 32965 (10.0601) 40 3: 32675 (9.9716) 41 4: 32709 (9.9820) 42 5: 32382 (9.8822) 43 6: 32999 (10.0705) 44 7: 32835 (10.0204) 45 8: 32575 (9.9411) 46 9: 32909 (10.0430) 4710: 32872 (10.0317) 48OAAT collisions: 262603 49OAAT unused: 459 50OAAT longest chain: 18 51OAAT variance: 4.988373 52OAAT 327680: 0.971000s 53 54superfast distribution: 55 1: 32778 (10.0031) 56 2: 32636 (9.9597) 57 3: 32784 (10.0049) 58 4: 32966 (10.0604) 59 5: 32757 (9.9966) 60 6: 32617 (9.9539) 61 7: 32832 (10.0195) 62 8: 32907 (10.0424) 63 9: 32545 (9.9319) 6410: 32837 (10.0211) 65superfast collisions: 262567 66superfast unused: 423 67superfast longest chain: 16 68superfast variance: 5.011139 69superfast 327680: 0.511000s 70 71allegro distribution: 72 1: 25512 (7.7856) 73 2: 35930 (10.9650) 74 3: 33438 (10.2045) 75 4: 30797 (9.3985) 76 5: 33029 (10.0797) 77 6: 28070 (8.5663) 78 7: 38488 (11.7456) 79 8: 33444 (10.2063) 80 9: 32694 (9.9774) 8110: 36242 (11.0602) 82allegro collisions: 262972 83allegro unused: 828 84allegro longest chain: 29 85allegro variance: 12.391602 86allegro 327680: 0.701000s 87 88rand distribution: 89 1: 0 (0.0000) 90 2: 0 (0.0000) 91 3: 0 (0.0000) 92 4: 0 (0.0000) 93 5: 0 (0.0000) 94 6: 0 (0.0000) 95 7: 327680 (100.0000) 96 8: 0 (0.0000) 97 9: 0 (0.0000) 9810: 0 (0.0000) 99rand collisions: 327679 100rand unused: 65535 101rand longest chain: 327680 102rand variance: 1638375.000000 103rand 327680: 0.431000s 104 105rand2 distribution: 106 1: 0 (0.0000) 107 2: 0 (0.0000) 108 3: 0 (0.0000) 109 4: 0 (0.0000) 110 5: 327680 (100.0000) 111 6: 0 (0.0000) 112 7: 0 (0.0000) 113 8: 0 (0.0000) 114 9: 0 (0.0000) 11510: 0 (0.0000) 116rand2 collisions: 327679 117rand2 unused: 65535 118rand2 longest chain: 327680 119rand2 variance: 1638375.000000 120rand2 327680: 0.420000s 121 122rand3 distribution: 123 1: 0 (0.0000) 124 2: 0 (0.0000) 125 3: 0 (0.0000) 126 4: 0 (0.0000) 127 5: 0 (0.0000) 128 6: 0 (0.0000) 129 7: 0 (0.0000) 130 8: 327680 (100.0000) 131 9: 0 (0.0000) 13210: 0 (0.0000) 133rand3 collisions: 327679 134rand3 unused: 65535 135rand3 longest chain: 327680 136rand3 variance: 1638375.000000 137rand3 327680: 0.431000s 138 139rand4 distribution: 140 1: 27634 (8.4332) 141 2: 33016 (10.0757) 142 3: 35497 (10.8328) 143 4: 31958 (9.7528) 144 5: 30218 (9.2218) 145 6: 37200 (11.3525) 146 7: 31731 (9.6835) 147 8: 28857 (8.8065) 148 9: 34354 (10.4840) 14910: 37215 (11.3571) 150rand4 collisions: 327515 151rand4 unused: 65371 152rand4 longest chain: 8440 153rand4 variance: 24737.681213 154rand4 327680: 0.441000s 155 156rand5 distribution: 157 1: 0 (0.0000) 158 2: 0 (0.0000) 159 3: 0 (0.0000) 160 4: 0 (0.0000) 161 5: 0 (0.0000) 162 6: 0 (0.0000) 163 7: 0 (0.0000) 164 8: 0 (0.0000) 165 9: 0 (0.0000) 16610: 327680 (100.0000) 167rand5 collisions: 327679 168rand5 unused: 65535 169rand5 longest chain: 327680 170rand5 variance: 1638375.000000 171rand5 327680: 0.300000s 172 173rand6 distribution: 174 1: 32632 (9.9585) 175 2: 32959 (10.0583) 176 3: 32844 (10.0232) 177 4: 32660 (9.9670) 178 5: 32470 (9.9091) 179 6: 32908 (10.0427) 180 7: 32880 (10.0342) 181 8: 32775 (10.0021) 182 9: 32654 (9.9652) 18310: 32871 (10.0314) 184rand6 collisions: 311296 185rand6 unused: 49152 186rand6 longest chain: 49 187rand6 variance: 84.395538 188rand6 327680: 0.441000s 189 190rand7 distribution: 191 1: 327680 (100.0000) 192 2: 0 (0.0000) 193 3: 0 (0.0000) 194 4: 0 (0.0000) 195 5: 0 (0.0000) 196 6: 0 (0.0000) 197 7: 0 (0.0000) 198 8: 0 (0.0000) 199 9: 0 (0.0000) 20010: 0 (0.0000) 201rand7 collisions: 327679 202rand7 unused: 65535 203rand7 longest chain: 327680 204rand7 variance: 1638375.000000 205rand7 327680: 0.340000s 206 207rand8 distribution: 208 1: 32660 (9.9670) 209 2: 32595 (9.9472) 210 3: 32798 (10.0092) 211 4: 32815 (10.0143) 212 5: 32771 (10.0009) 213 6: 32731 (9.9887) 214 7: 32505 (9.9197) 215 8: 32920 (10.0464) 216 9: 32875 (10.0327) 21710: 32986 (10.0665) 218rand8 collisions: 262584 219rand8 unused: 440 220rand8 longest chain: 18 221rand8 variance: 5.009186 222rand8 327680: 0.471000s 223 224rand9 distribution: 225 1: 32660 (9.9670) 226 2: 32595 (9.9472) 227 3: 32798 (10.0092) 228 4: 32815 (10.0143) 229 5: 32771 (10.0009) 230 6: 32731 (9.9887) 231 7: 32505 (9.9197) 232 8: 32920 (10.0464) 233 9: 32875 (10.0327) 23410: 32986 (10.0665) 235rand9 collisions: 262584 236rand9 unused: 440 237rand9 longest chain: 18 238rand9 variance: 5.009186 239rand9 327680: 0.541000s 240 241rand0 distribution: 242 1: 32947 (10.0546) 243 2: 33048 (10.0854) 244 3: 32361 (9.8758) 245 4: 32755 (9.9960) 246 5: 32558 (9.9359) 247 6: 32809 (10.0125) 248 7: 32910 (10.0433) 249 8: 32640 (9.9609) 250 9: 32930 (10.0494) 25110: 32682 (9.9738) 252rand0 collisions: 262574 253rand0 unused: 430 254rand0 longest chain: 17 255rand0 variance: 4.976532 256rand0 327680: 0.951000s

As you can see, the later rand-family hash functions held their own against the big boys, producing comparable results, and even toppled the venerable allegro hash in distribution ability. rand8 was the winner in speed overall for the "good" functions, and was comparable for the other stats.

Next I'll have to take another peek at Bob's hashmap code. :)

orz
Member #565
August 2000

Oh, also, those hash functions are unaffected by leading 0s, at least if they occur in sets of four. Consider initializing seed to len (as Bob Jenkins does if I recall correctly), or some other such fix for that.

Here is the results of a test I wrote. The test generates a random 50 byte string of binary data, hashes it, changes a random bit, hashes it again, etc, (1<<18) times. The complication is, it checks that it hasn't accidentally gone back to a string it used earlier. For "norm", it changes any bit in the string, for "low" it changes only the first 48 bits, for "high" it changes only the last 48 bits. The result is the number of hashes produced that were already produced earlier.

1 md5 norm test: 8 collisions
2 md5 low test: 8 collisions
3 md5 high test: 9 collisions
4 
5bobjenkins norm test: 9 collisions
6bobjenkins low test: 8 collisions
7bobjenkins high test: 5 collisions
8 
9 crc norm test: 224014 collisions
10 crc low test: 262143 collisions
11 crc high test: 36001 collisions
12 
13 pjw norm test: 3517 collisions
14 pjw low test: 5079 collisions
15 pjw high test: 2943 collisions
16 
17 OAAT norm test: 2 collisions
18 OAAT low test: 9 collisions
19 OAAT high test: 12 collisions
20 
21 superfast norm test: 16 collisions
22 superfast low test: 129 collisions
23 superfast high test: 99 collisions
24 
25 allegro norm test: 222198 collisions
26 allegro low test: 262143 collisions
27 allegro high test: 152055 collisions
28 
29 rand norm test: 217674 collisions
30 rand low test: 262143 collisions
31 rand high test: 196681 collisions
32 
33 rand2 norm test: 187825 collisions
34 rand2 low test: 262143 collisions
35 rand2 high test: 91565 collisions
36 
37 rand3 norm test: 187642 collisions
38 rand3 low test: 262143 collisions
39 rand3 high test: 90975 collisions
40 
41 rand4 norm test: 258556 collisions
42 rand4 low test: 260884 collisions
43 rand4 high test: 260893 collisions
44 
45 rand5 norm test: 114573 collisions
46 rand5 low test: 261628 collisions
47 rand5 high test: 18530 collisions
48 
49 rand8 norm test: 13 collisions
50 rand8 low test: 19 collisions
51 rand8 high test: 33 collisions
52 
53 rand9 norm test: 12 collisions
54 rand9 low test: 29 collisions
55 rand9 high test: 37 collisions
56 
57 rand0 norm test: 3 collisions
58 rand0 low test: 11 collisions
59 rand0 high test: 10 collisions

Any value up to 15 can be reasonably dismissed as random fluctuations. Thus, OAAT and rand0 produce perfect results, superfast, rand8, and rand9 produce acceptable results, and everything else produces bad results. "md5", and "bobjenkins" are added as sanity checks.

I'll write a more thorough test function tomorrow.

Paul Pridham
Member #250
April 2000
avatar

Ahh, that's great. Could you post your test code when you've finished, so I can cobble together a comprehensive mega-test?

And do you mean the leading 4 zeroes in the key, or the seed?

orz
Member #565
August 2000

Lets see if this works...

Everything specific to your stuff is in h.cpp. The rest of it is random stuff related to random number generation and testing, which shares a bunch of code with this since it was a quick hack to get the random number tests working on hash functions.

Using longer samples (up to 16 million hashes), hash_rand0 failed a few tests, but OAAT still prevailed. I've started a 256-million-hash test in an effort to get OAAT to break down and cry, but it will be a while before that finishes.

Four leading 0s on the "key" will produce an identical result, like these two string "ab", "\0\0\0\0ab".

If all went well, that should produce output like this:

1 md5 low: 6 collisions, tests: passed 5, failed 0, ambiguous 1
2 md5 norm: 8 collisions, tests: passed 6, failed 0, ambiguous 0
3 md5 high: 6 collisions, tests: passed 5, failed 0, ambiguous 1
4 
5bobjenkins low: 8 collisions, tests: passed 6, failed 0, ambiguous 0
6bobjenkins norm: 4 collisions, tests: passed 5, failed 0, ambiguous 1
7bobjenkins high: 7 collisions, tests: passed 6, failed 0, ambiguous 0
8 
9 crc low: 223878 collisions, tests: passed 0, failed 6, ambiguous 0
10 crc norm: 262143 collisions, tests: passed 0, failed 6, ambiguous 0
11 crc high: 36268 collisions, tests: passed 0, failed 6, ambiguous 0
12 
13 pjw low: 3360 collisions, tests: passed 0, failed 6, ambiguous 0
14 pjw norm: 5037 collisions, tests: passed 0, failed 6, ambiguous 0
15 pjw high: 2962 collisions, tests: passed 0, failed 6, ambiguous 0
16 
17 OAAT low: 8 collisions, tests: passed 6, failed 0, ambiguous 0
18 OAAT norm: 8 collisions, tests: passed 5, failed 0, ambiguous 1
19 OAAT high: 7 collisions, tests: passed 4, failed 0, ambiguous 2
20 
21 superfast low: 22 collisions, tests: passed 6, failed 0, ambiguous 0
22 superfast norm: 145 collisions, tests: passed 5, failed 1, ambiguous 0
23 superfast high: 87 collisions, tests: passed 5, failed 0, ambiguous 1
24 
25 allegro low: 221981 collisions, tests: passed 0, failed 6, ambiguous 0
26 allegro norm: 262143 collisions, tests: passed 0, failed 6, ambiguous 0
27 allegro high: 153358 collisions, tests: passed 0, failed 6, ambiguous 0
28 
29 rand8 low: 7 collisions, tests: passed 3, failed 2, ambiguous 1
30 rand8 norm: 24 collisions, tests: passed 5, failed 0, ambiguous 1
31 rand8 high: 51 collisions, tests: passed 0, failed 5, ambiguous 1
32 
33 rand9 low: 13 collisions, tests: passed 2, failed 2, ambiguous 2
34 rand9 norm: 18 collisions, tests: passed 6, failed 0, ambiguous 0
35 rand9 high: 40 collisions, tests: passed 1, failed 5, ambiguous 0
36 
37 rand0 low: 9 collisions, tests: passed 5, failed 0, ambiguous 1
38 rand0 norm: 14 collisions, tests: passed 5, failed 0, ambiguous 1
39 rand0 high: 7 collisions, tests: passed 2, failed 3, ambiguous 1

edit: fixed an issue where tests were labelled incorrectly

edit2:
Sorry about the comment at the top.
I've removed it. I was in a bad mood, when I wrote that, and then forgot about it. It was totally uncalled for.

Paul Pridham
Member #250
April 2000
avatar

Oh, the four leading zeroes I mentioned are ascii characters, not bytes.

Hmm, the rand series, though failing some tests, still seems to be doing respectably. I suspect that the failing may be in the actual rand portion of the "avalanche".

orz
Member #565
August 2000

I was confused when I said OAAT was passing... it wasn't, I had it displaying the wrong results for one category which it was failing. Still OAAT did pretty well, along with hash_rand0. I recommend trying hash functions with more than 32 bits of state.

Anyway, it's of course impossible to create a perfect omni-purpose hash function, as there are tradeoffs between speed, quality, and extras (suitability for hardware implementation, cryptographic security, etc). But with a little trial-and-error, a little luck or insight, and maybe even a dab of theory, it ought to be possible to create a hash function that compares favorably to existing ones in almost any specific niche you care to aim at. Good luck.

spellcaster
Member #1,493
September 2001
avatar

So, rand0 seems to be pretty good :)
How does it compare when it comes to speed and memory used?

Also, let's say I'd use one of these algos to create a hashtable with say, 256 entries, would rand0 still be the candidate I want?
I mean, if I'd do a (rand0() & 255) would I still get a good distribution?

--
There are no stupid questions, but there are a lot of inquisitive idiots.

Thomas Fjellstrom
Member #476
June 2000
avatar

I think thats the point of this whole thing, having good distrobution and speed for power of two sized hash tables. unlike pjw that requires say a table thats some "prime" number in size.

--
Thomas Fjellstrom - [website] - [email] - [Allegro Wiki] - [Allegro TODO]
"If you can't think of a better solution, don't try to make a better solution." -- weapon_S
"The less evidence we have for what we believe is certain, the more violently we defend beliefs against those who don't agree" -- https://twitter.com/neiltyson/status/592870205409353730

Paul Pridham
Member #250
April 2000
avatar

lenny, since it's just a hash function, it really uses no memory other than the local variables needed to calculate the hash value. rand8/9 are faster than rand0, so I'd prefer rand8 for typical usage. All of my testing was aimed at power-of-two-sized tables, and I've shown that rand0/8/9()&255 will give you good distribution over such a table (the one I used for testing in the end was 65536 elements)... much better than pjw, crc, or allegro hashes.

orz has shown that rand8 and rand9 break down somewhat with intensive testing, but the tests he used are rather high-octane. ;) The rand functions are quite good when compared alongside the other common hash functions for their intended usage (hashing strings into ^2 tables). They are also faster, which was my main goal. Just don't use them for any serious encryption purposes. :)

orz
Member #565
August 2000

Anything that does well on the tests is likely to do well in real world circumstances. Of course, that's just a guess on my part - I don't have any large database of real-world data to try out. The biggest flaw in the tests (that I can think of, anyway...) is that they only compare hashes of strings of one specific length at time. Also, they look for flaws in all bits of the result, whereas in the real world, often only the high or low bits will be used.

edit: On second thought, the tests also spend a great deal of effort looking for correlations in results for similar strings, which doesn't matter (under some circumstances) in real world hash tables. In short, the tests are overkill for hash functions intended for use in hash tables. I may revise them soon to ignore flaws that are irrelevant for normal hash tables.

1edit: on Athlon XP 2500+
2Performance data, 4 byte strings:
3 md5 performance = 2.4 MB/s
4bobjenkins performance = 11 MB/s
5 crc performance = 37 MB/s
6 pjw performance = 32 MB/s
7 OAAT performance = 31 MB/s
8 superfast performance = 36 MB/s
9 allegro performance = 41 MB/s
10 rand8 performance = 45 MB/s
11 rand9 performance = 42 MB/s
12 rand0 performance = 32 MB/s
13 
14Performance data, 61 byte strings:
15 md5 performance = 24 MB/s
16bobjenkins performance = 38 MB/s
17 crc performance = 88 MB/s
18 pjw performance = 59 MB/s
19 OAAT performance = 72 MB/s
20 superfast performance = 183 MB/s
21 allegro performance = 132 MB/s
22 rand8 performance = 217 MB/s
23 rand9 performance = 151 MB/s
24 rand0 performance = 80 MB/s
25 
26Performance data, 1024 byte strings:
27 md5 performance = 67 MB/s
28bobjenkins performance = 55 MB/s
29 crc performance = 96 MB/s
30 pjw performance = 73 MB/s
31 OAAT performance = 92 MB/s
32 superfast performance = 288 MB/s
33 allegro performance = 155 MB/s
34 rand8 performance = 334 MB/s
35 rand9 performance = 185 MB/s
36 rand0 performance = 88 MB/s

 1   2 


Go to: