|
|
This thread is locked; no one can reply to it.
|
1
2
|
| Hash function shootout! |
|
Paul Pridham
Member #250
April 2000
|
Lately I have been looking into hash functions, as I wanted something that would distribute evenly over a power-of-two-sized table, using ASCII key values, and would be fast.
Some of the common and popular hash functions I tried performed poorly, so I decided to try writing my own based on a hunch. And whaddya know, it seems to kick some butt! Below is some test code which runs a variety of tests on a bunch of hash functions and spits out some statistics and such. The functions I wrote are the hash_rand*() ones, and in particular, hash_rand2() seems to be about the best one to use. Also included are hash functions PJW, CRC, One At A Time, Super Fast Hash, and the Allegro hash function! [code] #include <stdio.h> #include <string.h> #include <time.h> #include <stdlib.h> #define TEST_COUNT 100000000 #define TABLE_SIZE 4096 #define KEY_COUNT (TABLE_SIZE*5) #define KEY_LEN 6 #define MASK (TABLE_SIZE-1) char test[KEY_COUNT*KEY_LEN]; int key_hits[TABLE_SIZE]; unsigned int hash_rand(char *key, unsigned int len) { unsigned int i, seed=0; for(i=0; i<len; i++) { seed=seed<<2; seed+=key[i]; } seed = seed * 1103515245 + 12345; return (unsigned int)(seed>>16) & 65535; } // ** WINNER! ** unsigned int hash_rand2(char *key, unsigned int len) { unsigned int i, seed=0; for(i=0; i<len; i++) { seed=seed<<2; seed+=key[i]; } return seed * 1103515245 + 12345; } unsigned int hash_rand3(char *key, unsigned int len) { unsigned int i, seed=0; for(i=0; i<len; i++) { seed=seed<<2; seed+=key[i]; } return (seed+1) * 1103515245 + 12345; } unsigned int hash_rand4(char *key, unsigned int len) { unsigned int i, seed=0; for(i=0; i<len; i++) { seed+=key[i]; } return (seed+1) * 1103515245 + 12345; } unsigned int hash_crc(char *key, unsigned int len) { unsigned int hash=0, i, highorder; for (i=0; i<len; ++i) { highorder = hash & 0xf8000000; hash = hash<<8; hash ^= highorder>>27; hash ^= key[i]; } return hash; } unsigned int hash_pjw(char *key, unsigned int len) { unsigned int h = 0, g, i; for(i = 0; i<len; i++) { h = (h << 4) + key[i]; if((g = h & 0xf0000000)) { h = h ^ (g >> 24); h = h ^ g; } } return h; } // "One At A Time" hash unsigned int hash_oaat(char *key, unsigned int len) { unsigned int hash, i; for (hash=0, i=0; i<len; ++i) { hash += key[i]; hash += (hash << 10); hash ^= (hash >> 6); } hash += (hash << 3); hash ^= (hash >> 11); hash += (hash << 15); return hash; } #define HASH_ROTATION 3 unsigned int hash_allegro(char *key, unsigned int len) { unsigned int hash=0, i; for (i = 0; i<len; i++) { hash = ((hash >> HASH_ROTATION) | ((hash << (16 - HASH_ROTATION)) & 0xffff)) ^ (int)key[i]; } return hash; } typedef unsigned short uint16; #define get16bits(d) (*((const uint16*) (d))) unsigned int hash_superfast(char * data, unsigned int len) { unsigned int hash = 0, tmp; int rem; if (len <= 0 || data == NULL) return 0; rem = len & 3; len >>= 2; /* Main loop */ for (;len > 0; len--) { hash += get16bits (data); tmp = (get16bits (data+2) << 11) ^ hash; hash = (hash << 16) ^ tmp; data += 2*sizeof (uint16); hash += hash >> 11; } /* Handle end cases */ switch (rem) { case 3: hash += get16bits (data); hash ^= hash << 16; hash ^= data[sizeof (uint16)] << 18; hash += hash >> 11; break; case 2: hash += get16bits (data); hash ^= hash << 11; hash += hash >> 17; break; case 1: hash += *data; hash ^= hash << 10; hash += hash >> 1; } /* Force "avalanching" of final 127 bits */ hash ^= hash << 3; hash += hash >> 5; hash ^= hash << 2; hash += hash >> 15; hash ^= hash << 10; return hash; } void clear_hits() { memset(key_hits, 0, TABLE_SIZE*sizeof(int)); } int get_hits(int *chain, int *unused) { int i, collisions=0; *chain=0; *unused=0; for(i=0; i<TABLE_SIZE; i++) { if(key_hits[i]>1) { collisions+=key_hits[i]-1; } if(key_hits[i]>*chain) { *chain=key_hits[i]; } if(!key_hits[i]) { (*unused)++; } } return collisions; } void emit(char *ptr, int len) { int i; for(i=0; i<len; i++) { putchar(*(ptr+i)); } } void show_distribution() { int i; int sum=0, count=0; int tenth=0; int total=0; for(i=0; i<TABLE_SIZE; i++) { sum+=key_hits[i]; total+=key_hits[i]; if(count>=TABLE_SIZE/10) { count=0; tenth++; printf("%2d: %2d (%.4f)\n", tenth, sum, ((float)sum/KEY_COUNT)*100); sum=0; } count++; } } float get_variance() { int i; int sum=0, count=0; int tenth=0; int total=0; float mean; float variance = 0; for(i=0; i<TABLE_SIZE; i++) { total+=key_hits[i]; count++; } mean = (float)total / (float)count; for(i=0; i<TABLE_SIZE; i++) { float d = key_hits[i] - mean; variance += d * d; } variance /= (float)count; return variance; } void test_hash(char *name, unsigned int (*func)(char*, unsigned int)) { unsigned int hash, i, longest, unused; clear_hits(); for(i=0; i<KEY_COUNT; i++) { hash=func(&test[i*KEY_LEN], KEY_LEN); key_hits[hash&MASK]++; } printf("%s distribution:\n", name); show_distribution(); printf("%s collisions: %d\n", name, get_hits(&longest, &unused)); printf("%s unused: %d\n", name, unused); printf("%s longest chain: %d\n", name, longest); printf("%s variance: %f\n", name, get_variance()); } void time_hash(char *name, unsigned int (*func)(char*, unsigned int)) { unsigned int dur, i; dur=clock(); for(i=0; i<TEST_COUNT; i++) { func(test, KEY_LEN); } dur=clock()-dur; printf("%s %d: %fs\n\n", name, TEST_COUNT, (double)dur/CLOCKS_PER_SEC); } int main() { int i; srand(time(NULL)); for(i=0; i<KEY_COUNT*KEY_LEN; i++) { // printable ASCII // test[i]=32+rand()%94; // lowercase alphabet - PJW does worse here test[i]=97+rand()%26; } printf("* Table size: %d, keys: %d\n\n", TABLE_SIZE, KEY_COUNT); test_hash("crc", hash_crc); time_hash("crc", hash_crc); test_hash("pjw", hash_pjw); time_hash("pjw", hash_pjw); test_hash("OAAT", hash_oaat); time_hash("OAAT", hash_oaat); test_hash("superfast", hash_superfast); time_hash("superfast", hash_superfast); test_hash("allegro", hash_allegro); time_hash("allegro", hash_allegro); test_hash("rand", hash_rand); time_hash("rand", hash_rand); test_hash("rand2", hash_rand2); time_hash("rand2", hash_rand2); test_hash("rand3", hash_rand3); time_hash("rand3", hash_rand3); test_hash("rand4", hash_rand4); time_hash("rand4", hash_rand4); getchar(); return 0; } [/code] And here are the results on my machine (AMD Duron), compiled with MSVC 6: [code] * Table size: 4096, keys: 20480 crc distribution: 1: 1532 (7.4805) 2: 790 (3.8574) 3: 2284 (11.1523) 4: 1683 (8.2178) 5: 2454 (11.9824) 6: 3143 (15.3467) 7: 1542 (7.5293) 8: 2929 (14.3018) 9: 1491 (7.2803) 10: 2632 (12.8516) crc collisions: 20032 crc unused: 3648 crc longest chain: 82 crc variance: 224.945313 crc 100000000: 6.439000s pjw distribution: 1: 2399 (11.7139) 2: 1514 (7.3926) 3: 1256 (6.1328) 4: 1227 (5.9912) 5: 1542 (7.5293) 6: 2325 (11.3525) 7: 2701 (13.1885) 8: 2427 (11.8506) 9: 2381 (11.6260) 10: 2683 (13.1006) pjw collisions: 16555 pjw unused: 171 pjw longest chain: 19 pjw variance: 11.394043 pjw 100000000: 7.521000s OAAT distribution: 1: 2081 (10.1611) 2: 2066 (10.0879) 3: 1986 (9.6973) 4: 1990 (9.7168) 5: 2055 (10.0342) 6: 2055 (10.0342) 7: 2098 (10.2441) 8: 1998 (9.7559) 9: 2077 (10.1416) 10: 2040 (9.9609) OAAT collisions: 16411 OAAT unused: 27 OAAT longest chain: 15 OAAT variance: 4.978516 OAAT 100000000: 9.043000s superfast distribution: 1: 2067 (10.0928) 2: 1996 (9.7461) 3: 2026 (9.8926) 4: 2063 (10.0732) 5: 1979 (9.6631) 6: 2092 (10.2148) 7: 2071 (10.1123) 8: 2031 (9.9170) 9: 1996 (9.7461) 10: 2136 (10.4297) superfast collisions: 16407 superfast unused: 23 superfast longest chain: 16 superfast variance: 5.103516 superfast 100000000: 6.359000s allegro distribution: 1: 1834 (8.9551) 2: 2145 (10.4736) 3: 2025 (9.8877) 4: 2034 (9.9316) 5: 2276 (11.1133) 6: 1994 (9.7363) 7: 2159 (10.5420) 8: 1983 (9.6826) 9: 1830 (8.9355) 10: 2177 (10.6299) allegro collisions: 16423 allegro unused: 39 allegro longest chain: 14 allegro variance: 5.588867 allegro 100000000: 6.589000s rand distribution: 1: 2012 (9.8242) 2: 2034 (9.9316) 3: 1970 (9.6191) 4: 2053 (10.0244) 5: 2132 (10.4102) 6: 2107 (10.2881) 7: 2014 (9.8340) 8: 2024 (9.8828) 9: 2041 (9.9658) 10: 2071 (10.1123) rand collisions: 16416 rand unused: 32 rand longest chain: 15 rand variance: 5.151367 rand 100000000: 4.437000s rand2 distribution: 1: 2044 (9.9805) 2: 2095 (10.2295) 3: 1966 (9.5996) 4: 2037 (9.9463) 5: 2070 (10.1074) 6: 1963 (9.5850) 7: 2080 (10.1563) 8: 2046 (9.9902) 9: 2114 (10.3223) 10: 2038 (9.9512) rand2 collisions: 16428 rand2 unused: 44 rand2 longest chain: 16 rand2 variance: 5.331055 rand2 100000000: 4.446000s rand3 distribution: 1: 2086 (10.1855) 2: 1957 (9.5557) 3: 2039 (9.9561) 4: 2079 (10.1514) 5: 1965 (9.5947) 6: 2074 (10.1270) 7: 2044 (9.9805) 8: 2114 (10.3223) 9: 2042 (9.9707) 10: 2048 (10.0000) rand3 collisions: 16428 rand3 unused: 44 rand3 longest chain: 16 rand3 variance: 5.331055 rand3 100000000: 4.416000s rand4 distribution: 1: 2038 (9.9512) 2: 1916 (9.3555) 3: 2008 (9.8047) 4: 2049 (10.0049) 5: 2076 (10.1367) 6: 2025 (9.8877) 7: 1948 (9.5117) 8: 2321 (11.3330) 9: 2043 (9.9756) 10: 2056 (10.0391) rand4 collisions: 20357 rand4 unused: 3973 rand4 longest chain: 459 rand4 variance: 1523.432129 rand4 100000000: 4.437000s [/code] Feel free to run your own tests, pick it apart, and finally bow down to hash_rand2() as teh ultimaet hash and submit to using it in your own projects! :D ;) ---- |
|
Elias
Member #358
May 2000
|
Here's results from me, P4, compiled with -march=pentium4 -O3 with gcc 4.0, and with the key length changed from 6 to 32:
superfast now outperforms rand2. As we talked on IRC, there may be a way to get around the byte by byte adding with rand2 though, which likely is where superfast gets an advantage with the 16bit at once adding. -- |
|
Paul Pridham
Member #250
April 2000
|
As you know, Elias, (since we collaborated on the function Here's the function:
And here are my results for a random lowercase alphabetic key of 63 characters:
---- |
|
orz
Member #565
August 2000
|
Those are not good hash functions. hash_rand collides for strings like this pair: hash_rand2 and hash_rand3 collide on exactly the same things as hash_rand. hash_rand4 is worse than the previous ones. any reordering of letters results in the same hash. hash_rand5 has the same flaws as hash_rand 1-3, but it requires slightly longer strings: or, here's an example using the other flaw mentioned for hash_rand 1-3: "a???z???" Also, for short strings, the latter bytes do not effect the lower bits of the results well. For instance, the difference between "abc " and "abc0" is only in the uppermost 4 bits of the hash. I recommend you start by reading some on the subject... |
|
Bob
Free Market Evangelist
September 2000
|
Once you get your string hash function going, you may want to modify my hash code to manage hash sets. -- |
|
Paul Pridham
Member #250
April 2000
|
Hey, thanks for bursting my bubble! Heh, no really, that's exactly why I posted this. I figured there were some gotchas with my functions (other than the obvious ones that my testing revealed for rand4(), which was a test case in badness), especially since they seem to perform so well for the type of cases I'll use them for (and for which the tests are a little biased). rand5() is definitely the best up to this point, and with a bit of tweaking I'm sure the early characters influence will persist. Even as it stands, I'm pretty happy with the results. Nevertheless, it looks like I'm starting on rand6() now... Bob, thanks for posting your hash map code! It looks pretty interesting, and I'll definitely give some serious perusal and tweaking (here I was about to write a chained hash table EDIT: orz, I've confirmed the first two flaws stated for rand1-3 (early letters, cancelling), and the 2nd flaw mentioned for rand5 (a???z???) but have been unable to confirm the early letters flaw in rand5 using a longer string. I've tried up to a 200 character string with only the first letter different, and I'm still getting unique hashes. Can you give me a working example of this flaw? ---- |
|
orz
Member #565
August 2000
|
stupid smileys make it difficult to read the second example for hash_rand5... Quote: orz, I've confirmed the first two flaws stated for rand1-3 (early letters, cancelling), and the 2nd flaw mentioned for rand5 (a???z???) but have been unable to confirm the early letters flaw in rand5 using a longer string. I've tried up to a 200 character string with only the first letter different, and I'm still getting unique hashes. Can you give me a working example of this flaw? Hm... strange. The two samples (100 characters long) I posted for the early-letters flaw in rand5 both produce a hash of 1675257485 in my test. I'm using exactly the code posted in your earlier post for hash_rand5, except I added a "const" keyword to prevent it from objecting to taking a literal string in C++ (I pasted the code into one of my C++ projects). This is my test code: const char *str1 = "his really really really REALLY long test string is super super SUPER DUPER great (yes it really is)"; const char *str2 = "her really really really REALLY long test string is super super SUPER DUPER great (yes it really is)"; int len1 = strlen(str1); int len2 = strlen(str2); int h1 = hash_rand5(str1, len1); int h2 = hash_rand5(str2, len2); printf("%11d len=%2d str=\"%s\"\n", h1, len1, str1); printf("%11d len=%2d str=\"%s\"\n", h2, len2, str2); Looking at the code for hash_rand5, this should work for any two strings that have the same last 64 characters and the same (length % 4). Here's a suggestion for testing: Bob Jenkins covered most of this somewhere in his stuff, but you've already read some of his stuff and I've already linked him, so I'll try covering it myself: 2. "seed = seed ^ (seed >> 2);" 3. "seed = (seed << 1) ^ (seed >> 2);" 4. "seed = (seed << 7) | (seed >> 25);" 5. "seed = seed * 1103515245;" Guidelines: B. (important guideline) C. (important guideline) D. (guideline) E. (my own observation/intuition/recommendation/whatever) F. (guideline) F. (my own observation/intuition/recommendation/whatever) G. (my own observation/intuition/recommendation/whatever) |
|
CGamesPlay
Member #2,559
July 2002
|
Quote: Addition mixes bits much slower Wait, you're saying that multiplcation is faster time-wise than addition? Or did you mean addition isn't as effective as multiplication? -- Ryan Patterson - <http://cgamesplay.com/> |
|
orz
Member #565
August 2000
|
Um, what I was trying to say is that, with multiplication, bits spread to the left very quickly on a per-multiply basis, whereas with addition, they spread to the left by maybe one bit per addition. To illustrate, here's a loop that is bad because of that: for (int i = 0; i < length; i++) { seed += key<i>; //bad! both addition and multiplication only spread bits leftwards! seed = seed * 1103515245 + 12345; } compared to a good loop, or at least, a loop that doesn't have that flaw: for (int i = 0; i < length; i++) { seed += key<i>; //okay: the multiply spreads bits leftwards and // the circular shift moves the bits around seed = seed * 1103515245; seed = (seed >> 17) | (seed << 15); } For the bad loop, the low bit of the final output is dependant ONLY upon the low bits of the input. For the good loop, the low bit of the output can be effected by any bit of the input, though a few bits in key[length-1] may have only a minimal effect on it. |
|
CGamesPlay
Member #2,559
July 2002
|
So how effective would something like this be? -- Ryan Patterson - <http://cgamesplay.com/> |
|
Paul Pridham
Member #250
April 2000
|
Well, here's the latest incarnation, and I think I'm pretty happy with this. I haven't been able to break the distribution using any orz-like tricks, and it's still a touch faster than SuperFastHash. PLUS, it still uses the rand() functionality to avalanche at the end, thus keeping its classic rand appeal.
---- |
|
orz
Member #565
August 2000
|
Paul Pridham: CGamesPlay: Subtraction does not fix the leftward-only issue - it is basically the same as addition, a leftwards only operation, and it mixes at the same rate. ("a -= b;" is generally equivalent to "a += 1 + ~b;", and ~ neither moves nor mixes bits) Really, the only operations I can think of that you can get rightward movement or mixing out of are right-shift, circular shift (implemented with right-shift in C some many assembly languages), carry-based-things (in C often implemented with rightshift, though in assembly often dedicated opcodes are used, like ADC (add-with-carry)), indirection / table lookups (if the word size is large or the array size is small this may need to get combined with a right-shift to get the high bits of the input to be mixed in), modulo (which is too slow, and should generally not be used with an even modulus), modulo-based things (which range from too slow to way way way too slow), high-word-of-multiplication (which, in C, is generally implemented with floating point values or with right-shifts). |
|
Paul Pridham
Member #250
April 2000
|
orz, ahh... can't believe I overlooked that case. Well anyway, here are three different versions now. One, rand0, mixes every byte, rand8 is as before with the case 3 fix, and rand9 follows your (and Bob Jenkins') lead and assembles 4 bytes before mixing them all together. <code> rem = len & 3; len>>=2; for(i=0; i<len; i++) switch(rem) case 2: case 3: return seed * 1103515245 + 12345; unsigned int hash_rand9(char *key, unsigned int len) rem = len & 3; len>>=2; for(i=0; i<len; i++) seed += key[0]; seed ^= seed >> 7; key+=4; switch(rem) case 2: case 3: return seed * 1103515245 + 12345; unsigned int hash_rand0(char *key, unsigned int len) ---- |
|
orz
Member #565
August 2000
|
It all generally looks correct now. The most incorrect looking things left are that they int is assumed to be a 32 bit type, and (in hash_rand9 only), and char is assumed to get promoted to int when leftshifted. Both of those assumptions are true on most CPUs & compilers these days, but in embedded contexts you can still find some that don't follow those assumptions. int size can be handled with a typedef or #define, promotion assumptions can be handled by explicit typecasting (as Bob Jenkins does in his hash function). Also, you have the case set up so that hash_rand8 and hash_rand9 produce different results right now (in the switch statement at the end, on case 3). It might be preferable if they produced identical results (at least on little-endian machines), so that one could be a drop in replacement for the other or something. |
|
Neil Walker
Member #210
April 2000
|
Have you a specific use for your hash functions? As choosing the fastest might not necessarily be the best. Neil. Neil. wii:0356-1384-6687-2022, kart:3308-4806-6002. XBOX:chucklepie |
|
Paul Pridham
Member #250
April 2000
|
orz, again some good insight -- you've been very helpful. Neil, I have a few different use for my functions, and not all of them will need the runtime speed. Ie. loading bitmaps, samples etc. into named entries of a hash table for configuration purposes. The resources will then be used to fill in the various data structures that will be used at runtime, such as animation sequences, scripts, sound sequences. The speed isn't needed here because the resulting data structures will directly address the various resources. However, having named items accessible to scripts/configuration is nice. For speedier applications, the hash functions will be used as part of my scripting system to speed up dictionary accesses. Also, they will provide a means to support a fast associative array scripting datatype, as in Lua or other prototyped OO languages. As you can see, the latter can be used to support the former as well. There are a ton of uses I'll put the functions to work in, to be sure. Anyway, the final results are in... hash_rand8 > superfasthash! The key sequence I used for this test was as follows: XXXXXsuper really really really really really REALLY long test string is super super super super SUPER DUPER super SUPER DUPER super SUPER DUPER great (yes it really is!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!) ...where XXXXX is a hex count from 00000 to 50000. This key really trashed the lesser hash functions, as you can see from the results below. The previous tests used a much smaller (20480 entries) table of 61-byte-long, randomized, lowercase alphabetical keys which seemed to improve hashing ability all around. No longer. 1* Table size: 65536, keys: 327680
2
3crc distribution:
4 1: 327680 (100.0000)
5 2: 0 (0.0000)
6 3: 0 (0.0000)
7 4: 0 (0.0000)
8 5: 0 (0.0000)
9 6: 0 (0.0000)
10 7: 0 (0.0000)
11 8: 0 (0.0000)
12 9: 0 (0.0000)
1310: 0 (0.0000)
14crc collisions: 327679
15crc unused: 65535
16crc longest chain: 327680
17crc variance: 1638375.000000
18crc 327680: 0.661000s
19
20pjw distribution:
21 1: 137280 (41.8945)
22 2: 126678 (38.6591)
23 3: 0 (0.0000)
24 4: 0 (0.0000)
25 5: 0 (0.0000)
26 6: 0 (0.0000)
27 7: 0 (0.0000)
28 8: 0 (0.0000)
29 9: 0 (0.0000)
3010: 63722 (19.4464)
31pjw collisions: 327531
32pjw unused: 65387
33pjw longest chain: 8176
34pjw variance: 21188.374908
35pjw 327680: 1.102000s
36
37OAAT distribution:
38 1: 32737 (9.9905)
39 2: 32965 (10.0601)
40 3: 32675 (9.9716)
41 4: 32709 (9.9820)
42 5: 32382 (9.8822)
43 6: 32999 (10.0705)
44 7: 32835 (10.0204)
45 8: 32575 (9.9411)
46 9: 32909 (10.0430)
4710: 32872 (10.0317)
48OAAT collisions: 262603
49OAAT unused: 459
50OAAT longest chain: 18
51OAAT variance: 4.988373
52OAAT 327680: 0.971000s
53
54superfast distribution:
55 1: 32778 (10.0031)
56 2: 32636 (9.9597)
57 3: 32784 (10.0049)
58 4: 32966 (10.0604)
59 5: 32757 (9.9966)
60 6: 32617 (9.9539)
61 7: 32832 (10.0195)
62 8: 32907 (10.0424)
63 9: 32545 (9.9319)
6410: 32837 (10.0211)
65superfast collisions: 262567
66superfast unused: 423
67superfast longest chain: 16
68superfast variance: 5.011139
69superfast 327680: 0.511000s
70
71allegro distribution:
72 1: 25512 (7.7856)
73 2: 35930 (10.9650)
74 3: 33438 (10.2045)
75 4: 30797 (9.3985)
76 5: 33029 (10.0797)
77 6: 28070 (8.5663)
78 7: 38488 (11.7456)
79 8: 33444 (10.2063)
80 9: 32694 (9.9774)
8110: 36242 (11.0602)
82allegro collisions: 262972
83allegro unused: 828
84allegro longest chain: 29
85allegro variance: 12.391602
86allegro 327680: 0.701000s
87
88rand distribution:
89 1: 0 (0.0000)
90 2: 0 (0.0000)
91 3: 0 (0.0000)
92 4: 0 (0.0000)
93 5: 0 (0.0000)
94 6: 0 (0.0000)
95 7: 327680 (100.0000)
96 8: 0 (0.0000)
97 9: 0 (0.0000)
9810: 0 (0.0000)
99rand collisions: 327679
100rand unused: 65535
101rand longest chain: 327680
102rand variance: 1638375.000000
103rand 327680: 0.431000s
104
105rand2 distribution:
106 1: 0 (0.0000)
107 2: 0 (0.0000)
108 3: 0 (0.0000)
109 4: 0 (0.0000)
110 5: 327680 (100.0000)
111 6: 0 (0.0000)
112 7: 0 (0.0000)
113 8: 0 (0.0000)
114 9: 0 (0.0000)
11510: 0 (0.0000)
116rand2 collisions: 327679
117rand2 unused: 65535
118rand2 longest chain: 327680
119rand2 variance: 1638375.000000
120rand2 327680: 0.420000s
121
122rand3 distribution:
123 1: 0 (0.0000)
124 2: 0 (0.0000)
125 3: 0 (0.0000)
126 4: 0 (0.0000)
127 5: 0 (0.0000)
128 6: 0 (0.0000)
129 7: 0 (0.0000)
130 8: 327680 (100.0000)
131 9: 0 (0.0000)
13210: 0 (0.0000)
133rand3 collisions: 327679
134rand3 unused: 65535
135rand3 longest chain: 327680
136rand3 variance: 1638375.000000
137rand3 327680: 0.431000s
138
139rand4 distribution:
140 1: 27634 (8.4332)
141 2: 33016 (10.0757)
142 3: 35497 (10.8328)
143 4: 31958 (9.7528)
144 5: 30218 (9.2218)
145 6: 37200 (11.3525)
146 7: 31731 (9.6835)
147 8: 28857 (8.8065)
148 9: 34354 (10.4840)
14910: 37215 (11.3571)
150rand4 collisions: 327515
151rand4 unused: 65371
152rand4 longest chain: 8440
153rand4 variance: 24737.681213
154rand4 327680: 0.441000s
155
156rand5 distribution:
157 1: 0 (0.0000)
158 2: 0 (0.0000)
159 3: 0 (0.0000)
160 4: 0 (0.0000)
161 5: 0 (0.0000)
162 6: 0 (0.0000)
163 7: 0 (0.0000)
164 8: 0 (0.0000)
165 9: 0 (0.0000)
16610: 327680 (100.0000)
167rand5 collisions: 327679
168rand5 unused: 65535
169rand5 longest chain: 327680
170rand5 variance: 1638375.000000
171rand5 327680: 0.300000s
172
173rand6 distribution:
174 1: 32632 (9.9585)
175 2: 32959 (10.0583)
176 3: 32844 (10.0232)
177 4: 32660 (9.9670)
178 5: 32470 (9.9091)
179 6: 32908 (10.0427)
180 7: 32880 (10.0342)
181 8: 32775 (10.0021)
182 9: 32654 (9.9652)
18310: 32871 (10.0314)
184rand6 collisions: 311296
185rand6 unused: 49152
186rand6 longest chain: 49
187rand6 variance: 84.395538
188rand6 327680: 0.441000s
189
190rand7 distribution:
191 1: 327680 (100.0000)
192 2: 0 (0.0000)
193 3: 0 (0.0000)
194 4: 0 (0.0000)
195 5: 0 (0.0000)
196 6: 0 (0.0000)
197 7: 0 (0.0000)
198 8: 0 (0.0000)
199 9: 0 (0.0000)
20010: 0 (0.0000)
201rand7 collisions: 327679
202rand7 unused: 65535
203rand7 longest chain: 327680
204rand7 variance: 1638375.000000
205rand7 327680: 0.340000s
206
207rand8 distribution:
208 1: 32660 (9.9670)
209 2: 32595 (9.9472)
210 3: 32798 (10.0092)
211 4: 32815 (10.0143)
212 5: 32771 (10.0009)
213 6: 32731 (9.9887)
214 7: 32505 (9.9197)
215 8: 32920 (10.0464)
216 9: 32875 (10.0327)
21710: 32986 (10.0665)
218rand8 collisions: 262584
219rand8 unused: 440
220rand8 longest chain: 18
221rand8 variance: 5.009186
222rand8 327680: 0.471000s
223
224rand9 distribution:
225 1: 32660 (9.9670)
226 2: 32595 (9.9472)
227 3: 32798 (10.0092)
228 4: 32815 (10.0143)
229 5: 32771 (10.0009)
230 6: 32731 (9.9887)
231 7: 32505 (9.9197)
232 8: 32920 (10.0464)
233 9: 32875 (10.0327)
23410: 32986 (10.0665)
235rand9 collisions: 262584
236rand9 unused: 440
237rand9 longest chain: 18
238rand9 variance: 5.009186
239rand9 327680: 0.541000s
240
241rand0 distribution:
242 1: 32947 (10.0546)
243 2: 33048 (10.0854)
244 3: 32361 (9.8758)
245 4: 32755 (9.9960)
246 5: 32558 (9.9359)
247 6: 32809 (10.0125)
248 7: 32910 (10.0433)
249 8: 32640 (9.9609)
250 9: 32930 (10.0494)
25110: 32682 (9.9738)
252rand0 collisions: 262574
253rand0 unused: 430
254rand0 longest chain: 17
255rand0 variance: 4.976532
256rand0 327680: 0.951000s
As you can see, the later rand-family hash functions held their own against the big boys, producing comparable results, and even toppled the venerable allegro hash in distribution ability. rand8 was the winner in speed overall for the "good" functions, and was comparable for the other stats. Next I'll have to take another peek at Bob's hashmap code. ---- |
|
orz
Member #565
August 2000
|
Oh, also, those hash functions are unaffected by leading 0s, at least if they occur in sets of four. Consider initializing seed to len (as Bob Jenkins does if I recall correctly), or some other such fix for that. Here is the results of a test I wrote. The test generates a random 50 byte string of binary data, hashes it, changes a random bit, hashes it again, etc, (1<<18) times. The complication is, it checks that it hasn't accidentally gone back to a string it used earlier. For "norm", it changes any bit in the string, for "low" it changes only the first 48 bits, for "high" it changes only the last 48 bits. The result is the number of hashes produced that were already produced earlier.
Any value up to 15 can be reasonably dismissed as random fluctuations. Thus, OAAT and rand0 produce perfect results, superfast, rand8, and rand9 produce acceptable results, and everything else produces bad results. "md5", and "bobjenkins" are added as sanity checks. I'll write a more thorough test function tomorrow. |
|
Paul Pridham
Member #250
April 2000
|
Ahh, that's great. Could you post your test code when you've finished, so I can cobble together a comprehensive mega-test? And do you mean the leading 4 zeroes in the key, or the seed? ---- |
|
orz
Member #565
August 2000
|
Lets see if this works... Everything specific to your stuff is in h.cpp. The rest of it is random stuff related to random number generation and testing, which shares a bunch of code with this since it was a quick hack to get the random number tests working on hash functions. Using longer samples (up to 16 million hashes), hash_rand0 failed a few tests, but OAAT still prevailed. I've started a 256-million-hash test in an effort to get OAAT to break down and cry, but it will be a while before that finishes. Four leading 0s on the "key" will produce an identical result, like these two string "ab", "\0\0\0\0ab". If all went well, that should produce output like this:
edit: fixed an issue where tests were labelled incorrectly edit2: |
|
Paul Pridham
Member #250
April 2000
|
Oh, the four leading zeroes I mentioned are ascii characters, not bytes. Hmm, the rand series, though failing some tests, still seems to be doing respectably. I suspect that the failing may be in the actual rand portion of the "avalanche". ---- |
|
orz
Member #565
August 2000
|
I was confused when I said OAAT was passing... it wasn't, I had it displaying the wrong results for one category which it was failing. Still OAAT did pretty well, along with hash_rand0. I recommend trying hash functions with more than 32 bits of state. Anyway, it's of course impossible to create a perfect omni-purpose hash function, as there are tradeoffs between speed, quality, and extras (suitability for hardware implementation, cryptographic security, etc). But with a little trial-and-error, a little luck or insight, and maybe even a dab of theory, it ought to be possible to create a hash function that compares favorably to existing ones in almost any specific niche you care to aim at. Good luck. |
|
spellcaster
Member #1,493
September 2001
|
So, rand0 seems to be pretty good Also, let's say I'd use one of these algos to create a hashtable with say, 256 entries, would rand0 still be the candidate I want? -- |
|
Thomas Fjellstrom
Member #476
June 2000
|
I think thats the point of this whole thing, having good distrobution and speed for power of two sized hash tables. unlike pjw that requires say a table thats some "prime" number in size. -- |
|
Paul Pridham
Member #250
April 2000
|
lenny, since it's just a hash function, it really uses no memory other than the local variables needed to calculate the hash value. rand8/9 are faster than rand0, so I'd prefer rand8 for typical usage. All of my testing was aimed at power-of-two-sized tables, and I've shown that rand0/8/9()&255 will give you good distribution over such a table (the one I used for testing in the end was 65536 elements)... much better than pjw, crc, or allegro hashes. orz has shown that rand8 and rand9 break down somewhat with intensive testing, but the tests he used are rather high-octane. ---- |
|
orz
Member #565
August 2000
|
Anything that does well on the tests is likely to do well in real world circumstances. Of course, that's just a guess on my part - I don't have any large database of real-world data to try out. The biggest flaw in the tests (that I can think of, anyway...) is that they only compare hashes of strings of one specific length at time. Also, they look for flaws in all bits of the result, whereas in the real world, often only the high or low bits will be used. edit: On second thought, the tests also spend a great deal of effort looking for correlations in results for similar strings, which doesn't matter (under some circumstances) in real world hash tables. In short, the tests are overkill for hash functions intended for use in hash tables. I may revise them soon to ignore flaws that are irrelevant for normal hash tables.
|
|
|
1
2
|