/*
* Sample hash functions for CS70. Most of these functions hash
* C++ strings (string), although a few work with integers. Not
* all of them have been extensively tested to be sure they generate
* good hash values.
*
* All of the functions fall into one of two classes: integer hashers
* and string hashers. The integer hashers all take a signed integer
* key; the string hashers take a C++ string (const string&).
*
* All functions return an unsigned integer suitable for use as a
* hash-table index. With one exception (hashStringBase256), this
* integer can be larger than the size of the hash table, so the
* caller must reduce the integer modulo the table size.
*
* The functions are:
*
* hashStringCRC Uses a method sometimes called "modified CRC".
* This is a simple, fast algorithm that
* generates good hash values. It is closely
* related to the hash algorithm used in ispell.
* hashStringBase256 Treats a string as a very large base-256
* number and returns that number modulo the
* table size. This is the only function that
* accepts the table size as an argument, and the
* only one that guarantees that its result will
* be in the range 0 to tableSize - 1. It should
* produce good results but has not been tested
* extensively.
* hashPJW Uses a method recommended by Aho, Sethi, and
* Ullman's book on compilers. Seems to generate
* pretty good values.
* hashIntegerMultiply Uses a multiplicative method. Not extensively
* tested.
* hashStringBUZ Uses a randomization method that is intended
* to produce better hash values. Seems to
* generate pretty good values, but doesn't do
* much better with the ispell dictionary than
* hashStringCRC.
* hashIntegerBUZ Uses the same randomization method by treating
* an integer as a 4-character string.
* hashIntegerAbs Returns the absolute value of the hashed
* integer. Not extensively tested. Should work
* well if the integer keys are well distributed
* and the table size is prime.
* hashStringPreiss Uses the method recommended in Preiss's book
* on data structures. Not extensively tested.
* Probably produces poor hash values for long
* strings that differ only in the leading
* characters, especially if those characters are
* transpositions of each other.
* hashStringWeiss1 Uses the method recommended in the first
* edition of Weiss's book on data structures.
* Not extensively tested. Produces poor hash
* values for long strings that differ only in
* the leading characters.
* hashStringWeiss2 Uses the method recommended in the second
* edition of Weiss's book on data structures.
* Not extensively tested. Produces poor hash
* values for long strings that differ only in
* the leading characters.
* hashStringFNV Uses the Fowler/Noll/Vo hash function, which
* is reputed to produce excellent results. Not
* extensively tested.
*/
#include "hashfuncs.hh"
#include
#include
/*
* Table of Contents:
*/
unsigned int hashStringCRC(const string& key);
                                // Hash a string using the fake-CRC XOR method
unsigned int hashStringBase256(const string& key, unsigned int tableSize);
                                // Hash a string by interpreting it in base 256
unsigned int hashPJW(const string& key);
                                // Hash a string with Aho/Sethi/Ullman's method
unsigned int hashIntegerMultiply(int key);
                                // Hash an integer multiplicatively
unsigned int hashStringBUZ(const string& key);
                                // Hash a string with the BUZ method
unsigned int hashIntegerBUZ(int key);
                                // Hash an integer with the BUZ method
static unsigned int hashBUZ(const unsigned char* key, int keylen);
                                // Helper function for BUZ hashing (file-local)
unsigned int hashIntegerAbs(int key);
                                // Hash an integer by taking the absolute value
unsigned int hashStringPreiss(const string& key);
                                // Hash a string using Preiss's method
unsigned int hashStringWeiss1(const string& key);
                                // Hash a string using Weiss's method
                                // ..(first edition)
unsigned int hashStringWeiss2(const string& key);
                                // Hash a string using Weiss's method
                                // ..(second edition)
unsigned int hashStringFNV(const string& key);
                                // Hash a string with Fowler/Noll/Vo's method
/*
 * Constants used by the various hash methods.  Every mask and shift
 * that depends on the size of a machine word is derived from
 * WORD_WIDTH, so none of them is tied to a 32-bit int.  (On a machine
 * with a 32-bit int the derived values are exactly the classic ones:
 * a PJW mask of 0xf0000000 and a PJW right shift of 24.)
 */
static const unsigned int BYTE_WIDTH = 8;
                                // Number of bits in a byte
static const unsigned int WORD_WIDTH = sizeof (int) * BYTE_WIDTH;
                                // Number of bits in a machine word (int), used
                                // ..by several algorithms.
static const unsigned int CRC_HASH_SHIFT = 5;
                                // How much to shift hash by, per char hashed
static const unsigned int PJW_HASH_SHIFT = 4;
                                // How much to shift hash by, per char hashed
static const unsigned int PJW_HASH_RIGHT_SHIFT =
    WORD_WIDTH - 2 * PJW_HASH_SHIFT;
                                // Right-shift amount, if top 4 bits NZ: moves
                                // ..the top PJW_HASH_SHIFT bits down so they
                                // ..sit just above the low PJW_HASH_SHIFT bits
static const unsigned int PJW_HASH_MASK =
    ~0U << (WORD_WIDTH - PJW_HASH_SHIFT);
                                // Mask for extracting top 4 bits
static const unsigned int PREISS_HASH_SHIFT = 6;
                                // How much to shift hash by, per char hashed
static const unsigned int PREISS_HASH_MASK =
    ~0U << (WORD_WIDTH - PREISS_HASH_SHIFT);
                                // Mask for extracting top bits
static const unsigned int WEISS_HASH_SHIFT = 5;
                                // How much to shift hash by, per char hashed
static const unsigned int WEISS_HASH_MULTIPLIER = 37;
                                // How much to multiply by, per char hashed
/*
* The FNV hash function depends on the width of the machine word.
* This code is written for 32-bit machines. For a larger machine,
* enable the appropriate constants and change the use of the
* constants in the code.
*
* Note: WORD_BIT (the number of bits in an int) is specified by
* POSIX/X/Open for <limits.h>, not by the ANSI C or C++ standards, so
* some compilers don't define it. If it's undefined, we will assume
* that it is 32 (true for most computers in the early 2000's).
*/
#ifndef WORD_BIT
#define WORD_BIT 32             // Fallback when <limits.h> doesn't supply it
#endif // WORD_BIT
/*
 * Exactly one of the following groups is compiled, selected by
 * WORD_BIT.  Each supplies the starting value (offset basis) and the
 * per-character multiplier (FNV prime) for that word width.
 */
#if WORD_BIT == 32
static const unsigned int FNV_HASH_OFFSET = 2166136261u;
                                // Where to start FNV hash
static const unsigned int FNV_HASH_MULTIPLIER = 16777619u;
                                // How much to multiply for FNV hash: the
                                // ..32-bit FNV prime, 2^24 + 2^8 + 0x93
#endif // WORD_BIT
#if WORD_BIT == 64
static const unsigned int FNV_HASH_OFFSET = 14695981039346656037u;
                                // Where to start FNV hash
static const unsigned int FNV_HASH_MULTIPLIER = 1099511628211u;
                                // How much to multiply for FNV hash
#endif // WORD_BIT
#if WORD_BIT == 128
static const unsigned int FNV_HASH_OFFSET =
    275519064689413815358837431229664493455u;
                                // Where to start FNV hash
static const unsigned int FNV_HASH_MULTIPLIER = 309485009821345068724781401u;
                                // How much to multiply for FNV hash
#endif // WORD_BIT
#if WORD_BIT == 256
static const unsigned int FNV_HASH_OFFSET =
    100029257958052580907070968620625704837092796014241193945225284501741471925557u;
                                // Where to start FNV hash
static const unsigned int FNV_HASH_MULTIPLIER =
    374144419156711147060143317175368453031918731002211u;
                                // How much to multiply for FNV hash
#endif // WORD_BIT
/*
* Constants for the BUZ algorithm. BUZINIT is the initial value of
* the hash function. The lookup table is used to widen and randomize
* the bytes of the key. All values were obtained from /dev/random on a
* Linux box.
*/
const unsigned int BUZINIT = 0x7b4402a2;
                                // Starting hash value for every BUZ pass

/*
 * One random 32-bit word per possible byte value (256 entries).  The
 * table is only ever read, so it is declared const to keep it from
 * being modified by accident.
 */
static const unsigned int BUZ_Table[256] =
    {
    0xc70bb269, 0x13299943, 0xe9cee5e1, 0x32119a79,
    0xc2365851, 0x169ee8e3, 0x094c1ed8, 0x58e1d4e0,
    0x96eb1762, 0xfe296797, 0x89082f47, 0x27d5078d,
    0x8ebb9de0, 0x14df49e0, 0x38d21c63, 0x1f5b2770,
    0xaa6b0150, 0x7b0b656e, 0x98d37bc2, 0x4d7e85ec,
    0x987910e2, 0xb8cbac89, 0xa3f664a3, 0xeca3003c,
    0x7b364137, 0xb2a6edae, 0x5ef72906, 0x64a9e7b7,
    0x28cd6520, 0xca3c72df, 0x57398ce2, 0x8db893d4,
    0x0a5995cd, 0x2d109fb4, 0x0491162f, 0xb3488737,
    0x6fc4eb03, 0x9903cb21, 0xe82ff831, 0xb03ff8e5,
    0x054836ca, 0x5335e6f8, 0x01396c2a, 0xf9000899,
    0x03ed9d63, 0x2bf6946b, 0x9097fa8b, 0xacd8dfc7,
    0x8488b8a6, 0x0e39cd2e, 0xac1a4517, 0xcd49e035,
    0xe98b7e7b, 0xd3571502, 0xd602805e, 0xe7143cfe,
    0x46db0a6b, 0x0a4c9ebe, 0x4e2e1ca7, 0x3040fc62,
    0xe8818c02, 0x37155e7b, 0xe44ba138, 0x43cacdd4,
    0x53d986ba, 0xdd4dca35, 0x0f680f71, 0x6c1a551e,
    0x74263e95, 0xcfc4f5d5, 0x37b8ef45, 0xc00ac71d,
    0x3b059e0d, 0x208bc754, 0x41335fbe, 0x785a0ffc,
    0x189f024f, 0xd669c2d8, 0xe1b20f87, 0xba2550da,
    0x10167369, 0x85fad38f, 0x97d20e4e, 0x5bc0da5e,
    0x80799570, 0x93eb4058, 0x139042a6, 0x40b34bf6,
    0x15c21dfa, 0x8f852660, 0xa3d20fb3, 0x3d175cf9,
    0x792441a8, 0xdc5e71b5, 0x925f6350, 0x66e8d08b,
    0xc4606b59, 0x85d8b88c, 0x1ea4f459, 0x664f62bc,
    0x77407de3, 0x73d158ca, 0xb76ab172, 0xe9ed1aeb,
    0x93dc2009, 0xeb9da6ac, 0x3d26cf05, 0x675132bc,
    0xc29196fe, 0x2a62486f, 0x914e75e1, 0xa1c31883,
    0x1c28291c, 0xc73c668c, 0xf4ac07e6, 0x87c9a9ac,
    0xb7196ea7, 0x67cb7fa2, 0x55987797, 0x29ce38ea,
    0x427361b3, 0x5b5667a6, 0x68a72fb0, 0xcef8235a,
    0xd06e8f5b, 0x4d3633f5, 0x214d3a19, 0xbd09ec15,
    0x5c61c24b, 0x3928573c, 0x26083ab8, 0x857a5dee,
    0x3203e50c, 0x52a1a713, 0xa8270ee2, 0xdfb643a9,
    0x7797c1f3, 0x0f8ddc9f, 0x9368de21, 0x638ebd4e,
    0xd91808d7, 0x28ce69b8, 0xe424b0ce, 0xfe52fdef,
    0x89126c74, 0xdb5f3d91, 0xba488f47, 0x2b15cdb8,
    0xa517b0f9, 0x53950632, 0x1159546a, 0xe50f65a3,
    0x5f26b5d1, 0x68a3a955, 0xc2b78ea1, 0x49c33701,
    0x45457aee, 0xd49b550a, 0x244379b8, 0xec826af5,
    0x4fa6e0c9, 0xd4633425, 0x82f0bd85, 0xc23ccc2e,
    0xac73e11a, 0xdc94b283, 0x13e59bb2, 0x23b4880e,
    0x1d295c45, 0xef67488c, 0x6b74149e, 0xdf90d4ac,
    0xfc6e65a9, 0x406a3734, 0x86999303, 0xc73e7180,
    0x3c8a0b31, 0x75fa9249, 0xaca5e0e2, 0x4d0cc60d,
    0x4b174606, 0x836fb602, 0x4f9fc83a, 0xe16477a7,
    0xda1506a9, 0x905b28b7, 0x4229f5c2, 0xdf4c9144,
    0x731888e9, 0x2e37421f, 0x0c67c385, 0x44a2e520,
    0xfe3ee655, 0x92547582, 0x9525f4d4, 0xdaa8caf8,
    0xa25bd583, 0x0e315733, 0xd35fea29, 0xc9cfaa0f,
    0xc6bdf7e9, 0xa48b4e01, 0xfd30ffe0, 0xd0f63421,
    0x2b84803d, 0xe1b368a4, 0xbae5daa5, 0x4dd6336a,
    0xb60c4030, 0x7bb552f1, 0xf1b91481, 0xc8929b82,
    0xa1c22bf8, 0xd585aae4, 0x17fb4f6a, 0xa9c0d32e,
    0x2036f9b5, 0x3a95d611, 0xd554f25d, 0x3441153f,
    0x7fa89ff4, 0x8f91241c, 0x4b2cc5a9, 0xd3035a00,
    0x5c80707f, 0xe610fd47, 0xf60958c2, 0xb55a6fe7,
    0x3e1ba335, 0x2dead082, 0x1a8877e8, 0xd0791aad,
    0x9706ee52, 0xfb1dc525, 0x7fa1ba54, 0x6a3e9f81,
    0xa85a906a, 0x86ce1b46, 0x3b05833e, 0xc8d8fdcb,
    0x44e606c5, 0x8807beb2, 0xe46047d3, 0x85b9f5f8,
    0x56ed0cba, 0x3cf4e646, 0x970fd9dd, 0xd0600895,
    0x2d0a5f92, 0x891c4220, 0x017fbfe0, 0x4dde2016,
    0x3a6e421d, 0x7e3f2285, 0xdf7956e3, 0x52fdcf83
    };
/*
* Hash a string using the modified CRC method. The basic idea of
* this function is that the hash value is rotated left by 5 bits and
* then the next character is exclusive-or'ed in to the hash value.
* The implementation is complicated by the lack of a rotate operation
* in C++.
*
* This hash function does not work well with table sizes that are a
* power of two.
*/
unsigned int hashStringCRC(     // Hash a string, modified-CRC method
    const string& key)          // Key to be hashed
{
    /*
     * C++ (as used in this file) has no rotate operator, so a left
     * rotate by CRC_HASH_SHIFT bits is assembled from two shifts: the
     * left shift makes room at the bottom, and the right shift brings
     * the bits that fell off the top back down into that room.
     */
    const unsigned int wrapShift = WORD_WIDTH - CRC_HASH_SHIFT;

    unsigned int hashValue = 0;
    for (string::size_type pos = 0; pos < key.size(); ++pos)
    {
        const unsigned int rotated =
            (hashValue << CRC_HASH_SHIFT) | (hashValue >> wrapShift);

        /*
         * Fold the next key character into the rotated value.
         */
        hashValue = rotated ^ static_cast<unsigned int>(key[pos]);
    }
    return hashValue;
}
/*
* Hash a string by interpreting it as a base-256 number. Unlike the
* other hash functions in this file, this function must be passed the
* table size as an argument and performs the modulo function itself
* (otherwise it wouldn't be able to generate a correct result).
*
* Note that instead of multiplying by 256, we shift left by 8 bits
* (BYTE_WIDTH). This is faster on almost all machines, and happens to be
* a bit easier to write in C++ due to the definition of BYTE_WIDTH.
*/
unsigned int hashStringBase256(
    const string& key,          // Key to be hashed
    unsigned int tableSize)     // Size of hash table (hash modulus)
{
    /*
     * A modulus of zero is meaningless (and would divide by zero), so
     * an empty table always hashes to slot 0.
     */
    if (tableSize == 0)
        return 0;

    /*
     * The running value is kept in a type of at least 64 bits.  It is
     * always reduced below tableSize before the next shift, so the
     * shifted value needs at most BYTE_WIDTH more bits than tableSize
     * itself.  Doing this in an unsigned int would silently drop high
     * bits whenever tableSize exceeds 2^(WORD_WIDTH - BYTE_WIDTH) and
     * the result would no longer be the base-256 number modulo
     * tableSize.
     */
    unsigned long long hashValue = 0;
    for (string::const_iterator i = key.begin(); i != key.end(); ++i)
    {
        /*
         * Each character is one base-256 digit in the range 0..255.
         * Going through unsigned char keeps characters with the high
         * bit set from being sign-extended on machines where plain
         * char is signed.
         */
        const unsigned int digit = static_cast<unsigned char>(*i);
        hashValue = ((hashValue << BYTE_WIDTH) + digit) % tableSize;
    }

    // hashValue < tableSize here, so it fits in an unsigned int.
    return static_cast<unsigned int>(hashValue);
}
/*
* Hash a string using an algorithm taken from Aho, Sethi, and Ullman,
* "Compilers: Principles, Techniques, and Tools," Addison-Wesley,
* 1985, p. 436. PJW stands for Peter J. Weinberger, who apparently
* originally suggested the function.
*
* The basic idea of this algorithm is similar to that of the modified
* CRC algorithm, except that instead of shifting the top bits right
* so that they line up with the newly emptied bottom bits (a rotate),
* the top bits are shifted only far enough to line up with the top
* half of the character just XOR-ed into the hash value.
*/
unsigned int hashPJW(           // Hash a string, Aho/Sethi/Ullman
    const string& key)          // Key to be hashed
{
    unsigned int hashValue = 0;
    string::const_iterator cursor = key.begin();
    while (cursor != key.end())
    {
        /*
         * Make room for the next character and add it in.
         */
        hashValue <<= PJW_HASH_SHIFT;
        hashValue += static_cast<unsigned int>(*cursor);
        ++cursor;

        /*
         * Pick off the bits selected by PJW_HASH_MASK.  XOR-ing them
         * with the hash clears them in place, and XOR-ing the
         * right-shifted copy folds them back in lower down.  When the
         * selected bits are all zero this step changes nothing.
         */
        const unsigned int highBits = hashValue & PJW_HASH_MASK;
        const unsigned int foldPattern =
            highBits | (highBits >> PJW_HASH_RIGHT_SHIFT);
        hashValue ^= foldPattern;
    }
    return hashValue;
}
/*
* Hash an integer by multiplication. This algorithm was suggested by
* Knuth.
*/
unsigned int hashIntegerMultiply( // Hash an integer by multiplication
    int key)                    // Key to be hashed
{
    /*
     * Convert to unsigned BEFORE adding 3.  Computing key + 3 as a
     * signed int overflows (undefined behavior) for the three largest
     * int values; unsigned arithmetic wraps around modulo 2^WORD_WIDTH
     * instead, which is what a hash function wants.  For every key
     * where the signed addition was well defined the result is
     * unchanged.
     */
    const unsigned int unsignedKey = static_cast<unsigned int>(key);
    return unsignedKey * (unsignedKey + 3U);
}
/*
* Hash a string using the BUZ algorithm. See the helper function for
* more information on the origins and operation of the underlying
* algorithm.
*
* After the string has been hashed according to the BUZ algorithm,
* the resulting integer is re-hashed twice. Bob Uzgalis claimed in
* his lecture that this produced better hash values.
*
* The BUZ algorithm is the only string-hashing method in this file
* that generates non-sequential hash values when the keys differ by
* one in the last character (e.g., "foo1" and "foo2"). For this
* reason, BUZ is a good choice if you are using linear probing and
* you expect that many of your keys will differ only in the last
* character (such as variable names).
*/
unsigned int hashStringBUZ (
    const string& key)          // Key to be hashed
{
    /*
     * Hash the string using the BUZ method (see hashBUZ): rotate the
     * hash left by one bit, then XOR in the table entry selected by
     * the next character.
     */
    unsigned int hashValue = BUZINIT;   // Result
    for (string::const_iterator i = key.begin(); i != key.end(); ++i)
    {
        /*
         * The table index must be the character's value as an
         * unsigned byte (0..255).  Converting a plain char straight
         * to unsigned int would sign-extend characters with the high
         * bit set on machines where char is signed, producing an
         * index far outside BUZ_Table.
         */
        const unsigned int ch = static_cast<unsigned char>(*i);

        /*
         * Rotate left by one: the bit shifted out of the top
         * reappears as the new bottom bit.
         */
        const unsigned int rotated =
            (hashValue << 1) | (hashValue >> (WORD_WIDTH - 1));
        hashValue = rotated ^ BUZ_Table[ch];
    }

    /*
     * Re-hash the hash value twice in hopes of getting more
     * randomness.  Viewing hashValue through a pointer to unsigned
     * char treats its bytes, in memory order, as a short key for
     * hashBUZ.
     */
    hashValue = hashBUZ(reinterpret_cast<const unsigned char*>(&hashValue),
                        sizeof hashValue);
    return hashBUZ(reinterpret_cast<const unsigned char*>(&hashValue),
                   sizeof hashValue);
}
/*
* Hash an integer using the BUZ algorithm. There's nothing fancy
* here; see the comments at the end of hashStringBUZ and the comments
* for hashBUZ for more information. We hash three times in hopes of
* getting more randomness.
*/
unsigned int hashIntegerBUZ (
    int key)                    /* Key to be hashed */
{
    /*
     * Three chained passes of hashBUZ.  Each pass views the previous
     * value (starting with the key itself) as a run of raw bytes and
     * hashes those bytes.
     */
    const unsigned int firstPass =
        hashBUZ(reinterpret_cast<const unsigned char*>(&key), sizeof key);
    const unsigned int secondPass =
        hashBUZ(reinterpret_cast<const unsigned char*>(&firstPass),
                sizeof firstPass);
    return hashBUZ(reinterpret_cast<const unsigned char*>(&secondPass),
                   sizeof secondPass);
}
/*
* Do one pass of the BUZ algorithm on a fixed key.
*
* The BUZ algorithm was invented by Robert (Bob) Uzgalis. I don't
* have a reference for it yet, because I learned it from a lecture,
* not a paper. The basic idea is to use a (constant) lookup table of
* random numbers to generate wider random bits for each input
* character.
*
* The innards of the function are very similar to the modified CRC
* method, with one minor and one major change. The minor change is
* that the hash value is rotated only one bit at a time, rather than
* 5. The major change is that instead of XOR-ing the key character
* in directly, the character is used as an index to look up a 32-bit
* random number in a table. The idea is that you get 32 bits of
* information into the hash value at each step, rather than 8. For
* that reason, hashBUZ should work better with very short string keys
* and very small integer keys.
*/
static unsigned int hashBUZ (
    const unsigned char* key,   /* Key to be hashed */
    int keylen)                 /* Length of the key */
{
    unsigned int hashValue = BUZINIT;
    for (int index = 0; index < keylen; ++index)
    {
        /*
         * Rotate the hash left by one bit (the old top bit becomes
         * the new bottom bit), then XOR in the random table word
         * selected by the current key byte.
         */
        const unsigned int rotated =
            (hashValue << 1) | (hashValue >> (WORD_WIDTH - 1));
        hashValue = rotated ^ BUZ_Table[key[index]];
    }
    return hashValue;
}
/*
* Hash an integer by taking the absolute value. This is recommended
* in Bruno Preiss, "Data Structures and Algorithms with
* Object-Oriented Design Patterns in C++", Wiley, 1999, p. 210.
*
* I (Prof. Kuenning) do not have any particular reason to believe
* that this is a good hash function.
*/
unsigned int hashIntegerAbs(
    int key)                    /* Key to be hashed */
{
    /*
     * Negate in unsigned arithmetic rather than as a signed int.
     * Negating the most negative int as a signed value overflows
     * (undefined behavior); subtracting its unsigned conversion from
     * zero wraps around to the correct magnitude instead.  All other
     * keys give the same result as a plain signed negation.
     */
    const unsigned int unsignedKey = static_cast<unsigned int>(key);
    if (key >= 0)
        return unsignedKey;
    else
        return 0U - unsignedKey;
}
/*
* Hash a string using the method given in Bruno Preiss, "Data
* Structures and Algorithms with Object-Oriented Design Patterns in
* C++", Wiley, 1999, p. 213.
*
* This is NOT a very good hash function, especially with non-prime
* table sizes.
*
* This function is somewhat related to the modified CRC function.
* However, instead of right-shifting the top bits back down so that
* they have a further effect on the low bits, they are simply folded
* back into the same (top) bits of the new hash value. The net
* result is that the first few characters of the key will only have
* an effect on the top bits of the hash value. If the table size is
* a power of 2, this important early-character information will be
* completely lost when the hash value is taken modulo the table size.
*/
unsigned int hashStringPreiss(
    const string& key)          // Key to be hashed
{
    unsigned int hashValue = 0;
    for (string::size_type pos = 0; pos < key.size(); ++pos)
    {
        /*
         * The bits about to be shifted off the top are kept where
         * they are and XOR-ed with the shifted value, so they only
         * ever influence the top bits of the result.
         */
        const unsigned int keptTopBits = hashValue & PREISS_HASH_MASK;
        const unsigned int shifted = hashValue << PREISS_HASH_SHIFT;
        hashValue =
            keptTopBits ^ shifted ^ static_cast<unsigned int>(key[pos]);
    }
    return hashValue;
}
/*
* Hash a string using the method given in Mark Allen Weiss,
* "Algorithms, Data Structures, and Problem Solving with C++" (first
* edition), Addison-Wesley, 1996, p. 611.
*
* I (Prof. Kuenning) am suspicious of this hash function, but some
* very quick tests suggest that it may work better than expected.
*
* Again, this function is related to the modified CRC function.
* However, there is no attempt to preserve the top bits of the hash
* value. Instead, the bottom few bits of the function are just the
* XOR of all the input characters. This means that early information in the
* key will be almost completely lost.
*/
unsigned int hashStringWeiss1(
    const string& key)          // Key to be hashed
{
    unsigned int hashValue = 0;
    for (string::size_type pos = 0; pos < key.size(); ++pos)
    {
        /*
         * Combine the old hash, the old hash shifted left, and the
         * next character with XOR.  Bits shifted off the top are
         * simply discarded.
         */
        const unsigned int shifted = hashValue << WEISS_HASH_SHIFT;
        const unsigned int ch = static_cast<unsigned int>(key[pos]);
        hashValue ^= shifted ^ ch;
    }
    return hashValue;
}
/*
* Hash a string using the method given in Mark Allen Weiss,
* "Algorithms, Data Structures, and Problem Solving with C++" (second
* edition), Addison-Wesley, 1999, p. 728.
*
* I (Prof. Kuenning) am very suspicious of this hash function.
*
* Again, this function is related to the modified CRC function.
* However, as with hashStringWeiss1, there is no attempt to preserve
* the top bits of the hash value. Since the multiplier is not a
* power of two, the bottom few bits (about 5) are a bit more
* complicated than the XOR of all the input keys. But again, the
* early information in a long key will be lost.
*/
unsigned int hashStringWeiss2(
    const string& key)          // Key to be hashed
{
    unsigned int hashValue = 0;
    string::const_iterator cursor = key.begin();
    while (cursor != key.end())
    {
        /*
         * Scale the running hash by the multiplier, then add the next
         * character.  Unsigned arithmetic wraps on overflow.
         */
        hashValue *= WEISS_HASH_MULTIPLIER;
        hashValue += static_cast<unsigned int>(*cursor);
        ++cursor;
    }
    return hashValue;
}
/*
* Hash a string using the method of Fowler, Noll, and Vo, given in
* http://www.isthe.com/chongo/tech/comp/fnv.
*
* Again, this function is related to the modified CRC function.
* However, the multiplier has been very carefully selected by
* experimentation.
*/
unsigned int hashStringFNV(
    const string& key)          // Key to be hashed
{
    unsigned int hashValue = FNV_HASH_OFFSET;
    for (string::const_iterator i = key.begin(); i != key.end(); ++i)
    {
        /*
         * One FNV step: multiply by the FNV multiplier, then XOR in
         * the next byte of the key.  The byte is taken as an unsigned
         * value (0..255) so that only the low bits of the hash are
         * touched by the XOR; converting a plain char directly would
         * sign-extend characters with the high bit set on machines
         * where char is signed and flip all the upper bits as well.
         */
        hashValue *= FNV_HASH_MULTIPLIER;
        hashValue ^= static_cast<unsigned char>(*i);
    }
    return hashValue;
}