/* * Chained hashing implemention * Copyright (C) 2009 neverfear.org. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include #include #include #include #include // For the hash function itself #include #ifdef linux # include #endif // Definitions, etc #include "chainedhashing.h" #define IS_PRIME 0 #define IS_NOT_PRIME 1 #define IS_ODD(x) (x % 2) #define NUMBER_TO_INDEX(x) (((x - 1) / 2) - 1) #define INDEX_TO_NUMBER(x) (2 * (x + 1)) + 1 // The following defines are used as part of the test mechanism. #define TEST_SEPARATOR "------------------------------" #define REMOVING_F " removing item '%s'\n" #define DESTROYDONE_F " tables have been destroyed\n" #define GETITEMDATA_F " get_item_data (0x%X, %s) = '%s'\n" #define GETITEMCOUNT_F " get_item_count (0x%X) = %d\n" #define GETLOADFACTOR_F " get_load_factor(0x%X) = %f\n" #define HASHTABLESIZE_F " hash_table_size = %d\n" #define INDENT_NUM 2 // The number of indents when printing tables #define RESIZE_FACTOR 2 // The multiplier of the existing table // size to get the max range of the new // table #define INPUT_NUMBER_BASE 10 // The Sieve of Erastothenes based algorithm // limit - must be odd int get_highest_prime(int limit) { char * soeList; int size = 0; int i; int j; int k; double sqrtlimit = sqrt(limit); size = ((limit - 1) / 2); soeList = (char*)calloc(size, sizeof(char)); if (!soeList) { return 0; } j = 3; while (j <= sqrtlimit) { // O(sqrt(N)) // first mark off all for(i = 2; i < limit; i++) { // O(N) k = j * i; if (k >= limit) break; if (IS_ODD(k)) { soeList[NUMBER_TO_INDEX(k)] = IS_NOT_PRIME; } } // find next j - the first index not labeled IS_NOT_PRIME i = 0; for(i = NUMBER_TO_INDEX(j) + 1; i < limit; i++) { // O(N) if (soeList[i] == IS_PRIME) { break; } } j = INDEX_TO_NUMBER(i); } // search backwards for the last prime for(i = size - 1; i >= 0; i--) { // O(N) if (soeList[i] == IS_PRIME) { free(soeList); return INDEX_TO_NUMBER(i); } } free(soeList); return 0; } /** * Create a new hash table of a given size. * @param size The size of the hash table. Should ideally be a prime. * @return A pointer to the newly allocated memory of the hashtable. */ HASHTABLE * create_hash_table(int size) { HASHTABLE * hash_table; hash_table = (HASHTABLE *)calloc(1, sizeof(HASHTABLE)); if (!hash_table) { return NULL; } hash_table->chains = (HASHCHAIN *)calloc(size, sizeof(HASHCHAIN)); hash_table->size = size; hash_table->count = 0; return hash_table; } /** * Create a new hash item. * @param key The hash item key (char *). * @param data A pointer to the data to attach to this item (void *). * @return A pointer to a new hashitem. */ HASHITEM * create_hash_item(char * key, void * data) { HASHITEM * item; item = (HASHITEM *)calloc(1, sizeof(HASHITEM)); item->key = key; // TODO: strdup item->data = data; item->hash = HASH(key); return item; } HASHITEM * clone_hash_item(HASHITEM * from) { HASHITEM * item; item = (HASHITEM *)calloc(1, sizeof(HASHITEM)); item->key = from->key; // TODO: strdup item->data = from->data; item->hash = from->hash; return item; } #define rot(x, k) (((x) << (k)) | ((x) >> (32 - (k)))) #define mix(a, b, c) { \ a -= c; a ^= rot(c, 4); c += b; \ b -= a; b ^= rot(a, 6); a += c; \ c -= b; c ^= rot(b, 8); b += a; \ a -= c; a ^= rot(c,16); c += b; \ b -= a; b ^= rot(a,19); a += c; \ c -= b; c ^= rot(b, 4); b += a; \ } #define final(a, b, c) { \ c ^= b; c -= rot(b,14); \ a ^= c; a -= rot(c,11); \ b ^= a; b -= rot(a,25); \ c ^= b; c -= rot(b,16); \ a ^= c; a -= rot(c,4); \ b ^= a; b -= rot(a,14); \ c ^= b; c -= rot(b,24); \ } #if ( \ ( \ defined(__BYTE_ORDER) && \ defined(__LITTLE_ENDIAN) && \ __BYTE_ORDER == __LITTLE_ENDIAN \ ) || ( \ defined(i386) || \ defined(__i386__) || \ defined(__i486__) || \ defined(__i586__) || \ defined(__i686__) || \ defined(vax) || \ defined(MIPSEL) \ ) \ ) # define HASH_LITTLE_ENDIAN 1 # define HASH_BIG_ENDIAN 0 #elif ( \ defined(__BYTE_ORDER) && \ defined(__BIG_ENDIAN) && \ __BYTE_ORDER == __BIG_ENDIAN \ ) || ( \ defined(sparc) || \ defined(POWERPC) || \ defined(mc68000) || \ defined(sel) \ ) # define HASH_LITTLE_ENDIAN 0 # define HASH_BIG_ENDIAN 1 #else # define HASH_LITTLE_ENDIAN 0 # define HASH_BIG_ENDIAN 0 #endif /* ------------------------------------------------------------------------------- hashlittle() -- hash a variable-length key into a 32-bit value k : the key (the unaligned variable-length array of bytes) length : the length of the key, counting by bytes initval : can be any 4-byte value Returns a 32-bit value. Every bit of the key affects every bit of the return value. Two keys differing by one or two bits will have totally different hash values. The best hash table sizes are powers of 2. There is no need to do mod a prime (mod is sooo slow!). If you need less than 32 bits, use a bitmask. For example, if you need only 10 bits, do h = (h & hashmask(10)); In which case, the hash table should have hashsize(10) elements. If you are hashing n strings (uint8_t **)k, do it like this: for (i=0, h=0; i 12) { a += k[0]; b += k[1]; c += k[2]; mix(a,b,c); length -= 12; k += 3; } #ifndef VALGRIND switch(length) { case 12: c += k[2]; b += k[1]; a += k[0]; break; case 11: c += k[2] & 0xffffff; b += k[1]; a += k[0]; break; case 10: c += k[2] & 0xffff; b += k[1]; a += k[0]; break; case 9: c += k[2] & 0xff; b += k[1]; a += k[0]; break; case 8: b += k[1]; a+=k[0]; break; case 7: b += k[1] & 0xffffff; a += k[0]; break; case 6: b += k[1] & 0xffff; a += k[0]; break; case 5: b += k[1] & 0xff; a += k[0]; break; case 4: a += k[0]; break; case 3: a += k[0] & 0xffffff; break; case 2: a += k[0] & 0xffff; break; case 1: a += k[0] & 0xff; break; case 0: return c; } #else /* make valgrind happy */ k8 = (const uint8_t *)k; switch(length) { case 12: c += k[2]; b += k[1]; a += k[0]; break; case 11: c += ((uint32_t)k8[10]) << 16; case 10: c += ((uint32_t)k8[9]) << 8; case 9: c += k8[8]; case 8: b += k[1]; a += k[0]; break; case 7: b +=((uint32_t)k8[6]) << 16; case 6: b += ((uint32_t)k8[5]) << 8; case 5: b += k8[4]; case 4: a += k[0]; break; case 3: a += ((uint32_t)k8[2]) << 16; case 2: a += ((uint32_t)k8[1]) << 8; case 1: a += k8[0]; break; case 0 : return c; } #endif /* !valgrind */ } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { const uint16_t *k = (const uint16_t *)key; const uint8_t *k8; while (length > 12) { a += k[0] + (((uint32_t)k[1]) << 16); b += k[2] + (((uint32_t)k[3]) << 16); c += k[4] + (((uint32_t)k[5]) << 16); mix(a,b,c); length -= 12; k += 6; } k8 = (const uint8_t *)k; switch(length) { case 12: c += k[4] + (((uint32_t)k[5]) << 16); b += k[2] + (((uint32_t)k[3]) << 16); a += k[0] + (((uint32_t)k[1]) << 16); break; case 11: c += ((uint32_t)k8[10]) << 16; case 10: c += k[4]; b += k[2] + (((uint32_t)k[3]) << 16); a += k[0] + (((uint32_t)k[1]) << 16); break; case 9: c+=k8[8]; case 8: b += k[2] + (((uint32_t)k[3]) << 16); a += k[0] + (((uint32_t)k[1]) << 16); break; case 7: b += ((uint32_t)k8[6]) << 16; case 6: b += k[2]; a += k[0] + (((uint32_t)k[1]) << 16); break; case 5: b += k8[4]; case 4: a += k[0] + (((uint32_t)k[1]) << 16); break; case 3: a += ((uint32_t)k8[2]) << 16; case 2: a += k[0]; break; case 1: a += k8[0]; break; case 0: return c; } } else { const uint8_t *k = (const uint8_t *)key; while (length > 12) { a += k[0]; a += ((uint32_t)k[1]) << 8; a += ((uint32_t)k[2]) << 16; a += ((uint32_t)k[3]) << 24; b += k[4]; b += ((uint32_t)k[5]) << 8; b += ((uint32_t)k[6]) << 16; b += ((uint32_t)k[7]) << 24; c += k[8]; c += ((uint32_t)k[9]) << 8; c += ((uint32_t)k[10]) << 16; c += ((uint32_t)k[11]) << 24; mix(a,b,c); length -= 12; k += 12; } switch(length) { case 12: c += ((uint32_t)k[11]) << 24; case 11: c += ((uint32_t)k[10]) << 16; case 10: c += ((uint32_t)k[9]) << 8; case 9 : c += k[8]; case 8 : b += ((uint32_t)k[7]) << 24; case 7 : b += ((uint32_t)k[6]) << 16; case 6 : b += ((uint32_t)k[5]) << 8; case 5 : b += k[4]; case 4 : a += ((uint32_t)k[3]) << 24; case 3 : a += ((uint32_t)k[2]) << 16; case 2 : a += ((uint32_t)k[1]) << 8; case 1 : a += k[0]; break; case 0 : return c; } } final(a,b,c); return c; } /** * The hash function. Outputs an integer 'hash' of the given key. * @param key The key to convert into an integer. * @return A hash code. */ unsigned int default_hash(char * key) { return abs(hashlittle(key, strlen(key), 0x1337)); } /** * Insert an item into the hash table * @param ht A pointer to the hash table to insert the item into * @param htsize The size of the hash table * @param item A pointer to the item to insert * TODO: Load factor checks & auto-initiate the resizing of the hash table */ void insert_item(HASHTABLE * hash_table, HASHITEM * item) { int hash_code; int index; HASHITEM * current; index = item->hash % hash_table->size; item->next = NULL; if (hash_table->chains[index]) { for(current = hash_table->chains[index]; current->next; current = current->next) { // search for first available slot } // current is now a pointer to the last item in the list current->next = item; } else { hash_table->chains[index] = item; } hash_table->count++; } /** * Removes an item from the list & returns the removed item. * @param hash_table A pointer to the hash table to insert the item into. * @param key The key to search for. * @return The removed item. This will need free()'ing by the user later. */ HASHITEM * remove_item(HASHTABLE * hash_table, char * key) { int hash_code = HASH(key); int index = hash_code % hash_table->size; int keylength = 0; HASHITEM * current = NULL; HASHITEM * previous = NULL; if (!hash_table->chains[index]) { return NULL; } keylength = strlen(key); for(current = hash_table->chains[index]; current; current = current->next) { if (strncmp(current->key, key, keylength) == 0) { if (previous) { previous->next = current->next; } else { // its a top level item hash_table->chains[index] = current->next; } hash_table->count--; return current; } previous = current; } return NULL; } /** * Retrieve the data element of the hashitem with the given key. * @param hash_table A pointer to the hash table to insert the item into. * @param key The key to search for. * @return The hashitem data field with the specified key. */ void * get_item_data(HASHTABLE * hash_table, char * key) { HASHITEM * item = NULL; item = get_item(hash_table, key); if (!item) { return NULL; } return item->data; } /** * Retrieve an entire hashitem with the given key * @param hash_table A pointer to the hash table to insert the item into * @param key The key to search for * @return The hashitem with the specified key */ HASHITEM * get_item(HASHTABLE * hash_table, char * key) { int index = 0; int keylength = 0; HASHITEM * current = NULL; index = HASH(key) % hash_table->size; if (!hash_table->chains[index]) { return NULL; } keylength = strlen(key); for(current = hash_table->chains[index]; current; current = current->next) { if (strncmp(current->key, key, keylength) == 0) { return current; } } return NULL; } /** * Count the number of elements within the supplied hashtable * @param hash_table A pointer to the source hash table * @return The number of items this table contains */ int get_item_count(HASHTABLE * hash_table) { int count = 0; int i = 0; HASHITEM * current = NULL; for(i = 0; i < hash_table->size; i++) { for(current = hash_table->chains[i]; current; current = current->next) { count++; } } return count; } /** * Calculate the current load factor of the supplied hash table. * @param hash_table A pointer to the source hash table. * @return The resulting load factor figure or -1 on failure. */ double get_load_factor(HASHTABLE * hash_table) { double item_count = (double)get_item_count(hash_table); if (hash_table->size) { return item_count / ((double)(hash_table->size)); } else { return -1; } } /** * Takes all the items in srctable, rehashes them and inserts them into destable. * @param srctable A pointer to the source hash table. * @param desttable A pointer to the destination hash table. */ void apply_rehash(HASHTABLE * src_hash_table, HASHTABLE * dst_hash_table) { int i = 0; HASHITEM * current = NULL; for(i = 0; i < src_hash_table->size; i++) { for(current = src_hash_table->chains[i]; current; current = current->next) { insert_item(dst_hash_table, clone_hash_item(current)); } } } /** * Resize and create a new hashtable and rehash the contents of the old one into the new one. * @param old_hash_table A pointer to the original hash table. * @param new_size The desired size of the new hash table. * @return A pointer to the new hashtable. */ HASHTABLE * do_hash_table_resize(HASHTABLE * old_hash_table, int new_size) { HASHTABLE * new_hash_table = create_hash_table(new_size); apply_rehash(old_hash_table, new_hash_table); return new_hash_table; } /** * Frees up the memory of the given hash table including all hashites contained * @param hash_table A pointer to the hash table * @note Will not free up memory allocated for the key or data sections of the hashitem */ void destroy_hash_table(HASHTABLE * hash_table) { // first destroy all the items in the table int i = 0; HASHITEM * current = NULL; HASHITEM * previous = NULL; for(i = 0; i < hash_table->size; i++) { for(current = hash_table->chains[i]; current;) { previous = current; current = current->next; free(previous); } } // now free the table if (hash_table) { free(hash_table); } } /** * Prints the hash table. * @param hash_table A pointer to the hash table. */ void print_hash_table(HASHTABLE * hash_table, int indent) { int i = 0; HASHITEM * current = NULL; char * indent_str = NULL; indent_str = (char *)malloc(indent + 1); memset(indent_str, ' ', indent); indent_str[indent] = '\0'; for(i = 0; i < hash_table->size; i++) { printf("%s%03d = {", indent_str, i); for(current = hash_table->chains[i]; current; current = current->next) { printf("'%s'", current->key); if (current->next) { printf(", "); } } printf("}\n"); } } void run_hash_test(int in_count, char ** in_array, int table_size) { HASHTABLE * hash_table = create_hash_table(table_size); HASHTABLE * new_hash_table = NULL; HASHITEM * removed_item = NULL; int new_size = 0; int i = 0; double load_factor = 0.0; int item_count = 0; char * item_data = 0; char * test_key = "define_me"; puts(TEST_SEPARATOR); puts("TEST: inserting items:"); puts(TEST_SEPARATOR); for (i = 2; i < in_count; i++) { insert_item(hash_table, create_hash_item(in_array[i], ":D")); } print_hash_table(hash_table, INDENT_NUM); puts(TEST_SEPARATOR); puts("TEST: retrieving table statistics:"); puts(TEST_SEPARATOR); item_data = (char *)get_item_data(hash_table, test_key); item_count = get_item_count (hash_table); load_factor = get_load_factor (hash_table); printf(GETITEMDATA_F, hash_table, test_key, item_data); printf(GETITEMCOUNT_F, hash_table, item_count); printf(GETLOADFACTOR_F, hash_table, load_factor); printf(HASHTABLESIZE_F, hash_table->size); puts(TEST_SEPARATOR); puts("TEST: removing items:"); puts(TEST_SEPARATOR); printf(REMOVING_F, in_array[2]); removed_item = remove_item(hash_table, in_array[2]); print_hash_table(hash_table, INDENT_NUM); free(removed_item); puts(TEST_SEPARATOR); puts("TEST: resizing of table:"); puts(TEST_SEPARATOR); // Double the field size new_size = get_highest_prime (2 * hash_table->size); new_hash_table = do_hash_table_resize(hash_table, new_size); print_hash_table(new_hash_table, INDENT_NUM); puts(TEST_SEPARATOR); puts("TEST: new table statistics:"); puts(TEST_SEPARATOR); item_data = (char *)get_item_data(new_hash_table, test_key); item_count = get_item_count (new_hash_table); load_factor = get_load_factor (new_hash_table); printf(GETITEMDATA_F, new_hash_table, test_key, item_data); printf(GETITEMCOUNT_F, new_hash_table, item_count); printf(GETLOADFACTOR_F, new_hash_table, load_factor); printf(HASHTABLESIZE_F, new_hash_table->size); puts(TEST_SEPARATOR); puts("TEST: destroying tables"); puts(TEST_SEPARATOR); destroy_hash_table(hash_table); destroy_hash_table(new_hash_table); printf(DESTROYDONE_F); } #ifdef BUILD_EXECUTABLE int main(int argc, char ** argv) { int table_size = 0; int max_table_size = 0; char * endptr; if (argc <= 2) { printf("Error, not enough arguements supplied\n"); return 1; } max_table_size = strtol(argv[1], &endptr, INPUT_NUMBER_BASE); if (endptr == argv[1]) { printf("Invalid number '%s'\n", argv[1]); return 4; } else if (max_table_size < 3) { printf("The supplied number is too small! The max table size must be at least 3\n"); return 2; } else if (max_table_size == LONG_MAX) { printf("The supplied number is too big! The max table size should be less than %d\n", LONG_MAX); return 3; } printf("Max table size = %d\n", max_table_size); printf("Calculating largest prime less than max table size to use as real table size\n"); table_size = get_highest_prime(max_table_size); printf("Resulting table size = %d\n", table_size); run_hash_test(argc, argv, table_size); return 0; } #endif