diff options
author | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2011-10-14 04:55:05 +0200 |
---|---|---|
committer | Robin Haberkorn <robin.haberkorn@googlemail.com> | 2011-10-14 04:55:05 +0200 |
commit | 6aa0e0017d7d0cddc006da885946934b06949a91 (patch) | |
tree | 66b688ec32e2f91266db760b1762f2a50cc52036 /libslang/src/util/perfhash.c | |
parent | a966db5b71328f6adf9dd767e64b322a3bd7ed9c (diff) | |
download | erlang-slang-fork-6aa0e0017d7d0cddc006da885946934b06949a91.tar.gz |
include libslang-1.4.9 and automatically build it and link erlang-slang against it
few (erlang) people will still have libslang-1.4.9 installed or spend time
to get it to link against the driver
Diffstat (limited to 'libslang/src/util/perfhash.c')
-rw-r--r-- | libslang/src/util/perfhash.c | 600 |
1 files changed, 600 insertions, 0 deletions
diff --git a/libslang/src/util/perfhash.c b/libslang/src/util/perfhash.c new file mode 100644 index 0000000..b593b19 --- /dev/null +++ b/libslang/src/util/perfhash.c @@ -0,0 +1,600 @@ +/* Copyright (c) 1998, 2001 John E. Davis + * + * You may distribute under the terms of either the GNU General Public + * License or the Perl Artistic License. + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +static char *C_Char_Map_Name = "Keyword_Hash_Table"; +static char *C_Hash_Function_Name = "keyword_hash"; +static char *C_Hash_Table_Type = "Keyword_Table_Type"; +static char *C_Hash_Table_Name = "Keyword_Table"; +static char *C_Is_Keyword_Function_Name = "is_keyword"; + +typedef struct +{ + unsigned int hash; + unsigned int len; + unsigned int freq_statistic; + char *name; + char *type; +} +String_Table_Type; + +static String_Table_Type String_Table [256]; + +static unsigned int Num_Strings; + +static unsigned int Max_Table_Value; +static unsigned int Max_String_Len; +static unsigned int Min_String_Len; +static unsigned int Max_Hash_Value; +static unsigned int Tweak_Step_Size; +static int Use_Length = 1; + +static int Char_Table_Map [256]; + +static void write_table (unsigned int min_hash, unsigned int max_hash) +{ + String_Table_Type *st, *st_max; + unsigned int i; + + fprintf (stdout, "\n\ +typedef SLCONST struct\n\ +{\n\ + char *name;\n\ + unsigned int type;\n\ +}\n\ +%s;\n\ +", + C_Hash_Table_Type); + + fprintf (stdout, "\nstatic %s %s [/* %u */] =\n{\n", + C_Hash_Table_Type, C_Hash_Table_Name, (max_hash - min_hash) + 1); + fprintf (stderr, "String Table Size: %u\n", (max_hash - min_hash) + 1); + + for (i = min_hash; i <= max_hash; i++) + { + st = String_Table; + st_max = st + Num_Strings; + while (st < st_max) + { + if ((unsigned int) st->hash == i) + break; + st++; + } + + if (st == st_max) + fprintf (stdout, " {NULL,0},\n"); + else + fprintf (stdout, " {\"%s\",\t%s},\n", st->name, st->type); + } + + fputs ("};\n\n", stdout); + + fprintf (stdout, "\ +static %s *%s (char *str, unsigned int len)\n\ +{\n\ + unsigned int hash;\n\ + char *name;\n\ + %s *kw;\n\ +\n\ + if ((len < MIN_KEYWORD_LEN)\n\ + || (len > MAX_KEYWORD_LEN))\n\ + return NULL;\n\ +\n\ + hash = %s (str, len);\n\ + if ((hash > MAX_HASH_VALUE) || (hash < MIN_HASH_VALUE))\n\ + return NULL;\n\ +\n\ + kw = &%s[hash - MIN_HASH_VALUE];\n\ + if ((NULL != (name = kw->name))\n\ + && (*str == *name)\n\ + && (0 == strcmp (str, name)))\n\ + return kw;\n\ + return NULL;\n\ +}\n\ +", + C_Hash_Table_Type, C_Is_Keyword_Function_Name, + C_Hash_Table_Type, C_Hash_Function_Name, C_Hash_Table_Name); +} + +static unsigned int hash_function (int *char_map, + unsigned char *s, unsigned int len) +{ + unsigned int sum; + + if (Use_Length) sum = len; + else sum = 0; + + while (len) + { + len--; + sum += (unsigned int) char_map [s[len]]; + } + return sum; +} + +static unsigned int Frequency_Table [256]; + +static void init_map (int *map) +{ + unsigned int i; + + for (i = 0; i < 256; i++) map [i] = -1; + + for (i = 0; i < Num_Strings; i++) + { + unsigned char *s; + + s = (unsigned char *) String_Table[i].name; + + while (*s != 0) + { + map [*s] = 0; + s++; + } + } +} + +static void write_map (int *map) +{ + unsigned int i; + unsigned int min_hash, max_hash; + char *type; + + min_hash = 0xFFFF; + max_hash = 0; + for (i = 0; i < Num_Strings; i++) + { + unsigned int h = String_Table[i].hash; + if (h < min_hash) min_hash = h; + if (h > max_hash) max_hash = h; +#if 0 + fprintf (stdout, "-->%s\t%u\n", String_Table[i].name, h); +#endif + } + + fprintf (stdout, "#ifndef SLCONST\n#define SLCONST const\n#endif\n"); + fprintf (stdout, "#define MIN_HASH_VALUE\t%u\n", min_hash); + fprintf (stdout, "#define MAX_HASH_VALUE\t%u\n", max_hash); + fprintf (stdout, "#define MIN_KEYWORD_LEN %u\n", Min_String_Len); + fprintf (stdout, "#define MAX_KEYWORD_LEN %u\n", Max_String_Len); + + for (i = 0; i < 256; i++) + if (map[i] == -1) map[i] = (max_hash + 1); + + if (max_hash + 1 < 0xFF) + type = "unsigned char"; + else if (max_hash + 1 < 0xFFFF) + type = "unsigned short"; + else + type = "unsigned long"; + + fprintf (stderr, "Hash Table is of type %s with max hash %u\n", + type, max_hash); + + fprintf (stdout, "\nstatic %s %s [256] =\n", + type, C_Char_Map_Name); + + fprintf (stdout, "{\n "); + + for (i = 0; i < 255; i++) + { + fprintf (stdout, "%3d, ", map[i]); + if ((i % 16) == 15) + fputs ("\n ", stdout); + } + fprintf (stdout, "%3d\n};\n", map[255]); + + fputs ("\n", stdout); + + fprintf (stdout, "static %s %s (char *s, unsigned int len)\n", + type, C_Hash_Function_Name); + +fprintf (stdout,"\ +{\n\ + unsigned int sum;\n\ +\n\ + sum = %s;\n\ + while (len)\n\ + {\n\ + len--;\n\ + sum += (unsigned int) %s [(unsigned char)s[len]];\n\ + }\n\ + return sum;\n\ +}\n\ +", + (Use_Length ? "len" : "0"), + C_Char_Map_Name); + + write_table (min_hash, max_hash); +} + + +static unsigned int compute_hash (unsigned int i) +{ + String_Table_Type *s; + unsigned int hash; + + s = String_Table + i; + hash = hash_function (Char_Table_Map, (unsigned char *) s->name, s->len); + return s->hash = hash; +} + + +static int tweak_character (unsigned char ch, unsigned int bad) +{ + unsigned int val; + unsigned int val_save; + unsigned int i, j; + unsigned int nvalues; + + val_save = (unsigned int) Char_Table_Map [ch]; + nvalues = Max_Table_Value; + + val = 0; + while (nvalues) + { + val += Tweak_Step_Size; + val = val % Max_Table_Value; + + Char_Table_Map[ch] = (int) val; + + for (i = 0; i <= bad; i++) + { + unsigned int hash = compute_hash (i); + for (j = 0; j < i; j++) + { + if (hash == String_Table[j].hash) + break; + } + if (j != i) + break; + } + + if (i > bad) return 0; + nvalues--; + } + + Char_Table_Map [ch] = (int) val_save; +#if 0 + /* reset hashes */ + for (i = 0; i <= bad; i++) + (void) compute_hash (i); +#endif + return -1; +} + + +static void sort_according_to_frequency (unsigned char *s, unsigned int len) +{ + /* since len is small (I hope), I will be lazy */ + unsigned char si, sj; + unsigned int fi; + unsigned int i, j, imax; + + imax = len - 1; + for (i = 0; i < imax; i++) + { + si = s[i]; + fi = Frequency_Table [si]; + + for (j = i + 1; j < len; j++) + { + sj = s[j]; + if (Frequency_Table[sj] < fi) + { + s[i] = sj; + s[j] = si; + break; + } + } + } +} + +static void create_frequency_table (unsigned int num) +{ + unsigned int i; + + memset ((char *) Frequency_Table, 0, sizeof(Frequency_Table)); + + for (i = 0; i < num; i++) + { + unsigned char *s, ch; + + s = String_Table[i].name; + while (0 != (ch = *s)) + { + Frequency_Table [ch] += 1; + s++; + } + } +} + +static int tweak_hash_function (unsigned int bad, unsigned int good) +{ + unsigned char unique_chars [256]; + unsigned char bad_chars [256], good_chars[256]; + unsigned char *s; + unsigned int i, len; + + + memset ((char *)unique_chars, 0, sizeof (unique_chars)); + memset ((char *)bad_chars, 0, sizeof (bad_chars)); + memset ((char *)good_chars, 0, sizeof (good_chars)); + + s = (unsigned char *) String_Table[bad].name; + while (*s != 0) + { + bad_chars [*s] = 1; + s++; + } + + s = (unsigned char *) String_Table[good].name; + while (*s != 0) + { + good_chars [*s] = 1; + s++; + } + + /* Find out the characters that are in good or bad, and not both. That + * way we are free to manipulate those to avoid the collision. + */ + len = 0; + for (i = 0; i < 256; i++) + { + if (bad_chars[i]) + { + if (good_chars [i] == 0) + unique_chars [len++] = i; + } + else if (good_chars [i]) + unique_chars [len++] = i; + } + + /* Unfortunately, the unique_chars may already be part of the words + * that have already been hashed. So, sort them according to how often + * they occur and deal with those that occur the least often first. + */ +#if 1 + create_frequency_table (bad); +#endif + sort_according_to_frequency (unique_chars, len); + + for (i = 0; i < len; i++) + { + unsigned char ch = unique_chars [i]; + if (0 == tweak_character (ch, bad)) + return 0; + } + + return -1; +} + +static int perfect_hash (void) +{ + unsigned int i, j; + unsigned int hash; + int has_collisions = 1; + + for (i = 0; i < Num_Strings; i++) + { + hash = compute_hash (i); + for (j = 0; j < i; j++) + { + if (hash != String_Table[j].hash) + continue; + + /* Oops. We have a collision. tweak_hash_function will + * adjust the hash table array to resolve this collision + * and ensure that previous ones remain resolved. + */ + if (-1 == tweak_hash_function (i, j)) + { + has_collisions = 1; + /* return -1; */ + } + break; + } + } + + if (has_collisions == 0) + return 0; + + has_collisions = 0; + /* Now check for collisions */ + for (i = 0; i < Num_Strings; i++) + { + char *s = String_Table[i].name; + hash = compute_hash (i); + + for (j = 0; j < i; j++) + { + if (hash != String_Table[j].hash) + continue; + + has_collisions++; + fprintf (stderr, "Collision: %s, %s\n", s, String_Table [j].name); + } + } + + if (has_collisions) + return -1; + + return 0; +} + + +static int sort_function (String_Table_Type *a, String_Table_Type *b) +{ + return b->freq_statistic - a->freq_statistic; +} + +static void sort_strings (void) +{ + unsigned int i; + void (*qsort_fun) (String_Table_Type *, unsigned int, + unsigned int, int (*)(String_Table_Type *, String_Table_Type *)); + + for (i = 0; i < Num_Strings; i++) + { + int f; + unsigned char *s; + + f = 0; + s = (unsigned char *) String_Table[i].name; + while (*s != 0) f += Frequency_Table [*s++]; + String_Table [i].freq_statistic = f; + } + + qsort_fun = (void (*) (String_Table_Type *, unsigned int, + unsigned int, + int (*)(String_Table_Type *, String_Table_Type *))) + qsort; + + qsort_fun (String_Table, Num_Strings, sizeof (String_Table_Type), + sort_function); +} + +int main (int argc, char **argv) +{ + char line[256]; + unsigned int count; + unsigned int min_char; + unsigned int max_char; + unsigned int min_len; + unsigned int max_len; + unsigned int i; + + Tweak_Step_Size = 5; + + if (isatty (0) || (argc > 2)) + { + fprintf (stderr, "Usage: %s [step-size] < keywords > hash.c\n", argv[0]); + fprintf (stderr, " Default step-size is %u\n", Tweak_Step_Size); + return 1; + } + + if (argc > 1) + { + if (1 != sscanf (argv[1], "%u", &Tweak_Step_Size)) + Tweak_Step_Size = 5; + + if ((Tweak_Step_Size % 2) == 0) Tweak_Step_Size++; + } + + count = 0; + min_char = 255; + max_char = 0; + max_len = 0; + min_len = 0xFFFF; + + while (NULL != fgets (line, sizeof (line), stdin)) + { + char *s; + unsigned int len; + char *type; + + if (*line == '%') continue; + + if (count == 256) + { + fprintf (stderr, "Only 256 keywords permitted.\n"); + return 1; + } + + len = strlen (line); + if (len && (line [len - 1] == '\n')) + { + len--; + line [len] = 0; + } + + if (len == 0) + continue; + + s = malloc (len + 1); + if (s == NULL) + { + fprintf (stderr, "Malloc error.\n"); + return 1; + } + strcpy (s, line); + String_Table[count].name = s; + + type = s; + while (*type && (*type != ' ') && (*type != '\t')) + type++; + + len = (unsigned int) (type - s); + String_Table [count].len = len; + if (len > max_len) max_len = len; + if (len < min_len) min_len = len; + + if (*type != 0) + { + *type++ = 0; + while ((*type == ' ') || (*type == '\t')) + type++; + } + + String_Table [count].type = type; + + + for (i = 0; i < len; i++) + { + unsigned char ch; + + ch = (unsigned char) s[i]; + + if (ch < min_char) + min_char = ch; + if (ch > max_char) + max_char = ch; + } + + count++; + } + + + Max_String_Len = max_len; + Min_String_Len = min_len; + + Num_Strings = count; + Max_Table_Value = 1; + while (Max_Table_Value < Num_Strings) + Max_Table_Value = Max_Table_Value << 1; + + Max_Hash_Value = Max_Table_Value * Max_String_Len; + + fprintf (stderr, "Theoretical Max_Table_Value: %u\n", Max_Table_Value); + fprintf (stderr, "Theoretical Max_Hash_Value: %u\n", Max_Hash_Value); + + create_frequency_table (Num_Strings); + sort_strings (); + + init_map (Char_Table_Map); + if (-1 == perfect_hash ()) + { + fprintf (stderr, "Hash failed.\n"); + return -1; + } + + fprintf (stderr, "Success.\n"); + + fprintf (stdout, "/* Perfect hash generated by command line:\n *"); + for (i = 0; i < argc; i++) + fprintf (stdout, " %s", argv[i]); + fputs ("\n */\n", stdout); + + write_map (Char_Table_Map); + return 0; +} + + + + + |