hashtable.c

   1 /*
   2  * Routines to provide a memory-efficient hashtable.
   3  *
   4  * Copyright (C) 2007-2020 Wayne Davison
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 3 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along
  17  * with this program; if not, visit the http://fsf.org website.
  18  */
  19
  20 #include "rsync.h"
  21
  22 #define HASH_LOAD_LIMIT(size) ((size)*3/4)
  23
  24 struct hashtable *hashtable_create(int size, int key64)
  25 {
  26         int req = size;
  27         struct hashtable *tbl;
  28         int node_size = key64 ? sizeof (struct ht_int64_node)
  29                               : sizeof (struct ht_int32_node);
  30
  31         /* Pick a power of 2 that can hold the requested size. */
  32         if (size & (size-1) || size < 16) {
  33                 size = 16;
  34                 while (size < req)
  35                         size *= 2;
  36         }
  37
  38         if (!(tbl = new(struct hashtable))
  39          || !(tbl->nodes = new_array0(char, size * node_size)))
  40                 out_of_memory("hashtable_create");
  41         tbl->size = size;
  42         tbl->entries = 0;
  43         tbl->node_size = node_size;
  44         tbl->key64 = key64 ? 1 : 0;
  45
  46         if (DEBUG_GTE(HASH, 1)) {
  47                 char buf[32];
  48                 if (req != size)
  49                         snprintf(buf, sizeof buf, "req: %d, ", req);
  50                 else
  51                         *buf = '\0';
  52                 rprintf(FINFO, "[%s] created hashtable %lx (%ssize: %d, keys: %d-bit)\n",
  53                         who_am_i(), (long)tbl, buf, size, key64 ? 64 : 32);
  54         }
  55
  56         return tbl;
  57 }
  58
  59 void hashtable_destroy(struct hashtable *tbl)
  60 {
  61         if (DEBUG_GTE(HASH, 1)) {
  62                 rprintf(FINFO, "[%s] destroyed hashtable %lx (size: %d, keys: %d-bit)\n",
  63                         who_am_i(), (long)tbl, tbl->size, tbl->key64 ? 64 : 32);
  64         }
  65         free(tbl->nodes);
  66         free(tbl);
  67 }
  68
  69 /* Returns the node that holds the indicated key if it exists. When it does not
  70  * exist, it returns either NULL (when data_when_new is NULL), or it returns a
  71  * new node with its node->data set to the indicated value.
  72  *
  73  * If your code doesn't know the data value for a new node in advance (usually
  74  * because it doesn't know if a node is new or not) you should pass in a unique
  75  * (non-0) value that you can use to check if the returned node is new. You can
  76  * then overwrite the data with any value you want (even 0) since it only needs
  77  * to be different than whatever data_when_new value you use later on.
  78  *
  79  * This return is a void* just because it might be pointing at a ht_int32_node
  80  * or a ht_int64_node, and that makes the caller's assignment a little easier. */
  81 void *hashtable_find(struct hashtable *tbl, int64 key, void *data_when_new)
  82 {
  83         int key64 = tbl->key64;
  84         struct ht_int32_node *node;
  85         uint32 ndx;
  86
  87         if (key64 ? key == 0 : (int32)key == 0) {
  88                 rprintf(FERROR, "Internal hashtable error: illegal key supplied!\n");
  89                 exit_cleanup(RERR_MESSAGEIO);
  90         }
  91
  92         if (data_when_new && tbl->entries > HASH_LOAD_LIMIT(tbl->size)) {
  93                 void *old_nodes = tbl->nodes;
  94                 int size = tbl->size * 2;
  95                 int i;
  96
  97                 if (!(tbl->nodes = new_array0(char, size * tbl->node_size)))
  98                         out_of_memory("hashtable_node");
  99                 tbl->size = size;
 100                 tbl->entries = 0;
 101
 102                 if (DEBUG_GTE(HASH, 1)) {
 103                         rprintf(FINFO, "[%s] growing hashtable %lx (size: %d, keys: %d-bit)\n",
 104                                 who_am_i(), (long)tbl, size, key64 ? 64 : 32);
 105                 }
 106
 107                 for (i = size / 2; i-- > 0; ) {
 108                         struct ht_int32_node *move_node = HT_NODE(tbl, old_nodes, i);
 109                         int64 move_key = HT_KEY(move_node, key64);
 110                         if (move_key == 0)
 111                                 continue;
 112                         if (move_node->data)
 113                                 hashtable_find(tbl, move_key, move_node->data);
 114                         else {
 115                                 node = hashtable_find(tbl, move_key, "");
 116                                 node->data = 0;
 117                         }
 118                 }
 119
 120                 free(old_nodes);
 121         }
 122
 123         if (!key64) {
 124                 /* Based on Jenkins One-at-a-time hash. */
 125                 uchar buf[4], *keyp = buf;
 126                 int i;
 127
 128                 SIVALu(buf, 0, key);
 129                 for (ndx = 0, i = 0; i < 4; i++) {
 130                         ndx += keyp[i];
 131                         ndx += (ndx << 10);
 132                         ndx ^= (ndx >> 6);
 133                 }
 134                 ndx += (ndx << 3);
 135                 ndx ^= (ndx >> 11);
 136                 ndx += (ndx << 15);
 137         } else {
 138                 /* Based on Jenkins hashword() from lookup3.c. */
 139                 uint32 a, b, c;
 140
 141                 /* Set up the internal state */
 142                 a = b = c = 0xdeadbeef + (8 << 2);
 143
 144 #define rot(x,k) (((x)<<(k)) ^ ((x)>>(32-(k))))
 145 #if SIZEOF_INT64 >= 8
 146                 b += (uint32)(key >> 32);
 147 #endif
 148                 a += (uint32)key;
 149                 c ^= b; c -= rot(b, 14);
 150                 a ^= c; a -= rot(c, 11);
 151                 b ^= a; b -= rot(a, 25);
 152                 c ^= b; c -= rot(b, 16);
 153                 a ^= c; a -= rot(c, 4);
 154                 b ^= a; b -= rot(a, 14);
 155                 c ^= b; c -= rot(b, 24);
 156 #undef rot
 157                 ndx = c;
 158         }
 159
 160         /* If it already exists, return the node.  If we're not
 161          * allocating, return NULL if the key is not found. */
 162         while (1) {
 163                 int64 nkey;
 164
 165                 ndx &= tbl->size - 1;
 166                 node = HT_NODE(tbl, tbl->nodes, ndx);
 167                 nkey = HT_KEY(node, key64);
 168
 169                 if (nkey == key)
 170                         return node;
 171                 if (nkey == 0) {
 172                         if (!data_when_new)
 173                                 return NULL;
 174                         break;
 175                 }
 176                 ndx++;
 177         }
 178
 179         /* Take over this empty spot and then return the node. */
 180         if (key64)
 181                 ((struct ht_int64_node*)node)->key = key;
 182         else
 183                 node->key = (int32)key;
 184         node->data = data_when_new;
 185         tbl->entries++;
 186         return node;
 187 }
 188
 189 #ifndef WORDS_BIGENDIAN
 190 # define HASH_LITTLE_ENDIAN 1
 191 # define HASH_BIG_ENDIAN 0
 192 #else
 193 # define HASH_LITTLE_ENDIAN 0
 194 # define HASH_BIG_ENDIAN 1
 195 #endif
 196
 197 /*
 198  -------------------------------------------------------------------------------
 199  lookup3.c, by Bob Jenkins, May 2006, Public Domain.
 200
 201  These are functions for producing 32-bit hashes for hash table lookup.
 202  hash_word(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
 203  are externally useful functions.  Routines to test the hash are included
 204  if SELF_TEST is defined.  You can use this free for any purpose.  It's in
 205  the public domain.  It has no warranty.
 206
 207  You probably want to use hashlittle().  hashlittle() and hashbig()
 208  hash byte arrays.  hashlittle() is is faster than hashbig() on
 209  little-endian machines.  Intel and AMD are little-endian machines.
 210  On second thought, you probably want hashlittle2(), which is identical to
 211  hashlittle() except it returns two 32-bit hashes for the price of one.
 212  You could implement hashbig2() if you wanted but I haven't bothered here.
 213
 214  If you want to find a hash of, say, exactly 7 integers, do
 215    a = i1;  b = i2;  c = i3;
 216    mix(a,b,c);
 217    a += i4; b += i5; c += i6;
 218    mix(a,b,c);
 219    a += i7;
 220    final(a,b,c);
 221  then use c as the hash value.  If you have a variable length array of
 222  4-byte integers to hash, use hash_word().  If you have a byte array (like
 223  a character string), use hashlittle().  If you have several byte arrays, or
 224  a mix of things, see the comments above hashlittle().
 225
 226  Why is this so big?  I read 12 bytes at a time into 3 4-byte integers,
 227  then mix those integers.  This is fast (you can do a lot more thorough
 228  mixing with 12*3 instructions on 3 integers than you can with 3 instructions
 229  on 1 byte), but shoehorning those bytes into integers efficiently is messy.
 230 */
 231
 232 #define hashsize(n) ((uint32_t)1<<(n))
 233 #define hashmask(n) (hashsize(n)-1)
 234 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
 235
 236 /*
 237  -------------------------------------------------------------------------------
 238  mix -- mix 3 32-bit values reversibly.
 239
 240  This is reversible, so any information in (a,b,c) before mix() is
 241  still in (a,b,c) after mix().
 242
 243  If four pairs of (a,b,c) inputs are run through mix(), or through
 244  mix() in reverse, there are at least 32 bits of the output that
 245  are sometimes the same for one pair and different for another pair.
 246  This was tested for:
 247  * pairs that differed by one bit, by two bits, in any combination
 248    of top bits of (a,b,c), or in any combination of bottom bits of
 249    (a,b,c).
 250  * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
 251    the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
 252    is commonly produced by subtraction) look like a single 1-bit
 253    difference.
 254  * the base values were pseudorandom, all zero but one bit set, or
 255    all zero plus a counter that starts at zero.
 256
 257  Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
 258  satisfy this are
 259      4  6  8 16 19  4
 260      9 15  3 18 27 15
 261     14  9  3  7 17  3
 262  Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
 263  for "differ" defined as + with a one-bit base and a two-bit delta.  I
 264  used http://burtleburtle.net/bob/hash/avalanche.html to choose
 265  the operations, constants, and arrangements of the variables.
 266
 267  This does not achieve avalanche.  There are input bits of (a,b,c)
 268  that fail to affect some output bits of (a,b,c), especially of a.  The
 269  most thoroughly mixed value is c, but it doesn't really even achieve
 270  avalanche in c.
 271
 272  This allows some parallelism.  Read-after-writes are good at doubling
 273  the number of bits affected, so the goal of mixing pulls in the opposite
 274  direction as the goal of parallelism.  I did what I could.  Rotates
 275  seem to cost as much as shifts on every machine I could lay my hands
 276  on, and rotates are much kinder to the top and bottom bits, so I used
 277  rotates.
 278  -------------------------------------------------------------------------------
 279 */
 280 #define mix(a,b,c) \
 281 { \
 282   a -= c;  a ^= rot(c, 4);  c += b; \
 283   b -= a;  b ^= rot(a, 6);  a += c; \
 284   c -= b;  c ^= rot(b, 8);  b += a; \
 285   a -= c;  a ^= rot(c,16);  c += b; \
 286   b -= a;  b ^= rot(a,19);  a += c; \
 287   c -= b;  c ^= rot(b, 4);  b += a; \
 288 }
 289
 290 /*
 291  -------------------------------------------------------------------------------
 292  final -- final mixing of 3 32-bit values (a,b,c) into c
 293
 294  Pairs of (a,b,c) values differing in only a few bits will usually
 295  produce values of c that look totally different.  This was tested for
 296  * pairs that differed by one bit, by two bits, in any combination
 297    of top bits of (a,b,c), or in any combination of bottom bits of
 298    (a,b,c).
 299  * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
 300    the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
 301    is commonly produced by subtraction) look like a single 1-bit
 302    difference.
 303  * the base values were pseudorandom, all zero but one bit set, or
 304    all zero plus a counter that starts at zero.
 305
 306  These constants passed:
 307   14 11 25 16 4 14 24
 308   12 14 25 16 4 14 24
 309  and these came close:
 310    4  8 15 26 3 22 24
 311   10  8 15 26 3 22 24
 312   11  8 15 26 3 22 24
 313  -------------------------------------------------------------------------------
 314 */
 315 #define final(a,b,c) \
 316 { \
 317   c ^= b; c -= rot(b,14); \
 318   a ^= c; a -= rot(c,11); \
 319   b ^= a; b -= rot(a,25); \
 320   c ^= b; c -= rot(b,16); \
 321   a ^= c; a -= rot(c,4);  \
 322   b ^= a; b -= rot(a,14); \
 323   c ^= b; c -= rot(b,24); \
 324 }
 325
 326
 327 /*
 328  -------------------------------------------------------------------------------
 329  hashlittle() -- hash a variable-length key into a 32-bit value
 330    k       : the key (the unaligned variable-length array of bytes)
 331    length  : the length of the key, counting by bytes
 332    val2    : IN: can be any 4-byte value OUT: second 32 bit hash.
 333  Returns a 32-bit value.  Every bit of the key affects every bit of
 334  the return value.  Two keys differing by one or two bits will have
 335  totally different hash values.  Note that the return value is better
 336  mixed than val2, so use that first.
 337
 338  The best hash table sizes are powers of 2.  There is no need to do
 339  mod a prime (mod is sooo slow!).  If you need less than 32 bits,
 340  use a bitmask.  For example, if you need only 10 bits, do
 341    h = (h & hashmask(10));
 342  In which case, the hash table should have hashsize(10) elements.
 343
 344  If you are hashing n strings (uint8_t **)k, do it like this:
 345    for (i=0, h=0; i<n; ++i) h = hashlittle( k[i], len[i], h);
 346
 347  By Bob Jenkins, 2006.  bob_jenkins@burtleburtle.net.  You may use this
 348  code any way you wish, private, educational, or commercial.  It's free.
 349
 350  Use for hash table lookup, or anything where one collision in 2^^32 is
 351  acceptable.  Do NOT use for cryptographic purposes.
 352  -------------------------------------------------------------------------------
 353 */
 354
 355 uint32_t hashlittle(const void *key, size_t length)
 356 {
 357   uint32_t a,b,c;                                          /* internal state */
 358   union { const void *ptr; size_t i; } u;     /* needed for Mac Powerbook G4 */
 359
 360   /* Set up the internal state */
 361   a = b = c = 0xdeadbeef + ((uint32_t)length);
 362
 363   u.ptr = key;
 364   if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
 365     const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */
 366     const uint8_t  *k8;
 367
 368     /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
 369     while (length > 12)
 370     {
 371       a += k[0];
 372       b += k[1];
 373       c += k[2];
 374       mix(a,b,c);
 375       length -= 12;
 376       k += 3;
 377     }
 378
 379     /*----------------------------- handle the last (probably partial) block */
 380     k8 = (const uint8_t *)k;
 381     switch(length)
 382     {
 383     case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
 384     case 11: c+=((uint32_t)k8[10])<<16;  /* fall through */
 385     case 10: c+=((uint32_t)k8[9])<<8;    /* fall through */
 386     case 9 : c+=k8[8];                   /* fall through */
 387     case 8 : b+=k[1]; a+=k[0]; break;
 388     case 7 : b+=((uint32_t)k8[6])<<16;   /* fall through */
 389     case 6 : b+=((uint32_t)k8[5])<<8;    /* fall through */
 390     case 5 : b+=k8[4];                   /* fall through */
 391     case 4 : a+=k[0]; break;
 392     case 3 : a+=((uint32_t)k8[2])<<16;   /* fall through */
 393     case 2 : a+=((uint32_t)k8[1])<<8;    /* fall through */
 394     case 1 : a+=k8[0]; break;
 395     case 0 : return c;
 396     }
 397   } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
 398     const uint16_t *k = (const uint16_t *)key;         /* read 16-bit chunks */
 399     const uint8_t  *k8;
 400
 401     /*--------------- all but last block: aligned reads and different mixing */
 402     while (length > 12)
 403     {
 404       a += k[0] + (((uint32_t)k[1])<<16);
 405       b += k[2] + (((uint32_t)k[3])<<16);
 406       c += k[4] + (((uint32_t)k[5])<<16);
 407       mix(a,b,c);
 408       length -= 12;
 409       k += 6;
 410     }
 411
 412     /*----------------------------- handle the last (probably partial) block */
 413     k8 = (const uint8_t *)k;
 414     switch(length)
 415     {
 416     case 12: c+=k[4]+(((uint32_t)k[5])<<16);
 417              b+=k[2]+(((uint32_t)k[3])<<16);
 418              a+=k[0]+(((uint32_t)k[1])<<16);
 419              break;
 420     case 11: c+=((uint32_t)k8[10])<<16;     /* fall through */
 421     case 10: c+=k[4];
 422              b+=k[2]+(((uint32_t)k[3])<<16);
 423              a+=k[0]+(((uint32_t)k[1])<<16);
 424              break;
 425     case 9 : c+=k8[8];                      /* fall through */
 426     case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
 427              a+=k[0]+(((uint32_t)k[1])<<16);
 428              break;
 429     case 7 : b+=((uint32_t)k8[6])<<16;      /* fall through */
 430     case 6 : b+=k[2];
 431              a+=k[0]+(((uint32_t)k[1])<<16);
 432              break;
 433     case 5 : b+=k8[4];                      /* fall through */
 434     case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
 435              break;
 436     case 3 : a+=((uint32_t)k8[2])<<16;      /* fall through */
 437     case 2 : a+=k[0];
 438              break;
 439     case 1 : a+=k8[0];
 440              break;
 441     case 0 : return c;                     /* zero length requires no mixing */
 442     }
 443
 444   } else {                        /* need to read the key one byte at a time */
 445     const uint8_t *k = (const uint8_t *)key;
 446
 447     /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
 448     while (length > 12)
 449     {
 450       a += k[0];
 451       a += ((uint32_t)k[1])<<8;
 452       a += ((uint32_t)k[2])<<16;
 453       a += ((uint32_t)k[3])<<24;
 454       b += k[4];
 455       b += ((uint32_t)k[5])<<8;
 456       b += ((uint32_t)k[6])<<16;
 457       b += ((uint32_t)k[7])<<24;
 458       c += k[8];
 459       c += ((uint32_t)k[9])<<8;
 460       c += ((uint32_t)k[10])<<16;
 461       c += ((uint32_t)k[11])<<24;
 462       mix(a,b,c);
 463       length -= 12;
 464       k += 12;
 465     }
 466
 467     /*-------------------------------- last block: affect all 32 bits of (c) */
 468     switch(length)                   /* all the case statements fall through */
 469     {
 470     case 12: c+=((uint32_t)k[11])<<24;
 471              /* FALLTHROUGH */
 472     case 11: c+=((uint32_t)k[10])<<16;
 473              /* FALLTHROUGH */
 474     case 10: c+=((uint32_t)k[9])<<8;
 475              /* FALLTHROUGH */
 476     case 9 : c+=k[8];
 477              /* FALLTHROUGH */
 478     case 8 : b+=((uint32_t)k[7])<<24;
 479              /* FALLTHROUGH */
 480     case 7 : b+=((uint32_t)k[6])<<16;
 481              /* FALLTHROUGH */
 482     case 6 : b+=((uint32_t)k[5])<<8;
 483              /* FALLTHROUGH */
 484     case 5 : b+=k[4];
 485              /* FALLTHROUGH */
 486     case 4 : a+=((uint32_t)k[3])<<24;
 487              /* FALLTHROUGH */
 488     case 3 : a+=((uint32_t)k[2])<<16;
 489              /* FALLTHROUGH */
 490     case 2 : a+=((uint32_t)k[1])<<8;
 491              /* FALLTHROUGH */
 492     case 1 : a+=k[0];
 493              break;
 494     case 0 : return c;
 495     }
 496   }
 497
 498   final(a,b,c);
 499   return c;
 500 }