1 From: Jorrit Jongma <git@jongma.org>
3 - MD5 optimization in block matching phase:
5 MD5 hashes computed during rsync's block matching phase are independent
6 and thus possible to process in parallel. This code processes 4 blocks
7 in parallel if SSE2 is available, or 8 if AVX2 is available. An increase
8 of performance (or decrease of CPU usage) of up to 6x has been measured.
10 A prefetching algorithm is used to predict and load upcoming blocks, as
11 this prevents the need for extensive modifications to other parts of
12 the rsync sources to get this working.
14 This remains compatible with existing rsync builds using MD5 checksums.
16 - MD5P8 whole-file checksum:
18 Splits the input up into 8 independent streams (64-byte interleave), and
19 produces a final checksum based on the end state of those 8 streams. If
20 parallelization of MD5 hashing is available, the performance gain (or
21 CPU usage decrease) is 2x to 6x compared to traditional MD5.
23 The rsync versions on both ends of the connection need MD5P8 support
24 built-in for it to be used.
26 xxHash is still preferred (and faster), but this provides a reasonably
27 fast fallback for the case where xxHash libraries are not available at build time.
30 based-on: 194cee671d5e178f20c4494f41911fa8db942935
31 diff --git a/Makefile.in b/Makefile.in
34 @@ -29,14 +29,14 @@ SHELL=/bin/sh
38 -SIMD_x86_64=simd-checksum-x86_64.o
39 +SIMD_x86_64=simd-checksum-x86_64.o simd-md5-parallel-x86_64.o
40 ASM_x86_64=lib/md5-asm-x86_64.o
42 GENFILES=configure.sh aclocal.m4 config.h.in proto.h proto.h-tstamp rsync.1 rsync.1.html \
43 rsync-ssl.1 rsync-ssl.1.html rsyncd.conf.5 rsyncd.conf.5.html
44 HEADERS=byteorder.h config.h errcode.h proto.h rsync.h ifuncs.h itypes.h inums.h \
45 lib/pool_alloc.h lib/mdigest.h lib/md-defines.h version.h
46 -LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o \
47 +LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o lib/md5p8.o \
48 lib/permstring.o lib/pool_alloc.o lib/sysacls.o lib/sysxattrs.o @LIBOBJS@
49 zlib_OBJS=zlib/deflate.o zlib/inffast.o zlib/inflate.o zlib/inftrees.o \
50 zlib/trees.o zlib/zutil.o zlib/adler32.o zlib/compress.o zlib/crc32.o
51 @@ -135,6 +135,9 @@ rounding.h: rounding.c rsync.h proto.h
52 simd-checksum-x86_64.o: simd-checksum-x86_64.cpp
53 @$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-checksum-x86_64.cpp
55 +simd-md5-parallel-x86_64.o: simd-md5-parallel-x86_64.cpp
56 + @$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-md5-parallel-x86_64.cpp
58 lib/md5-asm-x86_64.o: lib/md5-asm-x86_64.S config.h lib/md-defines.h
59 @$(srcdir)/cmdormsg disable-asm $(CC) -I. @NOEXECSTACK@ -c -o $@ $(srcdir)/lib/md5-asm-x86_64.S
61 diff --git a/checksum.c b/checksum.c
64 @@ -52,6 +52,7 @@ struct name_num_obj valid_checksums = {
65 { CSUM_XXH64, "xxh64", NULL },
66 { CSUM_XXH64, "xxhash", NULL },
68 + { CSUM_MD5P8, "md5p8", NULL },
69 { CSUM_MD5, "md5", NULL },
70 { CSUM_MD4, "md4", NULL },
71 { CSUM_NONE, "none", NULL },
72 @@ -141,6 +142,7 @@ int csum_len_for_type(int cst, BOOL flist_csum)
75 return MD4_DIGEST_LEN;
78 return MD5_DIGEST_LEN;
80 @@ -167,6 +169,7 @@ int canonical_checksum(int csum_type)
88 @@ -179,7 +182,9 @@ int canonical_checksum(int csum_type)
92 -#ifndef HAVE_SIMD /* See simd-checksum-*.cpp. */
93 +#ifdef HAVE_SIMD /* See simd-checksum-*.cpp. */
94 +#define get_checksum2 get_checksum2_nosimd
97 a simple 32 bit checksum that can be updated from either end
98 (inspired by Mark Adler's Adler-32 checksum)
99 @@ -200,9 +205,18 @@ uint32 get_checksum1(char *buf1, int32 len)
101 return (s1 & 0xffff) + (s2 << 16);
104 +void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
108 +void checksum2_disable_prefetch()
113 -void get_checksum2(char *buf, int32 len, char *sum)
114 +/* Renamed to get_checksum2_nosimd() with HAVE_SIMD */
115 +void get_checksum2(char *buf, int32 len, char *sum, UNUSED(OFF_T prefetch_offset))
117 switch (xfersum_type) {
118 #ifdef SUPPORT_XXHASH
119 @@ -221,6 +235,7 @@ void get_checksum2(char *buf, int32 len, char *sum)
123 + case CSUM_MD5P8: /* == CSUM_MD5 for checksum2 */
127 @@ -373,6 +388,21 @@ void file_checksum(const char *fname, const STRUCT_STAT *st_p, char *sum)
136 + for (i = 0; i + CHUNK_SIZE <= len; i += CHUNK_SIZE)
137 + MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, CHUNK_SIZE), CHUNK_SIZE);
139 + remainder = (int32)(len - i);
141 + MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, remainder), remainder);
143 + MD5P8_Final((uchar *)sum, &m5p8);
149 @@ -445,6 +475,7 @@ static union {
153 +static MD5P8_CTX m5p8;
154 #ifdef SUPPORT_XXHASH
155 static XXH64_state_t* xxh64_state;
157 @@ -481,6 +512,9 @@ void sum_init(int csum_type, int seed)
158 XXH3_128bits_reset(xxh3_state);
167 @@ -531,6 +565,9 @@ void sum_update(const char *p, int32 len)
168 XXH3_128bits_update(xxh3_state, p, len);
172 + MD5P8_Update(&m5p8, (uchar *)p, len);
175 MD5_Update(&ctx.m5, (uchar *)p, len);
177 @@ -596,6 +633,9 @@ int sum_end(char *sum)
182 + MD5P8_Final((uchar *)sum, &m5p8);
185 MD5_Final((uchar *)sum, &ctx.m5);
187 diff --git a/generator.c b/generator.c
190 @@ -708,10 +708,12 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
191 if (append_mode > 0 && f_copy < 0)
196 mapbuf = map_file(fd, len, MAX_MAP_SIZE, sum.blength);
198 + checksum2_enable_prefetch(mapbuf, len, sum.blength);
203 for (i = 0; i < sum.count; i++) {
204 int32 n1 = (int32)MIN(len, (OFF_T)sum.blength);
205 @@ -729,7 +731,7 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
208 sum1 = get_checksum1(map, n1);
209 - get_checksum2(map, n1, sum2);
210 + get_checksum2(map, n1, sum2, offset - n1);
212 if (DEBUG_GTE(DELTASUM, 3)) {
214 @@ -741,8 +743,10 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
215 write_buf(f_out, sum2, sum.s2length);
221 + checksum2_disable_prefetch();
226 diff --git a/lib/md-defines.h b/lib/md-defines.h
227 --- a/lib/md-defines.h
228 +++ b/lib/md-defines.h
231 #define CSUM_XXH3_64 7
232 #define CSUM_XXH3_128 8
233 +#define CSUM_MD5P8 9
234 diff --git a/lib/md5p8.c b/lib/md5p8.c
240 + * MD5-based hash friendly to parallel processing, reference implementation
242 + * Author: Jorrit Jongma, 2020
244 + * Released in the public domain falling back to the MIT license
245 + * ( http://www.opensource.org/licenses/MIT ) in case public domain does not
246 + * apply in your country.
249 + * MD5P8 is an MD5-based hash friendly to parallel processing. The input
250 + * stream is divided into 8 independent streams. For each 512 bytes of input,
251 + * the first 64 bytes are sent to the first stream, the second 64 bytes to
252 + * the second stream, etc. The input stream is padded with zeros to the next
253 + * multiple of 512 bytes, then a normal MD5 hash is computed on a buffer
254 + * containing the A, B, C, and D states of the 8 individual streams, followed
255 + * by the (unpadded) length of the input.
257 + * On non-SIMD accelerated CPUs the performance of MD5P8 is slightly lower
258 + * than normal MD5 (particularly on files smaller than 10 kB), but with
259 + * SIMD-based parallel processing it can be two to six times as fast. Even in
260 + * the best-case scenario, xxHash is still at least twice as fast and should
261 + * be preferred when available.
267 +#define MD5P8_Init MD5P8_Init_c
268 +#define MD5P8_Update MD5P8_Update_c
269 +#define MD5P8_Final MD5P8_Final_c
272 +/* each MD5_CTX needs to be 8-byte aligned */
273 +#define MD5P8_Contexts_c(ctx, index) ((MD5_CTX*)((((uintptr_t)((ctx)->context_storage) + 7) & ~7) + (index)*((sizeof(MD5_CTX) + 7) & ~7)))
275 +void MD5P8_Init(MD5P8_CTX *ctx)
278 + for (i = 0; i < 8; i++) {
279 + MD5_Init(MD5P8_Contexts_c(ctx, i));
285 +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
289 + if ((ctx->used) || (length < 64)) {
290 + int cpy = MIN(length, 64 - ctx->used);
291 + memmove(&ctx->buffer[ctx->used], input, cpy);
296 + if (ctx->used == 64) {
297 + MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), ctx->buffer, 64);
299 + ctx->next = (ctx->next + 1) % 8;
303 + while (length >= 64) {
304 + MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), &input[pos], 64);
305 + ctx->next = (ctx->next + 1) % 8;
311 + memcpy(ctx->buffer, &input[pos], length);
312 + ctx->used = length;
316 +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
319 + uint32 low = 0, high = 0, sub = ctx->used ? 64 - ctx->used : 0;
322 + memset(tmp, 0, 64);
323 + MD5P8_Update(ctx, tmp, 64 - ctx->used);
325 + memset(ctx->buffer, 0, 64);
326 + while (ctx->next != 0) {
327 + MD5P8_Update(ctx, ctx->buffer, 64);
331 + uchar state[34*4] = {0};
333 + for (i = 0; i < 8; i++) {
334 + MD5_CTX* md = MD5P8_Contexts_c(ctx, i);
336 + if (low + md->Nl < low) high++;
340 + if (low + md->totalN < low) high++;
342 + high += md->totalN2;
344 + SIVALu(state, i*16, md->A);
345 + SIVALu(state, i*16 + 4, md->B);
346 + SIVALu(state, i*16 + 8, md->C);
347 + SIVALu(state, i*16 + 12, md->D);
351 + high = (low >> 29) | (high << 3);
356 + if (low - sub > low) high--;
359 + SIVALu(state, 32*4, low);
360 + SIVALu(state, 33*4, high);
364 + MD5_Update(&md, state, 34*4);
365 + MD5_Final(digest, &md);
367 diff --git a/lib/mdigest.h b/lib/mdigest.h
370 @@ -27,3 +27,14 @@ void md5_begin(md_context *ctx);
371 void md5_update(md_context *ctx, const uchar *input, uint32 length);
372 void md5_result(md_context *ctx, uchar digest[MD5_DIGEST_LEN]);
376 + uchar context_storage[1024];
382 +void MD5P8_Init(MD5P8_CTX *ctx);
383 +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length);
384 +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
385 diff --git a/match.c b/match.c
388 @@ -164,6 +164,8 @@ static void hash_search(int f,struct sum_struct *s,
389 if (DEBUG_GTE(DELTASUM, 3))
390 rprintf(FINFO, "sum=%.8x k=%ld\n", sum, (long)k);
392 + checksum2_enable_prefetch(buf, len, s->blength);
394 offset = aligned_offset = aligned_i = 0;
396 end = len + 1 - s->sums[s->count-1].len;
397 @@ -226,7 +228,7 @@ static void hash_search(int f,struct sum_struct *s,
400 map = (schar *)map_ptr(buf,offset,l);
401 - get_checksum2((char *)map,l,sum2);
402 + get_checksum2((char *)map, l, sum2, offset);
406 @@ -268,7 +270,7 @@ static void hash_search(int f,struct sum_struct *s,
407 sum = get_checksum1((char *)map, l);
408 if (sum != s->sums[i].sum1)
410 - get_checksum2((char *)map, l, sum2);
411 + get_checksum2((char *)map, l, sum2, aligned_offset);
412 if (memcmp(sum2, s->sums[i].sum2, s->s2length) != 0)
414 /* OK, we have a re-alignment match. Bump the offset
415 @@ -335,6 +337,8 @@ static void hash_search(int f,struct sum_struct *s,
416 matched(f, s, buf, offset - s->blength, -2);
417 } while (++offset < end);
419 + checksum2_disable_prefetch();
421 matched(f, s, buf, len, -1);
422 map_ptr(buf, len-1, 1);
424 diff --git a/simd-checksum-x86_64.cpp b/simd-checksum-x86_64.cpp
425 --- a/simd-checksum-x86_64.cpp
426 +++ b/simd-checksum-x86_64.cpp
428 * use of the target attribute, selecting the fastest code path based on
429 * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
430 * GCC 4.x are not supported to ease configure.ac logic.
434 + * get_checksum2() is optimized for the case where the selected transfer
435 + * checksum is MD5. MD5 can't be made significantly faster with SIMD
436 + * instructions than the assembly version already included but SIMD
437 + * instructions can be used to hash multiple streams in parallel (see
438 + * simd-md5-parallel-x86_64.cpp for details and benchmarks). As rsync's
439 + * block-matching algorithm hashes the blocks independently (in contrast to
440 + * the whole-file checksum) this method can be employed here.
442 + * To prevent needing to modify the core rsync sources significantly, a
443 + * prefetching strategy is used. When a checksum2 is requested, the code
444 + * reads ahead several blocks, creates the MD5 hashes for each block in
445 + * parallel, returns the hash for the first block, and caches the results
446 + * for the other blocks to return in future calls to get_checksum2().
460 #include <immintrin.h>
461 @@ -480,9 +500,235 @@ uint32 get_checksum1(char *buf1, int32 len)
462 return get_checksum1_cpp(buf1, len);
466 +#if !defined(BENCHMARK_SIMD_CHECKSUM1)
468 -#ifdef BENCHMARK_SIMD_CHECKSUM1
469 +// see simd-md5-parallel-x86_64.cpp
470 +extern int md5_parallel_slots();
471 +extern int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4);
473 +#endif /* !BENCHMARK_SIMD_CHECKSUM1 */
475 +#if !defined(BENCHMARK_SIMD_CHECKSUM1) && !defined(BENCHMARK_SIMD_CHECKSUM2)
477 +#define PREFETCH_ENABLE 1 // debugging
480 +#define PREFETCH_PRINTF(f_, ...) printf((f_), ##__VA_ARGS__)
482 +#define PREFETCH_PRINTF(f_, ...) (void)0;
485 +#define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
486 +#define PREFETCH_MAX_BLOCKS 8
492 + char sum[SUM_LENGTH];
496 + struct map_struct *map;
501 + prefetch_sum_t sums[PREFETCH_MAX_BLOCKS];
504 +prefetch_t *prefetch;
506 +extern int xfersum_type;
507 +extern int checksum_seed;
508 +extern int proper_seed_order;
509 +extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);
511 +extern char *map_ptr(struct map_struct *map, OFF_T offset, int32 len);
513 +void checksum2_disable_prefetch()
516 + PREFETCH_PRINTF("checksum2_disable_prefetch\n");
522 +void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
524 +#ifdef PREFETCH_ENABLE
525 + checksum2_disable_prefetch();
526 + int slots = md5_parallel_slots();
527 + if ((xfersum_type == CSUM_MD5 || xfersum_type == CSUM_MD5P8) && slots > 1 && len >= blocklen * PREFETCH_MAX_BLOCKS && blocklen >= PREFETCH_MIN_LEN) {
528 + prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
529 + memset(prefetch, 0, sizeof(prefetch_t));
530 + prefetch->map = map;
531 + prefetch->len = len;
532 + prefetch->last = 0;
533 + prefetch->blocklen = blocklen;
534 + prefetch->blocks = MIN(PREFETCH_MAX_BLOCKS, slots);
535 + PREFETCH_PRINTF("checksum2_enable_prefetch len:%ld blocklen:%d blocks:%d\n", prefetch->len, prefetch->blocklen, prefetch->blocks);
540 +static inline void checksum2_reset_prefetch()
542 + for (int i = 0; i < PREFETCH_MAX_BLOCKS; i++) {
543 + prefetch->sums[i].in_use = 0;
547 +static int get_checksum2_prefetched(int32 len, char* sum, OFF_T prefetch_offset)
549 + if (prefetch->sums[0].in_use) {
550 + if ((prefetch->sums[0].offset == prefetch_offset) && (prefetch->sums[0].len == len)) {
551 + memcpy(sum, prefetch->sums[0].sum, SUM_LENGTH);
552 + for (int i = 0; i < PREFETCH_MAX_BLOCKS - 1; i++) {
553 + prefetch->sums[i] = prefetch->sums[i + 1];
555 + prefetch->sums[PREFETCH_MAX_BLOCKS - 1].in_use = 0;
556 + PREFETCH_PRINTF("checksum2_prefetch HIT len:%d offset:%ld\n", len, prefetch_offset);
559 + // unexpected access, reset cache
560 + PREFETCH_PRINTF("checksum2_prefetch MISS len:%d offset:%ld\n", len, prefetch_offset);
561 + checksum2_reset_prefetch();
567 +static int checksum2_perform_prefetch(OFF_T prefetch_offset)
569 + int blocks = MIN(MAX(1, (prefetch->len + prefetch->blocklen - 1) / prefetch->blocklen), prefetch->blocks);
570 + if (blocks < 2) return 0; // fall through to non-simd, probably faster
574 + for (i = 0; i < blocks; i++) {
575 + prefetch->sums[i].offset = prefetch_offset + total;
576 + prefetch->sums[i].len = MIN(prefetch->blocklen, prefetch->len - prefetch_offset - total);
577 + prefetch->sums[i].in_use = 0;
578 + total += prefetch->sums[i].len;
580 + for (; i < PREFETCH_MAX_BLOCKS; i++) {
581 + prefetch->sums[i].in_use = 0;
585 + SIVALu(seedbuf, 0, checksum_seed);
587 + PREFETCH_PRINTF("checksum2_perform_prefetch pos:%ld len:%d blocks:%d\n", prefetch_offset, total, blocks);
588 + char* mapbuf = map_ptr(prefetch->map, prefetch_offset, total);
589 + char* bufs[PREFETCH_MAX_BLOCKS] = {0};
590 + int lens[PREFETCH_MAX_BLOCKS] = {0};
591 + char* sums[PREFETCH_MAX_BLOCKS] = {0};
592 + for (i = 0; i < blocks; i++) {
593 + bufs[i] = mapbuf + prefetch->sums[i].offset - prefetch_offset;
594 + lens[i] = prefetch->sums[i].len;
595 + sums[i] = prefetch->sums[i].sum;
597 + if (md5_parallel(blocks, bufs, lens, sums, (proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL, (!proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL)) {
598 + for (i = 0; i < blocks; i++) {
599 + prefetch->sums[i].in_use = 1;
603 + // this should never be, abort
604 + PREFETCH_PRINTF("checksum2_perform_prefetch PMD5 ABORT\n");
605 + checksum2_disable_prefetch();
610 +void get_checksum2(char *buf, int32 len, char *sum, OFF_T prefetch_offset)
613 + PREFETCH_PRINTF("get_checksum2 %d @ %ld\n", len, prefetch_offset);
614 + OFF_T last = prefetch->last;
615 + prefetch->last = prefetch_offset;
616 + if ((prefetch_offset != 0) && (prefetch_offset != last + prefetch->blocklen)) {
617 + // we're looking around trying to align blocks, prefetching will slow things down
618 + PREFETCH_PRINTF("get_checksum2 SEEK\n");
619 + checksum2_reset_prefetch();
620 + } else if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
623 + } else if (checksum2_perform_prefetch(prefetch_offset)) {
624 + if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
625 + // hit; should always be as we just fetched this data
628 + // this should never be, abort
629 + PREFETCH_PRINTF("get_checksum2 MISSING DATA ABORT\n");
630 + checksum2_disable_prefetch();
634 + get_checksum2_nosimd(buf, len, sum, prefetch_offset);
636 +#endif /* !BENCHMARK_SIMD_CHECKSUM1 && !BENCHMARK_SIMD_CHECKSUM2 */
640 +/* Benchmark compilation
642 + The get_checksum1() benchmark runs through all available code paths in a
643 + single execution, the get_checksum2()/MD5 and MD5P8 benchmark needs to be
644 + recompiled for each code path (it always uses the fastest path available
645 + on the current CPU otherwise). Note that SSE2/AVX2 MD5 optimizations will
646 + be used when applicable regardless of rsync being built with OpenSSL.
648 + Something like the following should compile and run the benchmarks:
653 + export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti"
658 + export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti -fno-slp-vectorize"
662 + export CONF_EXTRA="--disable-md2man --disable-zstd --disable-lz4 --disable-xxhash"
663 + export CXX_CSUM1="$CXX_BASE simd-checksum-x86_64.cpp"
664 + export CXX_MD5P="$CXX_BASE -c -o simd-md5-parallel-x86_64.o simd-md5-parallel-x86_64.cpp"
665 + export CXX_CSUM2="$CXX_BASE simd-checksum-x86_64.cpp simd-md5-parallel-x86_64.o lib/md5.o lib/md5p8.o lib/md5-asm-x86_64.o"
669 + ./configure --disable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
671 + $CXX -DBENCHMARK_SIMD_CHECKSUM1 $CXX_CSUM1 -o bench_csum1.all
673 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
674 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.asm
676 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_SSE2 $CXX_MD5P
677 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.sse2
679 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_AVX2 $CXX_MD5P
680 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.avx2
682 + ./configure --enable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
684 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
685 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.openssl -lcrypto
689 + ./bench_csum2.openssl
695 +#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
696 #pragma clang optimize off
697 #pragma GCC push_options
698 #pragma GCC optimize ("O0")
699 @@ -493,7 +739,9 @@ uint32 get_checksum1(char *buf1, int32 len)
700 #ifndef CLOCK_MONOTONIC_RAW
701 #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
703 +#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
705 +#ifdef BENCHMARK_SIMD_CHECKSUM1
706 static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
707 struct timespec start, end;
709 @@ -509,7 +757,7 @@ static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int
710 clock_gettime(CLOCK_MONOTONIC_RAW, &end);
711 us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
712 cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
713 - printf("%-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
714 + printf("CSUM1 :: %-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
717 static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
718 @@ -533,10 +781,108 @@ int main() {
722 +#endif /* BENCHMARK_SIMD_CHECKSUM1 */
724 +#ifdef BENCHMARK_SIMD_CHECKSUM2
725 +static void benchmark(const char* desc, void (*func)(char* buf, int32 len, char* sum_out), void (*func2)(char* buf, int32 len, char* sum_out), char* buf, int32 len, int streams) {
726 + struct timespec start, end;
728 + unsigned char cs1[16];
729 + unsigned char cs2[16];
732 + clock_gettime(CLOCK_MONOTONIC_RAW, &start);
733 + for (i = 0; i < ROUNDS; i++) {
734 + func(buf, len, (char*)cs1);
736 + clock_gettime(CLOCK_MONOTONIC_RAW, &end);
737 + us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
739 + func2(buf, len, (char*)cs2);
741 + float perf = us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0;
742 + printf("CSUM2 :: %-7s :: %5.0f to %5.0f MB/s :: ", desc, perf, perf * streams);
743 + for (i = 0; i < 16; i++) {
744 + printf("%02x", cs1[i] & 0xFF);
747 + for (i = 0; i < 16; i++) {
748 + printf("%02x", cs2[i] & 0xFF);
753 +static void benchmark_inner(char* buf, int32 len, char* sum_out) {
754 + // This should produce the same output for different optimizations
755 + // levels, not the same as sanity_check()
757 + char* bufs[8] = {0};
759 + char* sums[8] = {0};
764 + md5_parallel(1, bufs, lens, sums, NULL, NULL);
768 +extern void MD5P8_Init_c(MD5P8_CTX *ctx);
769 +extern void MD5P8_Update_c(MD5P8_CTX *ctx, const uchar *input, uint32 length);
770 +extern void MD5P8_Final_c(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
773 +static void sanity_check(char* buf, int32 len, char* sum_out) {
774 + // This should produce the same output for different optimizations
775 + // levels, not the same as benchmark_inner()
776 + if (md5_parallel_slots() <= 1) {
778 + MD5P8_Init_c(&m5p8);
779 + MD5P8_Update_c(&m5p8, (uchar *)buf, len);
780 + MD5P8_Final_c((uchar *)sum_out, &m5p8);
784 + MD5P8_Update(&m5p8, (uchar *)buf, len);
785 + MD5P8_Final((uchar *)sum_out, &m5p8);
790 + // This benchmarks the parallel MD5 checksum rather than get_checksum2()
791 + // as the latter would require compiling in a lot of rsync's code, but
792 + // it touches all the same internals so the performance should be nearly
796 + char* buf = (char*)malloc(BLOCK_LEN);
797 + for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
799 + const char* method = "?";
800 + switch (md5_parallel_slots()) {
801 + case 8: method = "AVX2"; break;
802 + case 4: method = "SSE2"; break;
804 + case 1: method = "OpenSSL"; break;
805 +#elif (CSUM_CHUNK == 64)
806 + case 1: method = "ASM"; break;
808 + // this won't happen unless you modified code somewhere
809 + case 1: method = "Raw-C"; break;
813 + benchmark(method, benchmark_inner, sanity_check, buf, BLOCK_LEN, md5_parallel_slots());
818 +#endif /* BENCHMARK_SIMD_CHECKSUM2 */
820 +#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
821 #pragma GCC pop_options
822 #pragma clang optimize on
823 -#endif /* BENCHMARK_SIMD_CHECKSUM1 */
824 +#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
826 #endif /* HAVE_SIMD */
827 #endif /* __cplusplus */
828 diff --git a/simd-md5-parallel-x86_64.cpp b/simd-md5-parallel-x86_64.cpp
831 +++ b/simd-md5-parallel-x86_64.cpp
834 + * SSE2/AVX2-optimized routines to process multiple MD5 streams in parallel.
836 + * Original author: Nicolas Noble, 2017
837 + * Modifications: Jorrit Jongma, 2020
839 + * The original code was released in the public domain by the original author,
840 + * falling back to the MIT license ( http://www.opensource.org/licenses/MIT )
841 + * in case public domain does not apply in your country. These modifications
842 + * are likewise released in the public domain, with the same MIT license
845 + * The original publication can be found at:
847 + * https://github.com/nicolasnoble/sse-hash
850 + * Nicolas' original code has been extended to add AVX2 support, all non-SIMD
851 + * MD5 code has been removed and those code paths rerouted to use the MD5
852 + * code already present in rsync, and wrapper functions have been added. The
853 + * MD5P8 code is also new, and is the reason for the new stride parameter.
855 + * This code allows multiple independent MD5 streams to be processed in
856 + * parallel, 4 with SSE2, 8 with AVX2. While single-stream performance is
857 + * lower than that of the original C routines for MD5, the processing of
858 + * additional streams is "for free".
860 + * Single streams are rerouted to rsync's normal MD5 code as that is faster
861 + * for that case. A further optimization is possible by using SSE2 code on
862 + * AVX2-supporting CPUs when the number of streams is 2, 3, or 4. This is not
863 + * implemented here as it would require some restructuring, and in practice
864 + * the code here is only rarely called with fewer than the maximum number of
865 + * streams (typically once at the end of each checksum2'd file).
867 + * Benchmarks (in MB/s) C ASM SSE2*1 SSE2*4 AVX2*1 AVX2*8
868 + * - Intel Atom D2700 302 334 166 664 N/A N/A
869 + * - Intel i7-7700hq 351 376 289 1156 273 2184
870 + * - AMD ThreadRipper 2950x 728 784 568 2272 430 3440
884 +#ifndef BENCHMARK_SIMD_CHECKSUM2
885 +#define PMD5_ALLOW_SSE2 // debugging
886 +#define PMD5_ALLOW_AVX2 // debugging
889 +#ifdef PMD5_ALLOW_AVX2
890 +#ifndef PMD5_ALLOW_SSE2
891 +#define PMD5_ALLOW_SSE2
898 +#include <immintrin.h>
900 +/* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
904 +#define MVSTATIC static
907 +// Missing from the headers on gcc 6 and older, clang 8 and older
908 +typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
909 +typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
911 +#define PMD5_SLOTS_DEFAULT 0
912 +#define PMD5_SLOTS_SSE2 4
913 +#define PMD5_SLOTS_AVX2 8
914 +#define PMD5_SLOTS_MAX PMD5_SLOTS_AVX2
916 +#ifdef PMD5_ALLOW_SSE2
917 +__attribute__ ((target("sse2"))) MVSTATIC int pmd5_slots()
919 + return PMD5_SLOTS_SSE2;
923 +#ifdef PMD5_ALLOW_AVX2
924 +__attribute__ ((target("avx2"))) MVSTATIC int pmd5_slots()
926 + return PMD5_SLOTS_AVX2;
930 +__attribute__ ((target("default"))) MVSTATIC int pmd5_slots()
932 + return PMD5_SLOTS_DEFAULT;
935 +/* The parallel MD5 context structure. */
937 + __m128i state_sse2[4];
938 + __m256i state_avx2[4];
939 + uint64_t len[PMD5_SLOTS_MAX];
942 +/* The status returned by the various functions below. */
946 + PMD5_UNALIGNED_UPDATE,
949 +/* Initializes all slots in the given pmd5 context. */
950 +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx);
952 +/* Initializes a single slot out in the given pmd5 context. */
953 +static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot);
955 +/* Makes an MD5 update on all slots in parallel, given the same exact length on all streams.
956 + The stream pointers will be incremented accordingly.
957 + It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
958 + The argument length NEEDS to be a multiple of 64. If not, an error is returned, and the context is corrupted.
959 + Stride defaults to 64 if 0 is passed. */
960 +static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride);
962 +/* Makes an MD5 update on all slots in parallel, given different lengths.
963 + The stream pointers will be incremented accordingly.
964 + The lengths will be decreased accordingly. Not all data might be consumed.
965 + It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
966 + * The argument lengths NEED to contain only multiples of 64. If not, an error is returned, and the context is corrupted. */
967 +static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]);
969 +/* Finishes all slots at once. Fills in all digests. */
970 +static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]);
972 +/* Finishes one slot. The other slots will be unaffected. The finished slot can then continue to hash garbage using
973 + a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
974 +static pmd5_status pmd5_finish_slot(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot);
976 +/* Finishes one slot. Extra data is allowed to be passed on as an argument. Length DOESN'T need to be a
977 + multiple of 64. The other slots will be unaffected. The finished slot can then continue to hash garbage using
978 + a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
979 +static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length);
981 +/* Insert a normal MD5 context into a given slot of a given parallel MD5 context. */
982 +static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot);
984 +/* Extract a normal MD5 context from a given slot of a given parallel MD5 context. */
985 +static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot);
1004 +#define T1 0xD76AA478
1005 +#define T2 0xE8C7B756
1006 +#define T3 0x242070DB
1007 +#define T4 0xC1BDCEEE
1008 +#define T5 0xF57C0FAF
1009 +#define T6 0x4787C62A
1010 +#define T7 0xA8304613
1011 +#define T8 0xFD469501
1012 +#define T9 0x698098D8
1013 +#define T10 0x8B44F7AF
1014 +#define T11 0xFFFF5BB1
1015 +#define T12 0x895CD7BE
1016 +#define T13 0x6B901122
1017 +#define T14 0xFD987193
1018 +#define T15 0xA679438E
1019 +#define T16 0x49B40821
1020 +#define T17 0xF61E2562
1021 +#define T18 0xC040B340
1022 +#define T19 0x265E5A51
1023 +#define T20 0xE9B6C7AA
1024 +#define T21 0xD62F105D
1025 +#define T22 0x02441453
1026 +#define T23 0xD8A1E681
1027 +#define T24 0xE7D3FBC8
1028 +#define T25 0x21E1CDE6
1029 +#define T26 0xC33707D6
1030 +#define T27 0xF4D50D87
1031 +#define T28 0x455A14ED
1032 +#define T29 0xA9E3E905
1033 +#define T30 0xFCEFA3F8
1034 +#define T31 0x676F02D9
1035 +#define T32 0x8D2A4C8A
1036 +#define T33 0xFFFA3942
1037 +#define T34 0x8771F681
1038 +#define T35 0x6D9D6122
1039 +#define T36 0xFDE5380C
1040 +#define T37 0xA4BEEA44
1041 +#define T38 0x4BDECFA9
1042 +#define T39 0xF6BB4B60
1043 +#define T40 0xBEBFBC70
1044 +#define T41 0x289B7EC6
1045 +#define T42 0xEAA127FA
1046 +#define T43 0xD4EF3085
1047 +#define T44 0x04881D05
1048 +#define T45 0xD9D4D039
1049 +#define T46 0xE6DB99E5
1050 +#define T47 0x1FA27CF8
1051 +#define T48 0xC4AC5665
1052 +#define T49 0xF4292244
1053 +#define T50 0x432AFF97
1054 +#define T51 0xAB9423A7
1055 +#define T52 0xFC93A039
1056 +#define T53 0x655B59C3
1057 +#define T54 0x8F0CCC92
1058 +#define T55 0xFFEFF47D
1059 +#define T56 0x85845DD1
1060 +#define T57 0x6FA87E4F
1061 +#define T58 0xFE2CE6E0
1062 +#define T59 0xA3014314
1063 +#define T60 0x4E0811A1
1064 +#define T61 0xF7537E82
1065 +#define T62 0xBD3AF235
1066 +#define T63 0x2AD7D2BB
1067 +#define T64 0xEB86D391
1069 +#define ROTL_SSE2(x, n) { \
1071 + s = _mm_srli_epi32(x, 32 - n); \
1072 + x = _mm_slli_epi32(x, n); \
1073 + x = _mm_or_si128(x, s); \
1076 +#define ROTL_AVX2(x, n) { \
1078 + s = _mm256_srli_epi32(x, 32 - n); \
1079 + x = _mm256_slli_epi32(x, n); \
1080 + x = _mm256_or_si256(x, s); \
1083 +#define F_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, y), _mm_andnot_si128(x, z))
1084 +#define G_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, z), _mm_andnot_si128(z, y))
1085 +#define H_SSE2(x, y, z) _mm_xor_si128(_mm_xor_si128(x, y), z)
1086 +#define I_SSE2(x, y, z) _mm_xor_si128(y, _mm_or_si128(x, _mm_andnot_si128(z, _mm_set1_epi32(0xffffffff))))
1088 +#define F_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, y), _mm256_andnot_si256(x, z))
1089 +#define G_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, z), _mm256_andnot_si256(z, y))
1090 +#define H_AVX2(x, y, z) _mm256_xor_si256(_mm256_xor_si256(x, y), z)
1091 +#define I_AVX2(x, y, z) _mm256_xor_si256(y, _mm256_or_si256(x, _mm256_andnot_si256(z, _mm256_set1_epi32(0xffffffff))))
1093 +#define SET_SSE2(step, a, b, c, d, x, s, ac) { \
1094 + a = _mm_add_epi32(_mm_add_epi32(a, _mm_add_epi32(x, _mm_set1_epi32(T##ac))), step##_SSE2(b, c, d)); \
1095 + ROTL_SSE2(a, s); \
1096 + a = _mm_add_epi32(a, b); \
1099 +#define SET_AVX2(step, a, b, c, d, x, s, ac) { \
1100 + a = _mm256_add_epi32(_mm256_add_epi32(a, _mm256_add_epi32(x, _mm256_set1_epi32(T##ac))), step##_AVX2(b, c, d)); \
1101 + ROTL_AVX2(a, s); \
1102 + a = _mm256_add_epi32(a, b); \
1105 +#define IA 0x67452301
1106 +#define IB 0xefcdab89
1107 +#define IC 0x98badcfe
1108 +#define ID 0x10325476
1110 +#define GET_MD5_DATA(dest, src, pos) \
1112 + ((uint32_t) src[pos + 0]) << 0 | \
1113 + ((uint32_t) src[pos + 1]) << 8 | \
1114 + ((uint32_t) src[pos + 2]) << 16 | \
1115 + ((uint32_t) src[pos + 3]) << 24
1117 +#define GET_PMD5_DATA_SSE2(dest, src, pos) { \
1118 + uint32_t v0, v1, v2, v3; \
1119 + GET_MD5_DATA(v0, src[0], pos); \
1120 + GET_MD5_DATA(v1, src[1], pos); \
1121 + GET_MD5_DATA(v2, src[2], pos); \
1122 + GET_MD5_DATA(v3, src[3], pos); \
1123 + dest = _mm_setr_epi32(v0, v1, v2, v3); \
1126 +#define GET_PMD5_DATA_AVX2(dest, src, pos) { \
1127 + uint32_t v0, v1, v2, v3; \
1128 + uint32_t v4, v5, v6, v7; \
1129 + GET_MD5_DATA(v0, src[0], pos); \
1130 + GET_MD5_DATA(v1, src[1], pos); \
1131 + GET_MD5_DATA(v2, src[2], pos); \
1132 + GET_MD5_DATA(v3, src[3], pos); \
1133 + GET_MD5_DATA(v4, src[4], pos); \
1134 + GET_MD5_DATA(v5, src[5], pos); \
1135 + GET_MD5_DATA(v6, src[6], pos); \
1136 + GET_MD5_DATA(v7, src[7], pos); \
1137 + dest = _mm256_setr_epi32(v0, v1, v2, v3, \
1138 + v4, v5, v6, v7); \
1141 +#define PUT_MD5_DATA(dest, val, pos) { \
1142 + dest[pos + 0] = (val >> 0) & 0xff; \
1143 + dest[pos + 1] = (val >> 8) & 0xff; \
1144 + dest[pos + 2] = (val >> 16) & 0xff; \
1145 + dest[pos + 3] = (val >> 24) & 0xff; \
1148 +const static uint8_t md5_padding[64] = {
1149 + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1150 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1151 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1152 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1153 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1154 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1155 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1156 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1159 +#ifdef PMD5_ALLOW_SSE2
1160 +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
1163 + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
1167 + ctx->state_sse2[0] = _mm_set1_epi32(IA);
1168 + ctx->state_sse2[1] = _mm_set1_epi32(IB);
1169 + ctx->state_sse2[2] = _mm_set1_epi32(IC);
1170 + ctx->state_sse2[3] = _mm_set1_epi32(ID);
1172 + return PMD5_SUCCESS;
1176 +#ifdef PMD5_ALLOW_AVX2
1177 +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
1180 + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
1184 + ctx->state_avx2[0] = _mm256_set1_epi32(IA);
1185 + ctx->state_avx2[1] = _mm256_set1_epi32(IB);
1186 + ctx->state_avx2[2] = _mm256_set1_epi32(IC);
1187 + ctx->state_avx2[3] = _mm256_set1_epi32(ID);
1189 + return PMD5_SUCCESS;
1193 +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
1195 + return PMD5_INVALID_SLOT;
1198 +#ifdef PMD5_ALLOW_SSE2
1199 +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
1201 + if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
1202 + return PMD5_INVALID_SLOT;
1204 + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
1207 + for (i = 0; i < 4; i++) {
1208 + _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
1216 + for (i = 0; i < 4; i++) {
1217 + ctx->state_sse2[i] = _mm_loadu_si128((__m128i_u*)v[i]);
1220 + return PMD5_SUCCESS;
1224 +#ifdef PMD5_ALLOW_AVX2
1225 +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
1227 + if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
1228 + return PMD5_INVALID_SLOT;
1230 + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
1233 + for (i = 0; i < 4; i++) {
1234 + _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
1242 + for (i = 0; i < 4; i++) {
1243 + ctx->state_avx2[i] = _mm256_lddqu_si256((__m256i_u*)v[i]);
1246 + return PMD5_SUCCESS;
1250 +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
1252 + return PMD5_INVALID_SLOT;
1255 +#ifdef PMD5_ALLOW_SSE2
1256 +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
1258 + if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
1259 + return PMD5_INVALID_SLOT;
1261 + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
1264 + for (i = 0; i < 4; i++) {
1265 + _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
1273 + return PMD5_SUCCESS;
1277 +#ifdef PMD5_ALLOW_AVX2
1278 +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
1280 + if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
1281 + return PMD5_INVALID_SLOT;
1283 + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
1286 + for (i = 0; i < 4; i++) {
1287 + _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
1295 + return PMD5_SUCCESS;
1299 +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
1301 + return PMD5_INVALID_SLOT;
1304 +static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot)
1306 + return pmd5_set_slot(ctx, slot, IA, IB, IC, ID);
1309 +#ifdef PMD5_ALLOW_SSE2
1310 +__attribute__ ((target("sse2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
1312 + __m128i W[MD5_DIGEST_LEN], a, b, c, d;
1314 + GET_PMD5_DATA_SSE2(W[ 0], data, 0);
1315 + GET_PMD5_DATA_SSE2(W[ 1], data, 4);
1316 + GET_PMD5_DATA_SSE2(W[ 2], data, 8);
1317 + GET_PMD5_DATA_SSE2(W[ 3], data, 12);
1318 + GET_PMD5_DATA_SSE2(W[ 4], data, 16);
1319 + GET_PMD5_DATA_SSE2(W[ 5], data, 20);
1320 + GET_PMD5_DATA_SSE2(W[ 6], data, 24);
1321 + GET_PMD5_DATA_SSE2(W[ 7], data, 28);
1322 + GET_PMD5_DATA_SSE2(W[ 8], data, 32);
1323 + GET_PMD5_DATA_SSE2(W[ 9], data, 36);
1324 + GET_PMD5_DATA_SSE2(W[10], data, 40);
1325 + GET_PMD5_DATA_SSE2(W[11], data, 44);
1326 + GET_PMD5_DATA_SSE2(W[12], data, 48);
1327 + GET_PMD5_DATA_SSE2(W[13], data, 52);
1328 + GET_PMD5_DATA_SSE2(W[14], data, 56);
1329 + GET_PMD5_DATA_SSE2(W[15], data, 60);
1331 + a = ctx->state_sse2[0];
1332 + b = ctx->state_sse2[1];
1333 + c = ctx->state_sse2[2];
1334 + d = ctx->state_sse2[3];
1336 + SET_SSE2(F, a, b, c, d, W[ 0], S11, 1);
1337 + SET_SSE2(F, d, a, b, c, W[ 1], S12, 2);
1338 + SET_SSE2(F, c, d, a, b, W[ 2], S13, 3);
1339 + SET_SSE2(F, b, c, d, a, W[ 3], S14, 4);
1340 + SET_SSE2(F, a, b, c, d, W[ 4], S11, 5);
1341 + SET_SSE2(F, d, a, b, c, W[ 5], S12, 6);
1342 + SET_SSE2(F, c, d, a, b, W[ 6], S13, 7);
1343 + SET_SSE2(F, b, c, d, a, W[ 7], S14, 8);
1344 + SET_SSE2(F, a, b, c, d, W[ 8], S11, 9);
1345 + SET_SSE2(F, d, a, b, c, W[ 9], S12, 10);
1346 + SET_SSE2(F, c, d, a, b, W[10], S13, 11);
1347 + SET_SSE2(F, b, c, d, a, W[11], S14, 12);
1348 + SET_SSE2(F, a, b, c, d, W[12], S11, 13);
1349 + SET_SSE2(F, d, a, b, c, W[13], S12, 14);
1350 + SET_SSE2(F, c, d, a, b, W[14], S13, 15);
1351 + SET_SSE2(F, b, c, d, a, W[15], S14, 16);
1353 + SET_SSE2(G, a, b, c, d, W[ 1], S21, 17);
1354 + SET_SSE2(G, d, a, b, c, W[ 6], S22, 18);
1355 + SET_SSE2(G, c, d, a, b, W[11], S23, 19);
1356 + SET_SSE2(G, b, c, d, a, W[ 0], S24, 20);
1357 + SET_SSE2(G, a, b, c, d, W[ 5], S21, 21);
1358 + SET_SSE2(G, d, a, b, c, W[10], S22, 22);
1359 + SET_SSE2(G, c, d, a, b, W[15], S23, 23);
1360 + SET_SSE2(G, b, c, d, a, W[ 4], S24, 24);
1361 + SET_SSE2(G, a, b, c, d, W[ 9], S21, 25);
1362 + SET_SSE2(G, d, a, b, c, W[14], S22, 26);
1363 + SET_SSE2(G, c, d, a, b, W[ 3], S23, 27);
1364 + SET_SSE2(G, b, c, d, a, W[ 8], S24, 28);
1365 + SET_SSE2(G, a, b, c, d, W[13], S21, 29);
1366 + SET_SSE2(G, d, a, b, c, W[ 2], S22, 30);
1367 + SET_SSE2(G, c, d, a, b, W[ 7], S23, 31);
1368 + SET_SSE2(G, b, c, d, a, W[12], S24, 32);
1370 + SET_SSE2(H, a, b, c, d, W[ 5], S31, 33);
1371 + SET_SSE2(H, d, a, b, c, W[ 8], S32, 34);
1372 + SET_SSE2(H, c, d, a, b, W[11], S33, 35);
1373 + SET_SSE2(H, b, c, d, a, W[14], S34, 36);
1374 + SET_SSE2(H, a, b, c, d, W[ 1], S31, 37);
1375 + SET_SSE2(H, d, a, b, c, W[ 4], S32, 38);
1376 + SET_SSE2(H, c, d, a, b, W[ 7], S33, 39);
1377 + SET_SSE2(H, b, c, d, a, W[10], S34, 40);
1378 + SET_SSE2(H, a, b, c, d, W[13], S31, 41);
1379 + SET_SSE2(H, d, a, b, c, W[ 0], S32, 42);
1380 + SET_SSE2(H, c, d, a, b, W[ 3], S33, 43);
1381 + SET_SSE2(H, b, c, d, a, W[ 6], S34, 44);
1382 + SET_SSE2(H, a, b, c, d, W[ 9], S31, 45);
1383 + SET_SSE2(H, d, a, b, c, W[12], S32, 46);
1384 + SET_SSE2(H, c, d, a, b, W[15], S33, 47);
1385 + SET_SSE2(H, b, c, d, a, W[ 2], S34, 48);
1387 + SET_SSE2(I, a, b, c, d, W[ 0], S41, 49);
1388 + SET_SSE2(I, d, a, b, c, W[ 7], S42, 50);
1389 + SET_SSE2(I, c, d, a, b, W[14], S43, 51);
1390 + SET_SSE2(I, b, c, d, a, W[ 5], S44, 52);
1391 + SET_SSE2(I, a, b, c, d, W[12], S41, 53);
1392 + SET_SSE2(I, d, a, b, c, W[ 3], S42, 54);
1393 + SET_SSE2(I, c, d, a, b, W[10], S43, 55);
1394 + SET_SSE2(I, b, c, d, a, W[ 1], S44, 56);
1395 + SET_SSE2(I, a, b, c, d, W[ 8], S41, 57);
1396 + SET_SSE2(I, d, a, b, c, W[15], S42, 58);
1397 + SET_SSE2(I, c, d, a, b, W[ 6], S43, 59);
1398 + SET_SSE2(I, b, c, d, a, W[13], S44, 60);
1399 + SET_SSE2(I, a, b, c, d, W[ 4], S41, 61);
1400 + SET_SSE2(I, d, a, b, c, W[11], S42, 62);
1401 + SET_SSE2(I, c, d, a, b, W[ 2], S43, 63);
1402 + SET_SSE2(I, b, c, d, a, W[ 9], S44, 64);
1404 + ctx->state_sse2[0] = _mm_add_epi32(ctx->state_sse2[0], a);
1405 + ctx->state_sse2[1] = _mm_add_epi32(ctx->state_sse2[1], b);
1406 + ctx->state_sse2[2] = _mm_add_epi32(ctx->state_sse2[2], c);
1407 + ctx->state_sse2[3] = _mm_add_epi32(ctx->state_sse2[3], d);
1411 +#ifdef PMD5_ALLOW_AVX2
1412 +__attribute__ ((target("avx2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
1414 + __m256i W[MD5_DIGEST_LEN], a, b, c, d;
1416 + GET_PMD5_DATA_AVX2(W[ 0], data, 0);
1417 + GET_PMD5_DATA_AVX2(W[ 1], data, 4);
1418 + GET_PMD5_DATA_AVX2(W[ 2], data, 8);
1419 + GET_PMD5_DATA_AVX2(W[ 3], data, 12);
1420 + GET_PMD5_DATA_AVX2(W[ 4], data, 16);
1421 + GET_PMD5_DATA_AVX2(W[ 5], data, 20);
1422 + GET_PMD5_DATA_AVX2(W[ 6], data, 24);
1423 + GET_PMD5_DATA_AVX2(W[ 7], data, 28);
1424 + GET_PMD5_DATA_AVX2(W[ 8], data, 32);
1425 + GET_PMD5_DATA_AVX2(W[ 9], data, 36);
1426 + GET_PMD5_DATA_AVX2(W[10], data, 40);
1427 + GET_PMD5_DATA_AVX2(W[11], data, 44);
1428 + GET_PMD5_DATA_AVX2(W[12], data, 48);
1429 + GET_PMD5_DATA_AVX2(W[13], data, 52);
1430 + GET_PMD5_DATA_AVX2(W[14], data, 56);
1431 + GET_PMD5_DATA_AVX2(W[15], data, 60);
1433 + a = ctx->state_avx2[0];
1434 + b = ctx->state_avx2[1];
1435 + c = ctx->state_avx2[2];
1436 + d = ctx->state_avx2[3];
1438 + SET_AVX2(F, a, b, c, d, W[ 0], S11, 1);
1439 + SET_AVX2(F, d, a, b, c, W[ 1], S12, 2);
1440 + SET_AVX2(F, c, d, a, b, W[ 2], S13, 3);
1441 + SET_AVX2(F, b, c, d, a, W[ 3], S14, 4);
1442 + SET_AVX2(F, a, b, c, d, W[ 4], S11, 5);
1443 + SET_AVX2(F, d, a, b, c, W[ 5], S12, 6);
1444 + SET_AVX2(F, c, d, a, b, W[ 6], S13, 7);
1445 + SET_AVX2(F, b, c, d, a, W[ 7], S14, 8);
1446 + SET_AVX2(F, a, b, c, d, W[ 8], S11, 9);
1447 + SET_AVX2(F, d, a, b, c, W[ 9], S12, 10);
1448 + SET_AVX2(F, c, d, a, b, W[10], S13, 11);
1449 + SET_AVX2(F, b, c, d, a, W[11], S14, 12);
1450 + SET_AVX2(F, a, b, c, d, W[12], S11, 13);
1451 + SET_AVX2(F, d, a, b, c, W[13], S12, 14);
1452 + SET_AVX2(F, c, d, a, b, W[14], S13, 15);
1453 + SET_AVX2(F, b, c, d, a, W[15], S14, 16);
1455 + SET_AVX2(G, a, b, c, d, W[ 1], S21, 17);
1456 + SET_AVX2(G, d, a, b, c, W[ 6], S22, 18);
1457 + SET_AVX2(G, c, d, a, b, W[11], S23, 19);
1458 + SET_AVX2(G, b, c, d, a, W[ 0], S24, 20);
1459 + SET_AVX2(G, a, b, c, d, W[ 5], S21, 21);
1460 + SET_AVX2(G, d, a, b, c, W[10], S22, 22);
1461 + SET_AVX2(G, c, d, a, b, W[15], S23, 23);
1462 + SET_AVX2(G, b, c, d, a, W[ 4], S24, 24);
1463 + SET_AVX2(G, a, b, c, d, W[ 9], S21, 25);
1464 + SET_AVX2(G, d, a, b, c, W[14], S22, 26);
1465 + SET_AVX2(G, c, d, a, b, W[ 3], S23, 27);
1466 + SET_AVX2(G, b, c, d, a, W[ 8], S24, 28);
1467 + SET_AVX2(G, a, b, c, d, W[13], S21, 29);
1468 + SET_AVX2(G, d, a, b, c, W[ 2], S22, 30);
1469 + SET_AVX2(G, c, d, a, b, W[ 7], S23, 31);
1470 + SET_AVX2(G, b, c, d, a, W[12], S24, 32);
1472 + SET_AVX2(H, a, b, c, d, W[ 5], S31, 33);
1473 + SET_AVX2(H, d, a, b, c, W[ 8], S32, 34);
1474 + SET_AVX2(H, c, d, a, b, W[11], S33, 35);
1475 + SET_AVX2(H, b, c, d, a, W[14], S34, 36);
1476 + SET_AVX2(H, a, b, c, d, W[ 1], S31, 37);
1477 + SET_AVX2(H, d, a, b, c, W[ 4], S32, 38);
1478 + SET_AVX2(H, c, d, a, b, W[ 7], S33, 39);
1479 + SET_AVX2(H, b, c, d, a, W[10], S34, 40);
1480 + SET_AVX2(H, a, b, c, d, W[13], S31, 41);
1481 + SET_AVX2(H, d, a, b, c, W[ 0], S32, 42);
1482 + SET_AVX2(H, c, d, a, b, W[ 3], S33, 43);
1483 + SET_AVX2(H, b, c, d, a, W[ 6], S34, 44);
1484 + SET_AVX2(H, a, b, c, d, W[ 9], S31, 45);
1485 + SET_AVX2(H, d, a, b, c, W[12], S32, 46);
1486 + SET_AVX2(H, c, d, a, b, W[15], S33, 47);
1487 + SET_AVX2(H, b, c, d, a, W[ 2], S34, 48);
1489 + SET_AVX2(I, a, b, c, d, W[ 0], S41, 49);
1490 + SET_AVX2(I, d, a, b, c, W[ 7], S42, 50);
1491 + SET_AVX2(I, c, d, a, b, W[14], S43, 51);
1492 + SET_AVX2(I, b, c, d, a, W[ 5], S44, 52);
1493 + SET_AVX2(I, a, b, c, d, W[12], S41, 53);
1494 + SET_AVX2(I, d, a, b, c, W[ 3], S42, 54);
1495 + SET_AVX2(I, c, d, a, b, W[10], S43, 55);
1496 + SET_AVX2(I, b, c, d, a, W[ 1], S44, 56);
1497 + SET_AVX2(I, a, b, c, d, W[ 8], S41, 57);
1498 + SET_AVX2(I, d, a, b, c, W[15], S42, 58);
1499 + SET_AVX2(I, c, d, a, b, W[ 6], S43, 59);
1500 + SET_AVX2(I, b, c, d, a, W[13], S44, 60);
1501 + SET_AVX2(I, a, b, c, d, W[ 4], S41, 61);
1502 + SET_AVX2(I, d, a, b, c, W[11], S42, 62);
1503 + SET_AVX2(I, c, d, a, b, W[ 2], S43, 63);
1504 + SET_AVX2(I, b, c, d, a, W[ 9], S44, 64);
1506 + ctx->state_avx2[0] = _mm256_add_epi32(ctx->state_avx2[0], a);
1507 + ctx->state_avx2[1] = _mm256_add_epi32(ctx->state_avx2[1], b);
1508 + ctx->state_avx2[2] = _mm256_add_epi32(ctx->state_avx2[2], c);
1509 + ctx->state_avx2[3] = _mm256_add_epi32(ctx->state_avx2[3], d);
1513 +__attribute__ ((target("default"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
1517 +static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride)
1519 + const uint8_t * ptrs[PMD5_SLOTS_MAX];
1521 + if (!length) return PMD5_SUCCESS;
1523 + int slots = pmd5_slots();
1525 + if (!stride) stride = 64;
1528 + for (i = 0; i < slots; i++) {
1529 + ptrs[i] = data[i];
1530 + ctx->len[i] += length;
1531 + if (!ptrs[i]) ptrs[i] = md5_padding;
1534 + while (length >= 64) {
1535 + pmd5_process(ctx, ptrs);
1537 + for (i = 0; i < slots; i++) {
1538 + if (data[i]) ptrs[i] += stride;
1542 + if (length) return PMD5_UNALIGNED_UPDATE;
1544 + for (i = 0; i < slots; i++) {
1545 + if (data[i]) data[i] = ptrs[i];
1548 + return PMD5_SUCCESS;
1551 +static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX])
1553 + uint64_t length = 0;
1554 + int slots = pmd5_slots();
1557 + for (i = 0; i < slots; i++) {
1558 + if ((length == 0) || (lengths[i] < length)) length = lengths[i];
1561 + for (i = 0; i < slots; i++) {
1562 + lengths[i] -= length;
1565 + return pmd5_update_all_simple(ctx, data, length, 0);
1568 +static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length)
1572 + if ((slot >= pmd5_slots()) || (slot < 0))
1573 + return PMD5_INVALID_SLOT;
1575 + pmd5_to_md5(pctx, &ctx, slot);
1576 + if (data && length) {
1577 + MD5_Update(&ctx, data, length);
1579 + MD5_Final(digest, &ctx);
1581 + return PMD5_SUCCESS;
1584 +static pmd5_status pmd5_finish_slot(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot)
1586 + return pmd5_finish_slot_with_extra(pctx, digest, slot, NULL, 0);
1589 +static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN])
1592 + for (i = 0; i < pmd5_slots(); i++) {
1593 + pmd5_finish_slot_with_extra(ctx, digests[i], i, NULL, 0);
1595 + return PMD5_SUCCESS;
1598 +static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot)
1600 + if ((slot >= pmd5_slots()) || (slot < 0))
1601 + return PMD5_INVALID_SLOT;
1603 + // TODO: This function ignores buffered but as-yet unhashed data. We're not using this function; just noting.
1606 + pctx->len[slot] = (ctx->Nl >> 3) + ((uint64_t)ctx->Nh << 29);
1608 + pctx->len[slot] = ctx->totalN + ((uint64_t)ctx->totalN2 << 32);
1610 + return pmd5_set_slot(pctx, slot, (uint32_t)ctx->A, (uint32_t)ctx->B, (uint32_t)ctx->C, (uint32_t)ctx->D);
1613 +static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot)
1615 + if ((slot >= pmd5_slots()) || (slot < 0))
1616 + return PMD5_INVALID_SLOT;
1621 + ctx->Nl = (pctx->len[slot] << 3) & 0xFFFFFFFF;
1622 + ctx->Nh = pctx->len[slot] >> 29;
1624 + uint32_t a, b, c, d;
1625 + pmd5_status ret = pmd5_get_slot(pctx, slot, &a, &b, &c, &d);
1626 + if (ret == PMD5_SUCCESS) {
1634 + ctx->totalN = pctx->len[slot] & 0xFFFFFFFF;
1635 + ctx->totalN2 = pctx->len[slot] >> 32;
1636 + return pmd5_get_slot(pctx, slot, &ctx->A, &ctx->B, &ctx->C, &ctx->D);
1640 +/* With GCC 10 putting these implementations inside 'extern "C"' causes an
1641 + assembler error. That worked fine on GCC 5-9 and clang 6-10...
1644 +static inline int md5_parallel_slots_cpp()
1646 + int slots = pmd5_slots();
1647 + if (slots == 0) return 1;
1651 +static inline int md5_parallel_cpp(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
1653 + int slots = md5_parallel_slots_cpp();
1654 + if ((streams < 1) || (streams > slots)) return 0;
1655 + if (pre4 && post4) return 0;
1661 + MD5_Update(&ctx, (const unsigned char*)pre4, 4);
1663 + MD5_Update(&ctx, (const unsigned char*)buf[0], len[0]);
1665 + MD5_Update(&ctx, (const unsigned char*)post4, 4);
1668 + MD5_Final((uint8_t*)sum[0], &ctx);
1674 + int active[PMD5_SLOTS_MAX];
1675 + char* buffers[PMD5_SLOTS_MAX];
1676 + uint64_t left[PMD5_SLOTS_MAX];
1677 + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
1678 + active[i] = streams > i;
1679 + if (i < streams) {
1680 + buffers[i] = buf[i];
1681 + left[i] = (uint64_t)len[i];
1683 + buffers[i] = NULL;
1687 + MD5_CTX results[PMD5_SLOTS_MAX];
1689 + pmd5_context ctx_simd;
1690 + if (pmd5_init_all(&ctx_simd) != PMD5_SUCCESS) return 0;
1693 + char temp_buffers[PMD5_SLOTS_MAX][64];
1695 + for (i = 0; i < slots; i++) {
1697 + if (left[i] < 60) {
1698 + MD5_Init(&results[i]);
1699 + MD5_Update(&results[i], (const unsigned char*)pre4, 4);
1700 + MD5_Update(&results[i], (const unsigned char*)buf[i], left[i]);
1704 + memcpy(temp_buffers[i], pre4, 4);
1705 + memcpy(temp_buffers[i] + 4, buffers[i], 60);
1714 + char* ptrs[PMD5_SLOTS_MAX];
1715 + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
1716 + ptrs[i] = &temp_buffers[i][0];
1718 + if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)ptrs, 64, 0) != PMD5_SUCCESS) {
1726 + for (i = 0; i < slots; i++) {
1727 + if (active[i] && (left[i] < 64)) {
1728 + if (pmd5_to_md5(&ctx_simd, &results[i], i) != PMD5_SUCCESS) {
1735 + uint64_t shortest = 0;
1736 + for (i = 0; i < slots; i++) {
1738 + buffers[i] = NULL;
1739 + } else if ((shortest == 0) || (left[i] < shortest)) {
1740 + shortest = left[i];
1744 + if (shortest > 0) {
1745 + shortest = shortest & ~63;
1746 + if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)buffers, shortest, 0) != PMD5_SUCCESS) {
1749 + for (i = 0; i < slots; i++) {
1751 + left[i] -= shortest;
1760 + for (i = 0; i < slots; i++) {
1761 + have_any |= active[i];
1769 + for (i = 0; i < slots; i++) {
1770 + if (i < streams) {
1771 + if (left[i] > 0) {
1772 + // buffers[i] == NULL here
1773 + MD5_Update(&results[i], (const unsigned char*)buf[i] + len[i] - left[i], left[i]);
1776 + MD5_Update(&results[i], (const unsigned char*)post4, 4);
1779 + MD5_Final((uint8_t*)sum[i], &results[i]);
1787 +// each pmd5_context needs to be 32-byte aligned
1788 +#define MD5P8_Contexts_simd(ctx, index) ((pmd5_context*)((((uintptr_t)((ctx)->context_storage) + 31) & ~31) + (index)*((sizeof(pmd5_context) + 31) & ~31)))
1790 +static inline void MD5P8_Init_cpp(MD5P8_CTX *ctx)
1793 + for (i = 0; i < (pmd5_slots() == PMD5_SLOTS_AVX2 ? 1 : 2); i++) {
1794 + pmd5_init_all(MD5P8_Contexts_simd(ctx, i));
1800 +static inline void MD5P8_Update_cpp(MD5P8_CTX *ctx, const uchar *input, uint32 length)
1802 + int slots = pmd5_slots();
1805 + if ((ctx->used) || (length < 512)) {
1806 + int cpy = MIN(length, 512 - ctx->used);
1807 + memcpy(&ctx->buffer[ctx->used], input, cpy);
1812 + if (ctx->used == 512) {
1813 + if (slots == PMD5_SLOTS_AVX2) {
1814 + const uint8_t* ptrs[PMD5_SLOTS_MAX] = {
1815 + (uint8_t*)ctx->buffer,
1816 + (uint8_t*)(ctx->buffer + 64),
1817 + (uint8_t*)(ctx->buffer + 128),
1818 + (uint8_t*)(ctx->buffer + 192),
1819 + (uint8_t*)(ctx->buffer + 256),
1820 + (uint8_t*)(ctx->buffer + 320),
1821 + (uint8_t*)(ctx->buffer + 384),
1822 + (uint8_t*)(ctx->buffer + 448)
1824 + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, 64, 0);
1826 + const uint8_t* ptrs1[PMD5_SLOTS_MAX] = {
1827 + (uint8_t*)ctx->buffer,
1828 + (uint8_t*)(ctx->buffer + 64),
1829 + (uint8_t*)(ctx->buffer + 128),
1830 + (uint8_t*)(ctx->buffer + 192)
1832 + const uint8_t* ptrs2[PMD5_SLOTS_MAX] = {
1833 + (uint8_t*)(ctx->buffer + 256),
1834 + (uint8_t*)(ctx->buffer + 320),
1835 + (uint8_t*)(ctx->buffer + 384),
1836 + (uint8_t*)(ctx->buffer + 448)
1838 + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, 64, 0);
1839 + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, 64, 0);
1845 + if (length >= 512) {
1846 + uint32 blocks = length / 512;
1847 + if (slots == PMD5_SLOTS_AVX2) {
1848 + const uint8_t* ptrs[8] = {
1849 + (uint8_t*)(input + pos),
1850 + (uint8_t*)(input + pos + 64),
1851 + (uint8_t*)(input + pos + 128),
1852 + (uint8_t*)(input + pos + 192),
1853 + (uint8_t*)(input + pos + 256),
1854 + (uint8_t*)(input + pos + 320),
1855 + (uint8_t*)(input + pos + 384),
1856 + (uint8_t*)(input + pos + 448)
1858 + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, blocks * 64, 512);
1860 + const uint8_t* ptrs1[4] = {
1861 + (uint8_t*)(input + pos),
1862 + (uint8_t*)(input + pos + 64),
1863 + (uint8_t*)(input + pos + 128),
1864 + (uint8_t*)(input + pos + 192)
1866 + const uint8_t* ptrs2[4] = {
1867 + (uint8_t*)(input + pos + 256),
1868 + (uint8_t*)(input + pos + 320),
1869 + (uint8_t*)(input + pos + 384),
1870 + (uint8_t*)(input + pos + 448)
1872 + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, blocks * 64, 512);
1873 + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, blocks * 64, 512);
1875 + pos += blocks * 512;
1876 + length -= blocks * 512;
1880 + memcpy(ctx->buffer, &input[pos], length);
1881 + ctx->used = length;
1885 +static inline void MD5P8_Final_cpp(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
1888 + uint32 low = 0, high = 0, sub = ctx->used ? 512 - ctx->used : 0;
1891 + memset(tmp, 0, 512);
1892 + MD5P8_Update(ctx, tmp, 512 - ctx->used);
1895 + uchar state[34*4] = {0};
1898 + for (i = 0; i < 8; i++) {
1899 + if (pmd5_slots() == PMD5_SLOTS_AVX2) {
1900 + pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
1901 + } else if (i < 4) {
1902 + pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
1904 + pmd5_to_md5(MD5P8_Contexts_simd(ctx, 1), &tmp, i - 4);
1907 + if (low + tmp.Nl < low) high++;
1911 + if (low + tmp.totalN < low) high++;
1912 + low += tmp.totalN;
1913 + high += tmp.totalN2;
1915 + SIVALu(state, i*16, tmp.A);
1916 + SIVALu(state, i*16 + 4, tmp.B);
1917 + SIVALu(state, i*16 + 8, tmp.C);
1918 + SIVALu(state, i*16 + 12, tmp.D);
1921 +#ifndef USE_OPENSSL
1922 + high = (low >> 29) | (high << 3);
1927 + if (low - sub > low) high--;
1930 + SIVALu(state, 32*4, low);
1931 + SIVALu(state, 33*4, high);
1935 + MD5_Update(&md, state, 34*4);
1936 + MD5_Final(digest, &md);
1941 +int md5_parallel_slots()
1943 + return md5_parallel_slots_cpp();
1946 +int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
1948 + return md5_parallel_cpp(streams, buf, len, sum, pre4, post4);
1951 +void MD5P8_Init(MD5P8_CTX *ctx)
1953 + MD5P8_Init_cpp(ctx);
1956 +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
1958 + MD5P8_Update_cpp(ctx, input, length);
1961 +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
1963 + MD5P8_Final_cpp(digest, ctx);
1968 +#endif /* HAVE_SIMD */
1969 +#endif /* __cplusplus */
1970 +#endif /* __x86_64__ */