1 From: Jorrit Jongma <git@jongma.org>
3 - MD5 optimization in block matching phase:
5 MD5 hashes computed during rsync's block matching phase are independent
6 and thus possible to process in parallel. This code processes 4 blocks
7 in parallel if SSE2 is available, or 8 if AVX2 is available. An increase
8 of performance (or decrease of CPU usage) of up to 6x has been measured.
10 A prefetching algorithm is used to predict and load upcoming blocks, as
11 this prevents the need for extensive modifications to other parts of
12 the rsync sources to get this working.
14 This remains compatible with existing rsync builds using MD5 checksums.
16 - MD5P8 whole-file checksum:
18 Splits the input up into 8 independent streams (64-byte interleave), and
19 produces a final checksum based on the end state of those 8 streams. If
20 parallelization of MD5 hashing is available, the performance gain (or
21 CPU usage decrease) is 2x to 6x compared to traditional MD5.
23 The rsync versions on both ends of the connection need MD5P8 support
24 built-in for it to be used.
26 xxHash is still preferred (and faster), but this provides a reasonably
27 fast fallback for the case where xxHash libraries are not available at build time.
30 based-on: 194cee671d5e178f20c4494f41911fa8db942935
31 diff --git a/Makefile.in b/Makefile.in
34 @@ -29,14 +29,14 @@ SHELL=/bin/sh
38 -SIMD_x86_64=simd-checksum-x86_64.o
39 +SIMD_x86_64=simd-checksum-x86_64.o simd-md5-parallel-x86_64.o
40 ASM_x86_64=lib/md5-asm-x86_64.o
42 GENFILES=configure.sh aclocal.m4 config.h.in proto.h proto.h-tstamp rsync.1 rsync.1.html \
43 rsync-ssl.1 rsync-ssl.1.html rsyncd.conf.5 rsyncd.conf.5.html
44 HEADERS=byteorder.h config.h errcode.h proto.h rsync.h ifuncs.h itypes.h inums.h \
45 lib/pool_alloc.h lib/mdigest.h lib/md-defines.h version.h
46 -LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o \
47 +LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o lib/md5p8.o \
48 lib/permstring.o lib/pool_alloc.o lib/sysacls.o lib/sysxattrs.o @LIBOBJS@
49 zlib_OBJS=zlib/deflate.o zlib/inffast.o zlib/inflate.o zlib/inftrees.o \
50 zlib/trees.o zlib/zutil.o zlib/adler32.o zlib/compress.o zlib/crc32.o
51 @@ -135,6 +135,9 @@ rounding.h: rounding.c rsync.h proto.h
52 simd-checksum-x86_64.o: simd-checksum-x86_64.cpp
53 @$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-checksum-x86_64.cpp
55 +simd-md5-parallel-x86_64.o: simd-md5-parallel-x86_64.cpp
56 + @$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-md5-parallel-x86_64.cpp
58 lib/md5-asm-x86_64.o: lib/md5-asm-x86_64.S config.h lib/md-defines.h
59 @$(srcdir)/cmdormsg disable-asm $(CC) -I. @NOEXECSTACK@ -c -o $@ $(srcdir)/lib/md5-asm-x86_64.S
61 diff --git a/checksum.c b/checksum.c
64 @@ -52,6 +52,7 @@ struct name_num_obj valid_checksums = {
65 { CSUM_XXH64, "xxh64", NULL },
66 { CSUM_XXH64, "xxhash", NULL },
68 + { CSUM_MD5P8, "md5p8", NULL },
69 { CSUM_MD5, "md5", NULL },
70 { CSUM_MD4, "md4", NULL },
71 { CSUM_NONE, "none", NULL },
72 @@ -141,6 +142,7 @@ int csum_len_for_type(int cst, BOOL flist_csum)
75 return MD4_DIGEST_LEN;
78 return MD5_DIGEST_LEN;
80 @@ -167,6 +169,7 @@ int canonical_checksum(int csum_type)
88 @@ -179,7 +182,9 @@ int canonical_checksum(int csum_type)
92 -#ifndef HAVE_SIMD /* See simd-checksum-*.cpp. */
93 +#ifdef HAVE_SIMD /* See simd-checksum-*.cpp. */
94 +#define get_checksum2 get_checksum2_nosimd
97 a simple 32 bit checksum that can be updated from either end
98 (inspired by Mark Adler's Adler-32 checksum)
99 @@ -200,9 +205,18 @@ uint32 get_checksum1(char *buf1, int32 len)
101 return (s1 & 0xffff) + (s2 << 16);
104 +void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
108 +void checksum2_disable_prefetch()
113 -void get_checksum2(char *buf, int32 len, char *sum)
114 +/* Renamed to get_checksum2_nosimd() with HAVE_SIMD */
115 +void get_checksum2(char *buf, int32 len, char *sum, UNUSED(OFF_T prefetch_offset))
117 switch (xfersum_type) {
118 #ifdef SUPPORT_XXHASH
119 @@ -221,6 +235,7 @@ void get_checksum2(char *buf, int32 len, char *sum)
123 + case CSUM_MD5P8: /* == CSUM_MD5 for checksum2 */
127 @@ -373,6 +388,21 @@ void file_checksum(const char *fname, const STRUCT_STAT *st_p, char *sum)
136 + for (i = 0; i + CHUNK_SIZE <= len; i += CHUNK_SIZE)
137 + MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, CHUNK_SIZE), CHUNK_SIZE);
139 + remainder = (int32)(len - i);
141 + MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, remainder), remainder);
143 + MD5P8_Final((uchar *)sum, &m5p8);
149 @@ -445,6 +475,7 @@ static union {
153 +static MD5P8_CTX m5p8;
154 #ifdef SUPPORT_XXHASH
155 static XXH64_state_t* xxh64_state;
157 @@ -481,6 +512,9 @@ void sum_init(int csum_type, int seed)
158 XXH3_128bits_reset(xxh3_state);
167 @@ -531,6 +565,9 @@ void sum_update(const char *p, int32 len)
168 XXH3_128bits_update(xxh3_state, p, len);
172 + MD5P8_Update(&m5p8, (uchar *)p, len);
175 MD5_Update(&ctx.m5, (uchar *)p, len);
177 @@ -596,6 +633,9 @@ int sum_end(char *sum)
182 + MD5P8_Final((uchar *)sum, &m5p8);
185 MD5_Final((uchar *)sum, &ctx.m5);
187 diff --git a/generator.c b/generator.c
190 @@ -708,10 +708,12 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
191 if (append_mode > 0 && f_copy < 0)
196 mapbuf = map_file(fd, len, MAX_MAP_SIZE, sum.blength);
198 + checksum2_enable_prefetch(mapbuf, len, sum.blength);
203 for (i = 0; i < sum.count; i++) {
204 int32 n1 = (int32)MIN(len, (OFF_T)sum.blength);
205 @@ -729,7 +731,7 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
208 sum1 = get_checksum1(map, n1);
209 - get_checksum2(map, n1, sum2);
210 + get_checksum2(map, n1, sum2, offset - n1);
212 if (DEBUG_GTE(DELTASUM, 3)) {
214 @@ -741,8 +743,10 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
215 write_buf(f_out, sum2, sum.s2length);
221 + checksum2_disable_prefetch();
226 diff --git a/lib/md-defines.h b/lib/md-defines.h
227 --- a/lib/md-defines.h
228 +++ b/lib/md-defines.h
231 #define CSUM_XXH3_64 7
232 #define CSUM_XXH3_128 8
233 +#define CSUM_MD5P8 9
234 diff --git a/lib/md5p8.c b/lib/md5p8.c
240 + * MD5-based hash friendly to parallel processing, reference implementation
242 + * Author: Jorrit Jongma, 2020
244 + * Released in the public domain falling back to the MIT license
245 + * ( http://www.opensource.org/licenses/MIT ) in case public domain does not
246 + * apply in your country.
249 + * MD5P8 is an MD5-based hash friendly to parallel processing. The input
250 + * stream is divided into 8 independent streams. For each 512 bytes of input,
251 + * the first 64 bytes are sent to the first stream, the second 64 bytes to
252 + * the second stream, etc. The input stream is padded with zeros to the next
253 + * multiple of 512 bytes, then a normal MD5 hash is computed on a buffer
254 + * containing the A, B, C, and D states of the 8 individual streams, followed
255 + * by the (unpadded) length of the input.
257 + * On non-SIMD accelerated CPUs the performance of MD5P8 is slightly lower
258 + * than normal MD5 (particularly on files smaller than 10 kB), but with
259 + * SIMD-based parallel processing it can be two to six times as fast. Even in
260 + * the best-case scenario, xxHash is still at least twice as fast and should
261 + * be preferred when available.
267 +#define MD5P8_Init MD5P8_Init_c
268 +#define MD5P8_Update MD5P8_Update_c
269 +#define MD5P8_Final MD5P8_Final_c
272 +/* each MD5_CTX needs to be 8-byte aligned */
273 +#define MD5P8_Contexts_c(ctx, index) ((MD5_CTX*)((((uintptr_t)((ctx)->context_storage) + 7) & ~7) + (index)*((sizeof(MD5_CTX) + 7) & ~7)))
275 +void MD5P8_Init(MD5P8_CTX *ctx)
278 + for (i = 0; i < 8; i++) {
279 + MD5_Init(MD5P8_Contexts_c(ctx, i));
285 +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
289 + if ((ctx->used) || (length < 64)) {
290 + int cpy = MIN(length, 64 - ctx->used);
291 + memmove(&ctx->buffer[ctx->used], input, cpy);
296 + if (ctx->used == 64) {
297 + MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), ctx->buffer, 64);
299 + ctx->next = (ctx->next + 1) % 8;
303 + while (length >= 64) {
304 + MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), &input[pos], 64);
305 + ctx->next = (ctx->next + 1) % 8;
311 + memcpy(ctx->buffer, &input[pos], length);
312 + ctx->used = length;
316 +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
319 + uint32 low = 0, high = 0, sub = ctx->used ? 64 - ctx->used : 0;
322 + memset(tmp, 0, 64);
323 + MD5P8_Update(ctx, tmp, 64 - ctx->used);
325 + memset(ctx->buffer, 0, 64);
326 + while (ctx->next != 0) {
327 + MD5P8_Update(ctx, ctx->buffer, 64);
331 + uchar state[34*4] = {0};
333 + for (i = 0; i < 8; i++) {
334 + MD5_CTX* md = MD5P8_Contexts_c(ctx, i);
336 + if (low + md->Nl < low) high++;
340 + if (low + md->totalN < low) high++;
342 + high += md->totalN2;
344 + SIVALu(state, i*16, md->A);
345 + SIVALu(state, i*16 + 4, md->B);
346 + SIVALu(state, i*16 + 8, md->C);
347 + SIVALu(state, i*16 + 12, md->D);
351 + high = (low >> 29) | (high << 3);
356 + if (low - sub > low) high--;
359 + SIVALu(state, 32*4, low);
360 + SIVALu(state, 33*4, high);
364 + MD5_Update(&md, state, 34*4);
365 + MD5_Final(digest, &md);
367 diff --git a/lib/mdigest.h b/lib/mdigest.h
370 @@ -27,3 +27,14 @@ void md5_begin(md_context *ctx);
371 void md5_update(md_context *ctx, const uchar *input, uint32 length);
372 void md5_result(md_context *ctx, uchar digest[MD5_DIGEST_LEN]);
376 + uchar context_storage[1024];
382 +void MD5P8_Init(MD5P8_CTX *ctx);
383 +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length);
384 +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
385 diff --git a/match.c b/match.c
388 @@ -164,6 +164,8 @@ static void hash_search(int f,struct sum_struct *s,
389 if (DEBUG_GTE(DELTASUM, 3))
390 rprintf(FINFO, "sum=%.8x k=%ld\n", sum, (long)k);
392 + checksum2_enable_prefetch(buf, len, s->blength);
394 offset = aligned_offset = aligned_i = 0;
396 end = len + 1 - s->sums[s->count-1].len;
397 @@ -226,7 +228,7 @@ static void hash_search(int f,struct sum_struct *s,
400 map = (schar *)map_ptr(buf,offset,l);
401 - get_checksum2((char *)map,l,sum2);
402 + get_checksum2((char *)map, l, sum2, offset);
406 @@ -268,7 +270,7 @@ static void hash_search(int f,struct sum_struct *s,
407 sum = get_checksum1((char *)map, l);
408 if (sum != s->sums[i].sum1)
410 - get_checksum2((char *)map, l, sum2);
411 + get_checksum2((char *)map, l, sum2, aligned_offset);
412 if (memcmp(sum2, s->sums[i].sum2, s->s2length) != 0)
414 /* OK, we have a re-alignment match. Bump the offset
415 @@ -335,6 +337,8 @@ static void hash_search(int f,struct sum_struct *s,
416 matched(f, s, buf, offset - s->blength, -2);
417 } while (++offset < end);
419 + checksum2_disable_prefetch();
421 matched(f, s, buf, len, -1);
422 map_ptr(buf, len-1, 1);
424 diff --git a/simd-checksum-x86_64.cpp b/simd-checksum-x86_64.cpp
425 --- a/simd-checksum-x86_64.cpp
426 +++ b/simd-checksum-x86_64.cpp
428 * use of the target attribute, selecting the fastest code path based on
429 * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
430 * GCC 4.x are not supported to ease configure.ac logic.
434 + * get_checksum2() is optimized for the case where the selected transfer
435 + * checksum is MD5. MD5 can't be made significantly faster with SIMD
436 + * instructions than the assembly version already included but SIMD
437 + * instructions can be used to hash multiple streams in parallel (see
438 + * simd-md5-parallel-x86_64.cpp for details and benchmarks). As rsync's
439 + * block-matching algorithm hashes the blocks independently (in contrast to
440 + * the whole-file checksum) this method can be employed here.
442 + * To prevent needing to modify the core rsync sources significantly, a
443 + * prefetching strategy is used. When a checksum2 is requested, the code
444 + * reads ahead several blocks, creates the MD5 hashes for each block in
445 + * parallel, returns the hash for the first block, and caches the results
446 + * for the other blocks to return in future calls to get_checksum2().
460 #include <immintrin.h>
461 @@ -480,9 +500,235 @@ uint32 get_checksum1(char *buf1, int32 len)
462 return get_checksum1_cpp(buf1, len);
466 +#if !defined(BENCHMARK_SIMD_CHECKSUM1)
468 -#ifdef BENCHMARK_SIMD_CHECKSUM1
469 +// see simd-md5-parallel-x86_64.cpp
470 +extern int md5_parallel_slots();
471 +extern int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4);
473 +#endif /* !BENCHMARK_SIMD_CHECKSUM1 */
475 +#if !defined(BENCHMARK_SIMD_CHECKSUM1) && !defined(BENCHMARK_SIMD_CHECKSUM2)
477 +#define PREFETCH_ENABLE 1 // debugging
480 +#define PREFETCH_PRINTF(f_, ...) printf((f_), ##__VA_ARGS__)
482 +#define PREFETCH_PRINTF(f_, ...) (void)0;
485 +#define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
486 +#define PREFETCH_MAX_BLOCKS 8
492 + char sum[SUM_LENGTH];
496 + struct map_struct *map;
501 + prefetch_sum_t sums[PREFETCH_MAX_BLOCKS];
504 +prefetch_t *prefetch;
506 +extern int xfersum_type;
507 +extern int checksum_seed;
508 +extern int proper_seed_order;
509 +extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);
511 +extern char *map_ptr(struct map_struct *map, OFF_T offset, int32 len);
513 +void checksum2_disable_prefetch()
516 + PREFETCH_PRINTF("checksum2_disable_prefetch\n");
522 +void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
524 +#ifdef PREFETCH_ENABLE
525 + checksum2_disable_prefetch();
526 + int slots = md5_parallel_slots();
527 + if ((xfersum_type == CSUM_MD5 || xfersum_type == CSUM_MD5P8) && slots > 1 && len >= blocklen * PREFETCH_MAX_BLOCKS && blocklen >= PREFETCH_MIN_LEN) {
528 + prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
529 + memset(prefetch, 0, sizeof(prefetch_t));
530 + prefetch->map = map;
531 + prefetch->len = len;
532 + prefetch->last = 0;
533 + prefetch->blocklen = blocklen;
534 + prefetch->blocks = MIN(PREFETCH_MAX_BLOCKS, slots);
535 + PREFETCH_PRINTF("checksum2_enable_prefetch len:%ld blocklen:%d blocks:%d\n", prefetch->len, prefetch->blocklen, prefetch->blocks);
540 +static inline void checksum2_reset_prefetch()
542 + for (int i = 0; i < PREFETCH_MAX_BLOCKS; i++) {
543 + prefetch->sums[i].in_use = 0;
547 +static int get_checksum2_prefetched(int32 len, char* sum, OFF_T prefetch_offset)
549 + if (prefetch->sums[0].in_use) {
550 + if ((prefetch->sums[0].offset == prefetch_offset) && (prefetch->sums[0].len == len)) {
551 + memcpy(sum, prefetch->sums[0].sum, SUM_LENGTH);
552 + for (int i = 0; i < PREFETCH_MAX_BLOCKS - 1; i++) {
553 + prefetch->sums[i] = prefetch->sums[i + 1];
555 + prefetch->sums[PREFETCH_MAX_BLOCKS - 1].in_use = 0;
556 + PREFETCH_PRINTF("checksum2_prefetch HIT len:%d offset:%ld\n", len, prefetch_offset);
559 + // unexpected access, reset cache
560 + PREFETCH_PRINTF("checksum2_prefetch MISS len:%d offset:%ld\n", len, prefetch_offset);
561 + checksum2_reset_prefetch();
567 +static int checksum2_perform_prefetch(OFF_T prefetch_offset)
569 + int blocks = MIN(MAX(1, (prefetch->len + prefetch->blocklen - 1) / prefetch->blocklen), prefetch->blocks);
570 + if (blocks < 2) return 0; // fall through to non-simd, probably faster
574 + for (i = 0; i < blocks; i++) {
575 + prefetch->sums[i].offset = prefetch_offset + total;
576 + prefetch->sums[i].len = MIN(prefetch->blocklen, prefetch->len - prefetch_offset - total);
577 + prefetch->sums[i].in_use = 0;
578 + total += prefetch->sums[i].len;
580 + for (; i < PREFETCH_MAX_BLOCKS; i++) {
581 + prefetch->sums[i].in_use = 0;
585 + SIVALu(seedbuf, 0, checksum_seed);
587 + PREFETCH_PRINTF("checksum2_perform_prefetch pos:%ld len:%d blocks:%d\n", prefetch_offset, total, blocks);
588 + char* mapbuf = map_ptr(prefetch->map, prefetch_offset, total);
589 + char* bufs[PREFETCH_MAX_BLOCKS] = {0};
590 + int lens[PREFETCH_MAX_BLOCKS] = {0};
591 + char* sums[PREFETCH_MAX_BLOCKS] = {0};
592 + for (i = 0; i < blocks; i++) {
593 + bufs[i] = mapbuf + prefetch->sums[i].offset - prefetch_offset;
594 + lens[i] = prefetch->sums[i].len;
595 + sums[i] = prefetch->sums[i].sum;
597 + if (md5_parallel(blocks, bufs, lens, sums, (proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL, (!proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL)) {
598 + for (i = 0; i < blocks; i++) {
599 + prefetch->sums[i].in_use = 1;
603 + // this should never be, abort
604 + PREFETCH_PRINTF("checksum2_perform_prefetch PMD5 ABORT\n");
605 + checksum2_disable_prefetch();
610 +void get_checksum2(char *buf, int32 len, char *sum, OFF_T prefetch_offset)
613 + PREFETCH_PRINTF("get_checksum2 %d @ %ld\n", len, prefetch_offset);
614 + OFF_T last = prefetch->last;
615 + prefetch->last = prefetch_offset;
616 + if ((prefetch_offset != 0) && (prefetch_offset != last + prefetch->blocklen)) {
617 + // we're looking around trying to align blocks, prefetching will slow things down
618 + PREFETCH_PRINTF("get_checksum2 SEEK\n");
619 + checksum2_reset_prefetch();
620 + } else if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
623 + } else if (checksum2_perform_prefetch(prefetch_offset)) {
624 + if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
625 + // hit; should always be as we just fetched this data
628 + // this should never be, abort
629 + PREFETCH_PRINTF("get_checksum2 MISSING DATA ABORT\n");
630 + checksum2_disable_prefetch();
634 + get_checksum2_nosimd(buf, len, sum, prefetch_offset);
636 +#endif /* !BENCHMARK_SIMD_CHECKSUM1 && !BENCHMARK_SIMD_CHECKSUM2 */
640 +/* Benchmark compilation
642 + The get_checksum1() benchmark runs through all available code paths in a
643 + single execution, the get_checksum2()/MD5 and MD5P8 benchmark needs to be
644 + recompiled for each code path (it always uses the fastest path available
645 + on the current CPU otherwise). Note that SSE2/AVX2 MD5 optimizations will
646 + be used when applicable regardless of rsync being built with OpenSSL.
648 + Something like the following should compile and run the benchmarks:
653 + export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti"
658 + export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti -fno-slp-vectorize"
662 + export CONF_EXTRA="--disable-md2man --disable-zstd --disable-lz4 --disable-xxhash"
663 + export CXX_CSUM1="$CXX_BASE simd-checksum-x86_64.cpp"
664 + export CXX_MD5P="$CXX_BASE -c -o simd-md5-parallel-x86_64.o simd-md5-parallel-x86_64.cpp"
665 + export CXX_CSUM2="$CXX_BASE simd-checksum-x86_64.cpp simd-md5-parallel-x86_64.o lib/md5.o lib/md5p8.o lib/md5-asm-x86_64.o"
669 + ./configure --disable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
671 + $CXX -DBENCHMARK_SIMD_CHECKSUM1 $CXX_CSUM1 -o bench_csum1.all
673 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
674 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.asm
676 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_SSE2 $CXX_MD5P
677 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.sse2
679 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_AVX2 $CXX_MD5P
680 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.avx2
682 + ./configure --enable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
684 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
685 + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.openssl -lcrypto
689 + ./bench_csum2.openssl
695 +#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
696 #pragma clang optimize off
697 #pragma GCC push_options
698 #pragma GCC optimize ("O0")
699 @@ -493,7 +739,9 @@ uint32 get_checksum1(char *buf1, int32 len)
700 #ifndef CLOCK_MONOTONIC_RAW
701 #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
703 +#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
705 +#ifdef BENCHMARK_SIMD_CHECKSUM1
706 static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
707 struct timespec start, end;
709 @@ -509,7 +757,7 @@ static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int
710 clock_gettime(CLOCK_MONOTONIC_RAW, &end);
711 us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
712 cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
713 - printf("%-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
714 + printf("CSUM1 :: %-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
717 static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
718 @@ -533,10 +781,108 @@ int main() {
722 +#endif /* BENCHMARK_SIMD_CHECKSUM1 */
724 +#ifdef BENCHMARK_SIMD_CHECKSUM2
725 +static void benchmark(const char* desc, void (*func)(char* buf, int32 len, char* sum_out), void (*func2)(char* buf, int32 len, char* sum_out), char* buf, int32 len, int streams) {
726 + struct timespec start, end;
728 + unsigned char cs1[16];
729 + unsigned char cs2[16];
732 + clock_gettime(CLOCK_MONOTONIC_RAW, &start);
733 + for (i = 0; i < ROUNDS; i++) {
734 + func(buf, len, (char*)cs1);
736 + clock_gettime(CLOCK_MONOTONIC_RAW, &end);
737 + us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
739 + func2(buf, len, (char*)cs2);
741 + float perf = us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0;
742 + printf("CSUM2 :: %-7s :: %5.0f to %5.0f MB/s :: ", desc, perf, perf * streams);
743 + for (i = 0; i < 16; i++) {
744 + printf("%02x", cs1[i] & 0xFF);
747 + for (i = 0; i < 16; i++) {
748 + printf("%02x", cs2[i] & 0xFF);
753 +static void benchmark_inner(char* buf, int32 len, char* sum_out) {
754 + // This should produce the same output for different optimizations
755 + // levels, not the same as sanity_check()
757 + char* bufs[8] = {0};
759 + char* sums[8] = {0};
764 + md5_parallel(1, bufs, lens, sums, NULL, NULL);
768 +extern void MD5P8_Init_c(MD5P8_CTX *ctx);
769 +extern void MD5P8_Update_c(MD5P8_CTX *ctx, const uchar *input, uint32 length);
770 +extern void MD5P8_Final_c(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
773 +static void sanity_check(char* buf, int32 len, char* sum_out) {
774 + // This should produce the same output for different optimizations
775 + // levels, not the same as benchmark_inner()
776 + if (md5_parallel_slots() <= 1) {
778 + MD5P8_Init_c(&m5p8);
779 + MD5P8_Update_c(&m5p8, (uchar *)buf, len);
780 + MD5P8_Final_c((uchar *)sum_out, &m5p8);
784 + MD5P8_Update(&m5p8, (uchar *)buf, len);
785 + MD5P8_Final((uchar *)sum_out, &m5p8);
790 + // This benchmarks the parallel MD5 checksum rather than get_checksum2()
791 + // as the latter would require compiling in a lot of rsync's code, but
792 + // it touches all the same internals so the performance should be nearly
796 + char* buf = (char*)malloc(BLOCK_LEN);
797 + for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
799 + const char* method = "?";
800 + switch (md5_parallel_slots()) {
801 + case 8: method = "AVX2"; break;
802 + case 4: method = "SSE2"; break;
804 + case 1: method = "OpenSSL"; break;
805 +#elif (CSUM_CHUNK == 64)
806 + case 1: method = "ASM"; break;
808 + // this won't happen unless you modified code somewhere
809 + case 1: method = "Raw-C"; break;
813 + benchmark(method, benchmark_inner, sanity_check, buf, BLOCK_LEN, md5_parallel_slots());
818 +#endif /* BENCHMARK_SIMD_CHECKSUM2 */
820 +#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
821 #pragma GCC pop_options
822 #pragma clang optimize on
823 -#endif /* BENCHMARK_SIMD_CHECKSUM1 */
824 +#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
826 #endif /* HAVE_SIMD */
827 #endif /* __cplusplus */
828 diff --git a/simd-md5-parallel-x86_64.cpp b/simd-md5-parallel-x86_64.cpp
831 +++ b/simd-md5-parallel-x86_64.cpp
834 + * SSE2/AVX2-optimized routines to process multiple MD5 streams in parallel.
836 + * Original author: Nicolas Noble, 2017
837 + * Modifications: Jorrit Jongma, 2020
839 + * The original code was released in the public domain by the original author,
840 + * falling back to the MIT license ( http://www.opensource.org/licenses/MIT )
841 + * in case public domain does not apply in your country. These modifications
842 + * are likewise released in the public domain, with the same MIT license
845 + * The original publication can be found at:
847 + * https://github.com/nicolasnoble/sse-hash
850 + * Nicolas' original code has been extended to add AVX2 support, all non-SIMD
851 + * MD5 code has been removed and those code paths rerouted to use the MD5
852 + * code already present in rsync, and wrapper functions have been added. The
853 + * MD5P8 code is also new, and is the reason for the new stride parameter.
855 + * This code allows multiple independent MD5 streams to be processed in
856 + * parallel, 4 with SSE2, 8 with AVX2. While single-stream performance is
857 + * lower than that of the original C routines for MD5, the processing of
858 + * additional streams is "for free".
860 + * Single streams are rerouted to rsync's normal MD5 code as that is faster
861 + * for that case. A further optimization is possible by using SSE2 code on
862 + * AVX2-supporting CPUs when the number of streams is 2, 3, or 4. This is not
863 + * implemented here as it would require some restructuring, and in practice
864 + * the code here is only rarely called with fewer than the maximum number of
865 + * streams (typically once at the end of each checksum2'd file).
867 + * Benchmarks (in MB/s) C ASM SSE2*1 SSE2*4 AVX2*1 AVX2*8
868 + * - Intel Atom D2700 302 334 166 664 N/A N/A
869 + * - Intel i7-7700hq 351 376 289 1156 273 2184
870 + * - AMD ThreadRipper 2950x 728 784 568 2272 430 3440
884 +#ifndef BENCHMARK_SIMD_CHECKSUM2
885 +#define PMD5_ALLOW_SSE2 // debugging
886 +#define PMD5_ALLOW_AVX2 // debugging
889 +#ifdef PMD5_ALLOW_AVX2
890 +#ifndef PMD5_ALLOW_SSE2
891 +#define PMD5_ALLOW_SSE2
898 +#include <immintrin.h>
900 +/* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
904 +#define MVSTATIC static
907 +// Missing from the headers on gcc 6 and older, clang 8 and older
908 +typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
909 +typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
911 +#define PMD5_SLOTS_DEFAULT 0
912 +#define PMD5_SLOTS_SSE2 4
913 +#define PMD5_SLOTS_AVX2 8
914 +#define PMD5_SLOTS_MAX PMD5_SLOTS_AVX2
916 +#ifdef PMD5_ALLOW_SSE2
917 +__attribute__ ((target("sse2"))) MVSTATIC int pmd5_slots()
919 + return PMD5_SLOTS_SSE2;
923 +#ifdef PMD5_ALLOW_AVX2
924 +__attribute__ ((target("avx2"))) MVSTATIC int pmd5_slots()
926 + return PMD5_SLOTS_AVX2;
930 +__attribute__ ((target("default"))) MVSTATIC int pmd5_slots()
932 + return PMD5_SLOTS_DEFAULT;
935 +/* The parallel MD5 context structure. */
937 + __m128i state_sse2[4];
938 + __m256i state_avx2[4];
939 + uint64_t len[PMD5_SLOTS_MAX];
942 +/* The status returned by the various functions below. */
946 + PMD5_UNALIGNED_UPDATE,
949 +/* Initializes all slots in the given pmd5 context. */
950 +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx);
952 +/* Initializes a single slot out in the given pmd5 context. */
953 +static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot);
955 +/* Makes an MD5 update on all slots in parallel, given the same exact length on all streams.
956 + The stream pointers will be incremented accordingly.
957 + It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
958 + The argument length NEEDS to be a multiple of 64. If not, an error is returned, and the context is corrupted.
959 + Stride defaults to 64 if 0 is passed. */
960 +static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride);
962 +/* Makes an MD5 update on all slots in parallel, given different lengths.
963 + The stream pointers will be incremented accordingly.
964 + The lengths will be decreased accordingly. Not all data might be consumed.
965 + It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
966 + * The argument lengths NEED to contain only multiples of 64. If not, an error is returned, and the context is corrupted. */
967 +static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]);
969 +/* Finishes all slots at once. Fills in all digests. */
970 +static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]);
972 +/* Finishes one slot. The other slots will be unaffected. The finished slot can then continue to hash garbage using
973 + a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
974 +static pmd5_status pmd5_finish_slot(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot);
976 +/* Finishes one slot. Extra data is allowed to be passed on as an argument. Length DOESN'T need to be a
977 + multiple of 64. The other slots will be unaffected. The finished slot can then continue to hash garbage using
978 + a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
979 +static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length);
981 +/* Insert a normal MD5 context into a given slot of a given parallel MD5 context. */
982 +static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot);
984 +/* Extract a normal MD5 context from a given slot of a given parallel MD5 context. */
985 +static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot);
1004 +#define T1 0xD76AA478
1005 +#define T2 0xE8C7B756
1006 +#define T3 0x242070DB
1007 +#define T4 0xC1BDCEEE
1008 +#define T5 0xF57C0FAF
1009 +#define T6 0x4787C62A
1010 +#define T7 0xA8304613
1011 +#define T8 0xFD469501
1012 +#define T9 0x698098D8
1013 +#define T10 0x8B44F7AF
1014 +#define T11 0xFFFF5BB1
1015 +#define T12 0x895CD7BE
1016 +#define T13 0x6B901122
1017 +#define T14 0xFD987193
1018 +#define T15 0xA679438E
1019 +#define T16 0x49B40821
1020 +#define T17 0xF61E2562
1021 +#define T18 0xC040B340
1022 +#define T19 0x265E5A51
1023 +#define T20 0xE9B6C7AA
1024 +#define T21 0xD62F105D
1025 +#define T22 0x02441453
1026 +#define T23 0xD8A1E681
1027 +#define T24 0xE7D3FBC8
1028 +#define T25 0x21E1CDE6
1029 +#define T26 0xC33707D6
1030 +#define T27 0xF4D50D87
1031 +#define T28 0x455A14ED
1032 +#define T29 0xA9E3E905
1033 +#define T30 0xFCEFA3F8
1034 +#define T31 0x676F02D9
1035 +#define T32 0x8D2A4C8A
1036 +#define T33 0xFFFA3942
1037 +#define T34 0x8771F681
1038 +#define T35 0x6D9D6122
1039 +#define T36 0xFDE5380C
1040 +#define T37 0xA4BEEA44
1041 +#define T38 0x4BDECFA9
1042 +#define T39 0xF6BB4B60
1043 +#define T40 0xBEBFBC70
1044 +#define T41 0x289B7EC6
1045 +#define T42 0xEAA127FA
1046 +#define T43 0xD4EF3085
1047 +#define T44 0x04881D05
1048 +#define T45 0xD9D4D039
1049 +#define T46 0xE6DB99E5
1050 +#define T47 0x1FA27CF8
1051 +#define T48 0xC4AC5665
1052 +#define T49 0xF4292244
1053 +#define T50 0x432AFF97
1054 +#define T51 0xAB9423A7
1055 +#define T52 0xFC93A039
1056 +#define T53 0x655B59C3
1057 +#define T54 0x8F0CCC92
1058 +#define T55 0xFFEFF47D
1059 +#define T56 0x85845DD1
1060 +#define T57 0x6FA87E4F
1061 +#define T58 0xFE2CE6E0
1062 +#define T59 0xA3014314
1063 +#define T60 0x4E0811A1
1064 +#define T61 0xF7537E82
1065 +#define T62 0xBD3AF235
1066 +#define T63 0x2AD7D2BB
1067 +#define T64 0xEB86D391
1069 +#define ROTL_SSE2(x, n) { \
1071 + s = _mm_srli_epi32(x, 32 - n); \
1072 + x = _mm_slli_epi32(x, n); \
1073 + x = _mm_or_si128(x, s); \
1076 +#define ROTL_AVX2(x, n) { \
1078 + s = _mm256_srli_epi32(x, 32 - n); \
1079 + x = _mm256_slli_epi32(x, n); \
1080 + x = _mm256_or_si256(x, s); \
1083 +#define F_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, y), _mm_andnot_si128(x, z))
1084 +#define G_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, z), _mm_andnot_si128(z, y))
1085 +#define H_SSE2(x, y, z) _mm_xor_si128(_mm_xor_si128(x, y), z)
1086 +#define I_SSE2(x, y, z) _mm_xor_si128(y, _mm_or_si128(x, _mm_andnot_si128(z, _mm_set1_epi32(0xffffffff))))
1088 +#define F_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, y), _mm256_andnot_si256(x, z))
1089 +#define G_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, z), _mm256_andnot_si256(z, y))
1090 +#define H_AVX2(x, y, z) _mm256_xor_si256(_mm256_xor_si256(x, y), z)
1091 +#define I_AVX2(x, y, z) _mm256_xor_si256(y, _mm256_or_si256(x, _mm256_andnot_si256(z, _mm256_set1_epi32(0xffffffff))))
1093 +#define SET_SSE2(step, a, b, c, d, x, s, ac) { \
1094 + a = _mm_add_epi32(_mm_add_epi32(a, _mm_add_epi32(x, _mm_set1_epi32(T##ac))), step##_SSE2(b, c, d)); \
1095 + ROTL_SSE2(a, s); \
1096 + a = _mm_add_epi32(a, b); \
1099 +#define SET_AVX2(step, a, b, c, d, x, s, ac) { \
1100 + a = _mm256_add_epi32(_mm256_add_epi32(a, _mm256_add_epi32(x, _mm256_set1_epi32(T##ac))), step##_AVX2(b, c, d)); \
1101 + ROTL_AVX2(a, s); \
1102 + a = _mm256_add_epi32(a, b); \
1105 +#define IA 0x67452301
1106 +#define IB 0xefcdab89
1107 +#define IC 0x98badcfe
1108 +#define ID 0x10325476
1110 +#define GET_MD5_DATA(dest, src, pos) \
1112 + ((uint32_t) src[pos + 0]) << 0 | \
1113 + ((uint32_t) src[pos + 1]) << 8 | \
1114 + ((uint32_t) src[pos + 2]) << 16 | \
1115 + ((uint32_t) src[pos + 3]) << 24
1117 +#define GET_PMD5_DATA_SSE2(dest, src, pos) { \
1118 + uint32_t v0, v1, v2, v3; \
1119 + GET_MD5_DATA(v0, src[0], pos); \
1120 + GET_MD5_DATA(v1, src[1], pos); \
1121 + GET_MD5_DATA(v2, src[2], pos); \
1122 + GET_MD5_DATA(v3, src[3], pos); \
1123 + dest = _mm_setr_epi32(v0, v1, v2, v3); \
1126 +#define GET_PMD5_DATA_AVX2(dest, src, pos) { \
1127 + uint32_t v0, v1, v2, v3; \
1128 + uint32_t v4, v5, v6, v7; \
1129 + GET_MD5_DATA(v0, src[0], pos); \
1130 + GET_MD5_DATA(v1, src[1], pos); \
1131 + GET_MD5_DATA(v2, src[2], pos); \
1132 + GET_MD5_DATA(v3, src[3], pos); \
1133 + GET_MD5_DATA(v4, src[4], pos); \
1134 + GET_MD5_DATA(v5, src[5], pos); \
1135 + GET_MD5_DATA(v6, src[6], pos); \
1136 + GET_MD5_DATA(v7, src[7], pos); \
1137 + dest = _mm256_setr_epi32(v0, v1, v2, v3, \
1138 + v4, v5, v6, v7); \
1141 +#define PUT_MD5_DATA(dest, val, pos) { \
1142 + dest[pos + 0] = (val >> 0) & 0xff; \
1143 + dest[pos + 1] = (val >> 8) & 0xff; \
1144 + dest[pos + 2] = (val >> 16) & 0xff; \
1145 + dest[pos + 3] = (val >> 24) & 0xff; \
1148 +const static uint8_t md5_padding[64] = {
1149 + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1150 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1151 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1152 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1153 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1154 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1155 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1156 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1159 +#ifdef PMD5_ALLOW_SSE2
1160 +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
1163 + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
1167 + ctx->state_sse2[0] = _mm_set1_epi32(IA);
1168 + ctx->state_sse2[1] = _mm_set1_epi32(IB);
1169 + ctx->state_sse2[2] = _mm_set1_epi32(IC);
1170 + ctx->state_sse2[3] = _mm_set1_epi32(ID);
1172 + return PMD5_SUCCESS;
1176 +#ifdef PMD5_ALLOW_AVX2
1177 +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
1180 + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
1184 + ctx->state_avx2[0] = _mm256_set1_epi32(IA);
1185 + ctx->state_avx2[1] = _mm256_set1_epi32(IB);
1186 + ctx->state_avx2[2] = _mm256_set1_epi32(IC);
1187 + ctx->state_avx2[3] = _mm256_set1_epi32(ID);
1189 + return PMD5_SUCCESS;
1193 +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
1195 + return PMD5_INVALID_SLOT;
1198 +#ifdef PMD5_ALLOW_SSE2
1199 +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
1201 + if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
1202 + return PMD5_INVALID_SLOT;
1204 + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
1207 + for (i = 0; i < 4; i++) {
1208 + _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
1216 + for (i = 0; i < 4; i++) {
1217 + ctx->state_sse2[i] = _mm_loadu_si128((__m128i_u*)v[i]);
1220 + return PMD5_SUCCESS;
1224 +#ifdef PMD5_ALLOW_AVX2
1225 +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
1227 + if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
1228 + return PMD5_INVALID_SLOT;
1230 + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
1233 + for (i = 0; i < 4; i++) {
1234 + _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
1242 + for (i = 0; i < 4; i++) {
1243 + ctx->state_avx2[i] = _mm256_lddqu_si256((__m256i_u*)v[i]);
1246 + return PMD5_SUCCESS;
1250 +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
1252 + return PMD5_INVALID_SLOT;
1255 +#ifdef PMD5_ALLOW_SSE2
1256 +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
1258 + if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
1259 + return PMD5_INVALID_SLOT;
1261 + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
1264 + for (i = 0; i < 4; i++) {
1265 + _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
1273 + return PMD5_SUCCESS;
1277 +#ifdef PMD5_ALLOW_AVX2
1278 +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
1280 + if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
1281 + return PMD5_INVALID_SLOT;
1283 + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
1286 + for (i = 0; i < 4; i++) {
1287 + _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
1295 + return PMD5_SUCCESS;
1299 +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
1301 + return PMD5_INVALID_SLOT;
1304 +static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot)
1306 + return pmd5_set_slot(ctx, slot, IA, IB, IC, ID);
1309 +#ifdef PMD5_ALLOW_SSE2
1310 +__attribute__ ((target("sse2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
1312 + __m128i W[MD5_DIGEST_LEN], a, b, c, d;
1314 + GET_PMD5_DATA_SSE2(W[ 0], data, 0);
1315 + GET_PMD5_DATA_SSE2(W[ 1], data, 4);
1316 + GET_PMD5_DATA_SSE2(W[ 2], data, 8);
1317 + GET_PMD5_DATA_SSE2(W[ 3], data, 12);
1318 + GET_PMD5_DATA_SSE2(W[ 4], data, 16);
1319 + GET_PMD5_DATA_SSE2(W[ 5], data, 20);
1320 + GET_PMD5_DATA_SSE2(W[ 6], data, 24);
1321 + GET_PMD5_DATA_SSE2(W[ 7], data, 28);
1322 + GET_PMD5_DATA_SSE2(W[ 8], data, 32);
1323 + GET_PMD5_DATA_SSE2(W[ 9], data, 36);
1324 + GET_PMD5_DATA_SSE2(W[10], data, 40);
1325 + GET_PMD5_DATA_SSE2(W[11], data, 44);
1326 + GET_PMD5_DATA_SSE2(W[12], data, 48);
1327 + GET_PMD5_DATA_SSE2(W[13], data, 52);
1328 + GET_PMD5_DATA_SSE2(W[14], data, 56);
1329 + GET_PMD5_DATA_SSE2(W[15], data, 60);
1331 + a = ctx->state_sse2[0];
1332 + b = ctx->state_sse2[1];
1333 + c = ctx->state_sse2[2];
1334 + d = ctx->state_sse2[3];
1336 + SET_SSE2(F, a, b, c, d, W[ 0], S11, 1);
1337 + SET_SSE2(F, d, a, b, c, W[ 1], S12, 2);
1338 + SET_SSE2(F, c, d, a, b, W[ 2], S13, 3);
1339 + SET_SSE2(F, b, c, d, a, W[ 3], S14, 4);
1340 + SET_SSE2(F, a, b, c, d, W[ 4], S11, 5);
1341 + SET_SSE2(F, d, a, b, c, W[ 5], S12, 6);
1342 + SET_SSE2(F, c, d, a, b, W[ 6], S13, 7);
1343 + SET_SSE2(F, b, c, d, a, W[ 7], S14, 8);
1344 + SET_SSE2(F, a, b, c, d, W[ 8], S11, 9);
1345 + SET_SSE2(F, d, a, b, c, W[ 9], S12, 10);
1346 + SET_SSE2(F, c, d, a, b, W[10], S13, 11);
1347 + SET_SSE2(F, b, c, d, a, W[11], S14, 12);
1348 + SET_SSE2(F, a, b, c, d, W[12], S11, 13);
1349 + SET_SSE2(F, d, a, b, c, W[13], S12, 14);
1350 + SET_SSE2(F, c, d, a, b, W[14], S13, 15);
1351 + SET_SSE2(F, b, c, d, a, W[15], S14, 16);
1353 + SET_SSE2(G, a, b, c, d, W[ 1], S21, 17);
1354 + SET_SSE2(G, d, a, b, c, W[ 6], S22, 18);
1355 + SET_SSE2(G, c, d, a, b, W[11], S23, 19);
1356 + SET_SSE2(G, b, c, d, a, W[ 0], S24, 20);
1357 + SET_SSE2(G, a, b, c, d, W[ 5], S21, 21);
1358 + SET_SSE2(G, d, a, b, c, W[10], S22, 22);
1359 + SET_SSE2(G, c, d, a, b, W[15], S23, 23);
1360 + SET_SSE2(G, b, c, d, a, W[ 4], S24, 24);
1361 + SET_SSE2(G, a, b, c, d, W[ 9], S21, 25);
1362 + SET_SSE2(G, d, a, b, c, W[14], S22, 26);
1363 + SET_SSE2(G, c, d, a, b, W[ 3], S23, 27);
1364 + SET_SSE2(G, b, c, d, a, W[ 8], S24, 28);
1365 + SET_SSE2(G, a, b, c, d, W[13], S21, 29);
1366 + SET_SSE2(G, d, a, b, c, W[ 2], S22, 30);
1367 + SET_SSE2(G, c, d, a, b, W[ 7], S23, 31);
1368 + SET_SSE2(G, b, c, d, a, W[12], S24, 32);
1370 + SET_SSE2(H, a, b, c, d, W[ 5], S31, 33);
1371 + SET_SSE2(H, d, a, b, c, W[ 8], S32, 34);
1372 + SET_SSE2(H, c, d, a, b, W[11], S33, 35);
1373 + SET_SSE2(H, b, c, d, a, W[14], S34, 36);
1374 + SET_SSE2(H, a, b, c, d, W[ 1], S31, 37);
1375 + SET_SSE2(H, d, a, b, c, W[ 4], S32, 38);
1376 + SET_SSE2(H, c, d, a, b, W[ 7], S33, 39);
1377 + SET_SSE2(H, b, c, d, a, W[10], S34, 40);
1378 + SET_SSE2(H, a, b, c, d, W[13], S31, 41);
1379 + SET_SSE2(H, d, a, b, c, W[ 0], S32, 42);
1380 + SET_SSE2(H, c, d, a, b, W[ 3], S33, 43);
1381 + SET_SSE2(H, b, c, d, a, W[ 6], S34, 44);
1382 + SET_SSE2(H, a, b, c, d, W[ 9], S31, 45);
1383 + SET_SSE2(H, d, a, b, c, W[12], S32, 46);
1384 + SET_SSE2(H, c, d, a, b, W[15], S33, 47);
1385 + SET_SSE2(H, b, c, d, a, W[ 2], S34, 48);
1387 + SET_SSE2(I, a, b, c, d, W[ 0], S41, 49);
1388 + SET_SSE2(I, d, a, b, c, W[ 7], S42, 50);
1389 + SET_SSE2(I, c, d, a, b, W[14], S43, 51);
1390 + SET_SSE2(I, b, c, d, a, W[ 5], S44, 52);
1391 + SET_SSE2(I, a, b, c, d, W[12], S41, 53);
1392 + SET_SSE2(I, d, a, b, c, W[ 3], S42, 54);
1393 + SET_SSE2(I, c, d, a, b, W[10], S43, 55);
1394 + SET_SSE2(I, b, c, d, a, W[ 1], S44, 56);
1395 + SET_SSE2(I, a, b, c, d, W[ 8], S41, 57);
1396 + SET_SSE2(I, d, a, b, c, W[15], S42, 58);
1397 + SET_SSE2(I, c, d, a, b, W[ 6], S43, 59);
1398 + SET_SSE2(I, b, c, d, a, W[13], S44, 60);
1399 + SET_SSE2(I, a, b, c, d, W[ 4], S41, 61);
1400 + SET_SSE2(I, d, a, b, c, W[11], S42, 62);
1401 + SET_SSE2(I, c, d, a, b, W[ 2], S43, 63);
1402 + SET_SSE2(I, b, c, d, a, W[ 9], S44, 64);
1404 + ctx->state_sse2[0] = _mm_add_epi32(ctx->state_sse2[0], a);
1405 + ctx->state_sse2[1] = _mm_add_epi32(ctx->state_sse2[1], b);
1406 + ctx->state_sse2[2] = _mm_add_epi32(ctx->state_sse2[2], c);
1407 + ctx->state_sse2[3] = _mm_add_epi32(ctx->state_sse2[3], d);
1411 +#ifdef PMD5_ALLOW_AVX2
1412 +__attribute__ ((target("avx2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
1414 + __m256i W[MD5_DIGEST_LEN], a, b, c, d;
1416 + GET_PMD5_DATA_AVX2(W[ 0], data, 0);
1417 + GET_PMD5_DATA_AVX2(W[ 1], data, 4);
1418 + GET_PMD5_DATA_AVX2(W[ 2], data, 8);
1419 + GET_PMD5_DATA_AVX2(W[ 3], data, 12);
1420 + GET_PMD5_DATA_AVX2(W[ 4], data, 16);
1421 + GET_PMD5_DATA_AVX2(W[ 5], data, 20);
1422 + GET_PMD5_DATA_AVX2(W[ 6], data, 24);
1423 + GET_PMD5_DATA_AVX2(W[ 7], data, 28);
1424 + GET_PMD5_DATA_AVX2(W[ 8], data, 32);
1425 + GET_PMD5_DATA_AVX2(W[ 9], data, 36);
1426 + GET_PMD5_DATA_AVX2(W[10], data, 40);
1427 + GET_PMD5_DATA_AVX2(W[11], data, 44);
1428 + GET_PMD5_DATA_AVX2(W[12], data, 48);
1429 + GET_PMD5_DATA_AVX2(W[13], data, 52);
1430 + GET_PMD5_DATA_AVX2(W[14], data, 56);
1431 + GET_PMD5_DATA_AVX2(W[15], data, 60);
1433 + a = ctx->state_avx2[0];
1434 + b = ctx->state_avx2[1];
1435 + c = ctx->state_avx2[2];
1436 + d = ctx->state_avx2[3];
1438 + SET_AVX2(F, a, b, c, d, W[ 0], S11, 1);
1439 + SET_AVX2(F, d, a, b, c, W[ 1], S12, 2);
1440 + SET_AVX2(F, c, d, a, b, W[ 2], S13, 3);
1441 + SET_AVX2(F, b, c, d, a, W[ 3], S14, 4);
1442 + SET_AVX2(F, a, b, c, d, W[ 4], S11, 5);
1443 + SET_AVX2(F, d, a, b, c, W[ 5], S12, 6);
1444 + SET_AVX2(F, c, d, a, b, W[ 6], S13, 7);
1445 + SET_AVX2(F, b, c, d, a, W[ 7], S14, 8);
1446 + SET_AVX2(F, a, b, c, d, W[ 8], S11, 9);
1447 + SET_AVX2(F, d, a, b, c, W[ 9], S12, 10);
1448 + SET_AVX2(F, c, d, a, b, W[10], S13, 11);
1449 + SET_AVX2(F, b, c, d, a, W[11], S14, 12);
1450 + SET_AVX2(F, a, b, c, d, W[12], S11, 13);
1451 + SET_AVX2(F, d, a, b, c, W[13], S12, 14);
1452 + SET_AVX2(F, c, d, a, b, W[14], S13, 15);
1453 + SET_AVX2(F, b, c, d, a, W[15], S14, 16);
1455 + SET_AVX2(G, a, b, c, d, W[ 1], S21, 17);
1456 + SET_AVX2(G, d, a, b, c, W[ 6], S22, 18);
1457 + SET_AVX2(G, c, d, a, b, W[11], S23, 19);
1458 + SET_AVX2(G, b, c, d, a, W[ 0], S24, 20);
1459 + SET_AVX2(G, a, b, c, d, W[ 5], S21, 21);
1460 + SET_AVX2(G, d, a, b, c, W[10], S22, 22);
1461 + SET_AVX2(G, c, d, a, b, W[15], S23, 23);
1462 + SET_AVX2(G, b, c, d, a, W[ 4], S24, 24);
1463 + SET_AVX2(G, a, b, c, d, W[ 9], S21, 25);
1464 + SET_AVX2(G, d, a, b, c, W[14], S22, 26);
1465 + SET_AVX2(G, c, d, a, b, W[ 3], S23, 27);
1466 + SET_AVX2(G, b, c, d, a, W[ 8], S24, 28);
1467 + SET_AVX2(G, a, b, c, d, W[13], S21, 29);
1468 + SET_AVX2(G, d, a, b, c, W[ 2], S22, 30);
1469 + SET_AVX2(G, c, d, a, b, W[ 7], S23, 31);
1470 + SET_AVX2(G, b, c, d, a, W[12], S24, 32);
1472 + SET_AVX2(H, a, b, c, d, W[ 5], S31, 33);
1473 + SET_AVX2(H, d, a, b, c, W[ 8], S32, 34);
1474 + SET_AVX2(H, c, d, a, b, W[11], S33, 35);
1475 + SET_AVX2(H, b, c, d, a, W[14], S34, 36);
1476 + SET_AVX2(H, a, b, c, d, W[ 1], S31, 37);
1477 + SET_AVX2(H, d, a, b, c, W[ 4], S32, 38);
1478 + SET_AVX2(H, c, d, a, b, W[ 7], S33, 39);
1479 + SET_AVX2(H, b, c, d, a, W[10], S34, 40);
1480 + SET_AVX2(H, a, b, c, d, W[13], S31, 41);
1481 + SET_AVX2(H, d, a, b, c, W[ 0], S32, 42);
1482 + SET_AVX2(H, c, d, a, b, W[ 3], S33, 43);
1483 + SET_AVX2(H, b, c, d, a, W[ 6], S34, 44);
1484 + SET_AVX2(H, a, b, c, d, W[ 9], S31, 45);
1485 + SET_AVX2(H, d, a, b, c, W[12], S32, 46);
1486 + SET_AVX2(H, c, d, a, b, W[15], S33, 47);
1487 + SET_AVX2(H, b, c, d, a, W[ 2], S34, 48);
1489 + SET_AVX2(I, a, b, c, d, W[ 0], S41, 49);
1490 + SET_AVX2(I, d, a, b, c, W[ 7], S42, 50);
1491 + SET_AVX2(I, c, d, a, b, W[14], S43, 51);
1492 + SET_AVX2(I, b, c, d, a, W[ 5], S44, 52);
1493 + SET_AVX2(I, a, b, c, d, W[12], S41, 53);
1494 + SET_AVX2(I, d, a, b, c, W[ 3], S42, 54);
1495 + SET_AVX2(I, c, d, a, b, W[10], S43, 55);
1496 + SET_AVX2(I, b, c, d, a, W[ 1], S44, 56);
1497 + SET_AVX2(I, a, b, c, d, W[ 8], S41, 57);
1498 + SET_AVX2(I, d, a, b, c, W[15], S42, 58);
1499 + SET_AVX2(I, c, d, a, b, W[ 6], S43, 59);
1500 + SET_AVX2(I, b, c, d, a, W[13], S44, 60);
1501 + SET_AVX2(I, a, b, c, d, W[ 4], S41, 61);
1502 + SET_AVX2(I, d, a, b, c, W[11], S42, 62);
1503 + SET_AVX2(I, c, d, a, b, W[ 2], S43, 63);
1504 + SET_AVX2(I, b, c, d, a, W[ 9], S44, 64);
1506 + ctx->state_avx2[0] = _mm256_add_epi32(ctx->state_avx2[0], a);
1507 + ctx->state_avx2[1] = _mm256_add_epi32(ctx->state_avx2[1], b);
1508 + ctx->state_avx2[2] = _mm256_add_epi32(ctx->state_avx2[2], c);
1509 + ctx->state_avx2[3] = _mm256_add_epi32(ctx->state_avx2[3], d);
1513 +__attribute__ ((target("default"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
1517 +static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride)
1519 + const uint8_t * ptrs[PMD5_SLOTS_MAX];
1521 + if (!length) return PMD5_SUCCESS;
1523 + int slots = pmd5_slots();
1525 + if (!stride) stride = 64;
1528 + for (i = 0; i < slots; i++) {
1529 + ptrs[i] = data[i];
1530 + ctx->len[i] += length;
1531 + if (!ptrs[i]) ptrs[i] = md5_padding;
1534 + while (length >= 64) {
1535 + pmd5_process(ctx, ptrs);
1537 + for (i = 0; i < slots; i++) {
1538 + if (data[i]) ptrs[i] += stride;
1542 + if (length) return PMD5_UNALIGNED_UPDATE;
1544 + for (i = 0; i < slots; i++) {
1545 + if (data[i]) data[i] = ptrs[i];
1548 + return PMD5_SUCCESS;
1551 +static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX])
1553 + uint64_t length = 0;
1554 + int slots = pmd5_slots();
1557 + for (i = 0; i < slots; i++) {
1558 + if ((length == 0) || (lengths[i] < length)) length = lengths[i];
1561 + for (i = 0; i < slots; i++) {
1562 + lengths[i] -= length;
1565 + return pmd5_update_all_simple(ctx, data, length, 0);
1568 +static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length)
1572 + if ((slot >= pmd5_slots()) || (slot < 0))
1573 + return PMD5_INVALID_SLOT;
1575 + pmd5_to_md5(pctx, &ctx, slot);
1576 + if (data && length) {
1577 + MD5_Update(&ctx, data, length);
1579 + MD5_Final(digest, &ctx);
1581 + return PMD5_SUCCESS;
1584 +static pmd5_status pmd5_finish_slot(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot)
1586 + return pmd5_finish_slot_with_extra(pctx, digest, slot, NULL, 0);
1589 +static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN])
1592 + for (i = 0; i < pmd5_slots(); i++) {
1593 + pmd5_finish_slot_with_extra(ctx, digests[i], i, NULL, 0);
1595 + return PMD5_SUCCESS;
1598 +static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot)
1600 + if ((slot >= pmd5_slots()) || (slot < 0))
1601 + return PMD5_INVALID_SLOT;
1603 + // TODO: This function ignores buffered but as-yet unhashed data. We're not using this function; just noting.
1606 + pctx->len[slot] = (ctx->Nl >> 3) + ((uint64_t)ctx->Nh << 29);
1608 + pctx->len[slot] = ctx->totalN + ((uint64_t)ctx->totalN2 << 32);
1610 + return pmd5_set_slot(pctx, slot, (uint32_t)ctx->A, (uint32_t)ctx->B, (uint32_t)ctx->C, (uint32_t)ctx->D);
1613 +static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot)
1615 + if ((slot >= pmd5_slots()) || (slot < 0))
1616 + return PMD5_INVALID_SLOT;
1621 + ctx->Nl = (pctx->len[slot] << 3) & 0xFFFFFFFF;
1622 + ctx->Nh = pctx->len[slot] >> 29;
1624 + uint32_t a, b, c, d;
1625 + pmd5_status ret = pmd5_get_slot(pctx, slot, &a, &b, &c, &d);
1626 + if (ret == PMD5_SUCCESS) {
1634 + ctx->totalN = pctx->len[slot] & 0xFFFFFFFF;
1635 + ctx->totalN2 = pctx->len[slot] >> 32;
1636 + return pmd5_get_slot(pctx, slot, &ctx->A, &ctx->B, &ctx->C, &ctx->D);
1640 +/* With GCC 10 putting these implementations inside 'extern "C"' causes an
1641 + assembler error. That worked fine on GCC 5-9 and clang 6-10...
1644 +static inline int md5_parallel_slots_cpp()
1646 + int slots = pmd5_slots();
1647 + if (slots == 0) return 1;
1651 +static inline int md5_parallel_cpp(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
1653 + int slots = md5_parallel_slots_cpp();
1654 + if ((streams < 1) || (streams > slots)) return 0;
1655 + if (pre4 && post4) return 0;
1661 + MD5_Update(&ctx, (const unsigned char*)pre4, 4);
1663 + MD5_Update(&ctx, (const unsigned char*)buf[0], len[0]);
1665 + MD5_Update(&ctx, (const unsigned char*)post4, 4);
1668 + MD5_Final((uint8_t*)sum[0], &ctx);
1674 + int active[PMD5_SLOTS_MAX];
1675 + char* buffers[PMD5_SLOTS_MAX];
1676 + uint64_t left[PMD5_SLOTS_MAX];
1677 + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
1678 + active[i] = streams > i;
1679 + if (i < streams) {
1680 + buffers[i] = buf[i];
1681 + left[i] = (uint64_t)len[i];
1683 + buffers[i] = NULL;
1687 + MD5_CTX results[PMD5_SLOTS_MAX];
1689 + pmd5_context ctx_simd;
1690 + if (pmd5_init_all(&ctx_simd) != PMD5_SUCCESS) return 0;
1693 + char temp_buffers[PMD5_SLOTS_MAX][64];
1695 + for (i = 0; i < slots; i++) {
1697 + if (left[i] < 60) {
1698 + MD5_Init(&results[i]);
1699 + MD5_Update(&results[i], (const unsigned char*)pre4, 4);
1700 + MD5_Update(&results[i], (const unsigned char*)buf[i], left[i]);
1704 + memcpy(temp_buffers[i], pre4, 4);
1705 + memcpy(temp_buffers[i] + 4, buffers[i], 60);
1714 + char* ptrs[PMD5_SLOTS_MAX];
1715 + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
1716 + ptrs[i] = &temp_buffers[i][0];
1718 + if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)ptrs, 64, 0) != PMD5_SUCCESS) {
1726 + for (i = 0; i < slots; i++) {
1727 + if (active[i] && (left[i] < 64)) {
1728 + if (pmd5_to_md5(&ctx_simd, &results[i], i) != PMD5_SUCCESS) {
1735 + uint64_t shortest = 0;
1736 + for (i = 0; i < slots; i++) {
1738 + buffers[i] = NULL;
1739 + } else if ((shortest == 0) || (left[i] < shortest)) {
1740 + shortest = left[i];
1744 + if (shortest > 0) {
1745 + shortest = shortest & ~63;
1746 + if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)buffers, shortest, 0) != PMD5_SUCCESS) {
1749 + for (i = 0; i < slots; i++) {
1751 + left[i] -= shortest;
1760 + for (i = 0; i < slots; i++) {
1761 + have_any |= active[i];
1769 + for (i = 0; i < slots; i++) {
1770 + if (i < streams) {
1771 + if (left[i] > 0) {
1772 + // buffers[i] == NULL here
1773 + MD5_Update(&results[i], (const unsigned char*)buf[i] + len[i] - left[i], left[i]);
1776 + MD5_Update(&results[i], (const unsigned char*)post4, 4);
1779 + MD5_Final((uint8_t*)sum[i], &results[i]);
1787 +// each pmd5_context needs to be 32-byte aligned
1788 +#define MD5P8_Contexts_simd(ctx, index) ((pmd5_context*)((((uintptr_t)((ctx)->context_storage) + 31) & ~31) + (index)*((sizeof(pmd5_context) + 31) & ~31)))
1790 +static inline void MD5P8_Init_cpp(MD5P8_CTX *ctx)
1793 + for (i = 0; i < (pmd5_slots() == PMD5_SLOTS_AVX2 ? 1 : 2); i++) {
1794 + pmd5_init_all(MD5P8_Contexts_simd(ctx, i));
1800 +static inline void MD5P8_Update_cpp(MD5P8_CTX *ctx, const uchar *input, uint32 length)
1802 + int slots = pmd5_slots();
1805 + if ((ctx->used) || (length < 512)) {
1806 + int cpy = MIN(length, 512 - ctx->used);
1807 + memcpy(&ctx->buffer[ctx->used], input, cpy);
1812 + if (ctx->used == 512) {
1813 + if (slots == PMD5_SLOTS_AVX2) {
1814 + const uint8_t* ptrs[PMD5_SLOTS_MAX] = {
1815 + (uint8_t*)ctx->buffer,
1816 + (uint8_t*)(ctx->buffer + 64),
1817 + (uint8_t*)(ctx->buffer + 128),
1818 + (uint8_t*)(ctx->buffer + 192),
1819 + (uint8_t*)(ctx->buffer + 256),
1820 + (uint8_t*)(ctx->buffer + 320),
1821 + (uint8_t*)(ctx->buffer + 384),
1822 + (uint8_t*)(ctx->buffer + 448)
1824 + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, 64, 0);
1826 + const uint8_t* ptrs1[PMD5_SLOTS_MAX] = {
1827 + (uint8_t*)ctx->buffer,
1828 + (uint8_t*)(ctx->buffer + 64),
1829 + (uint8_t*)(ctx->buffer + 128),
1830 + (uint8_t*)(ctx->buffer + 192)
1832 + const uint8_t* ptrs2[PMD5_SLOTS_MAX] = {
1833 + (uint8_t*)(ctx->buffer + 256),
1834 + (uint8_t*)(ctx->buffer + 320),
1835 + (uint8_t*)(ctx->buffer + 384),
1836 + (uint8_t*)(ctx->buffer + 448)
1838 + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, 64, 0);
1839 + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, 64, 0);
1845 + if (length >= 512) {
1846 + uint32 blocks = length / 512;
1847 + if (slots == PMD5_SLOTS_AVX2) {
1848 + const uint8_t* ptrs[8] = {
1849 + (uint8_t*)(input + pos),
1850 + (uint8_t*)(input + pos + 64),
1851 + (uint8_t*)(input + pos + 128),
1852 + (uint8_t*)(input + pos + 192),
1853 + (uint8_t*)(input + pos + 256),
1854 + (uint8_t*)(input + pos + 320),
1855 + (uint8_t*)(input + pos + 384),
1856 + (uint8_t*)(input + pos + 448)
1858 + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, blocks * 64, 512);
1860 + const uint8_t* ptrs1[4] = {
1861 + (uint8_t*)(input + pos),
1862 + (uint8_t*)(input + pos + 64),
1863 + (uint8_t*)(input + pos + 128),
1864 + (uint8_t*)(input + pos + 192)
1866 + const uint8_t* ptrs2[4] = {
1867 + (uint8_t*)(input + pos + 256),
1868 + (uint8_t*)(input + pos + 320),
1869 + (uint8_t*)(input + pos + 384),
1870 + (uint8_t*)(input + pos + 448)
1872 + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, blocks * 64, 512);
1873 + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, blocks * 64, 512);
1875 + pos += blocks * 512;
1876 + length -= blocks * 512;
1880 + memcpy(ctx->buffer, &input[pos], length);
1881 + ctx->used = length;
1885 +static inline void MD5P8_Final_cpp(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
1888 + uint32 low = 0, high = 0, sub = ctx->used ? 512 - ctx->used : 0;
1891 + memset(tmp, 0, 512);
1892 + MD5P8_Update(ctx, tmp, 512 - ctx->used);
1895 + uchar state[34*4] = {0};
1898 + for (i = 0; i < 8; i++) {
1899 + if (pmd5_slots() == PMD5_SLOTS_AVX2) {
1900 + pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
1901 + } else if (i < 4) {
1902 + pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
1904 + pmd5_to_md5(MD5P8_Contexts_simd(ctx, 1), &tmp, i - 4);
1907 + if (low + tmp.Nl < low) high++;
1911 + if (low + tmp.totalN < low) high++;
1912 + low += tmp.totalN;
1913 + high += tmp.totalN2;
1915 + SIVALu(state, i*16, tmp.A);
1916 + SIVALu(state, i*16 + 4, tmp.B);
1917 + SIVALu(state, i*16 + 8, tmp.C);
1918 + SIVALu(state, i*16 + 12, tmp.D);
1921 +#ifndef USE_OPENSSL
1922 + high = (low >> 29) | (high << 3);
1927 + if (low - sub > low) high--;
1930 + SIVALu(state, 32*4, low);
1931 + SIVALu(state, 33*4, high);
1935 + MD5_Update(&md, state, 34*4);
1936 + MD5_Final(digest, &md);
1941 +int md5_parallel_slots()
1943 + return md5_parallel_slots_cpp();
1946 +int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
1948 + return md5_parallel_cpp(streams, buf, len, sum, pre4, post4);
1951 +void MD5P8_Init(MD5P8_CTX *ctx)
1953 + MD5P8_Init_cpp(ctx);
1956 +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
1958 + MD5P8_Update_cpp(ctx, input, length);
1961 +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
1963 + MD5P8_Final_cpp(digest, ctx);
1968 +#endif /* HAVE_SIMD */
1969 +#endif /* __cplusplus */
1970 +#endif /* __x86_64__ */