simd-checksum-x86_64.cpp

   1 /*
   2  * SSE2/SSSE3/AVX2-optimized routines to support checksumming of bytes.
   3  *
   4  * Copyright (C) 1996 Andrew Tridgell
   5  * Copyright (C) 1996 Paul Mackerras
   6  * Copyright (C) 2004-2020 Wayne Davison
   7  * Copyright (C) 2020 Jorrit Jongma
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 3 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, visit the http://fsf.org website.
  21  */
  22 /*
  23  * Optimization target for get_checksum1() was the Intel Atom D2700, the
  24  * slowest CPU in the test set and the most likely to be CPU limited during
  25  * transfers. The combination of intrinsics was chosen specifically for the
  26  * most gain on that CPU, other combinations were occasionally slightly
  27  * faster on the others.
  28  *
  29  * While on more modern CPUs transfers are less likely to be CPU limited
  30  * (at least by this specific function), lower CPU usage is always better.
  31  * Improvements may still be seen when matching chunks from NVMe storage
  32  * even on newer CPUs.
  33  *
  34  * Benchmarks (in MB/s)            C    SSE2   SSSE3    AVX2
  35  * - Intel Atom D2700            550     750    1000     N/A
  36  * - Intel i7-7700hq            1850    2550    4050    6200
  37  * - AMD ThreadRipper 2950x     2900    5600    8950    8100
  38  *
  39  * Curiously the AMD is slower with AVX2 than SSSE3, while the Intel is
  40  * significantly faster. AVX2 is kept because it's more likely to relieve
  41  * the bottleneck on the slower CPU.
  42  *
  43  * This optimization for get_checksum1() is intentionally limited to x86-64
  44  * as no 32-bit CPU was available for testing. As 32-bit CPUs only have half
  45  * the available xmm registers, this optimized version may not be faster than
  46  * the pure C version anyway. Note that all x86-64 CPUs support at least SSE2.
  47  *
  48  * This file is compiled using GCC 4.8+/clang 6+'s C++ front end to allow the
  49  * use of the target attribute, selecting the fastest code path based on
  50  * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
  51  * GCC 4.x are not supported to ease configure.ac logic.
  52  */
  53
  54 #ifdef __x86_64__
  55 #ifdef __cplusplus
  56
  57 #include "rsync.h"
  58
  59 #ifdef HAVE_SIMD
  60
  61 #include <immintrin.h>
  62
  63 /* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
  64 #ifdef __clang__
  65 #define MVSTATIC
  66 #else
  67 #define MVSTATIC static
  68 #endif
  69
  70 // Missing from the headers on gcc 6 and older, clang 8 and older
  71 typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
  72 typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
  73
  74 /* Compatibility macros to let our SSSE3 algorithm run with only SSE2.
  75    These used to be neat individual functions with target attributes switching between SSE2 and SSSE3 implementations
  76    as needed, but though this works perfectly with GCC, clang fails to inline those properly leading to a near 50%
  77    performance drop - combined with static and inline modifiers gets you linker errors and even compiler crashes...
  78 */
  79
  80 #define SSE2_INTERLEAVE_ODD_EPI16(a, b) _mm_packs_epi32(_mm_srai_epi32(a, 16), _mm_srai_epi32(b, 16))
  81 #define SSE2_INTERLEAVE_EVEN_EPI16(a, b) SSE2_INTERLEAVE_ODD_EPI16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2))
  82 #define SSE2_MULU_ODD_EPI8(a, b) _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srai_epi16(b, 8))
  83 #define SSE2_MULU_EVEN_EPI8(a, b) _mm_mullo_epi16(_mm_and_si128(a, _mm_set1_epi16(0xFF)), _mm_srai_epi16(_mm_slli_si128(b, 1), 8))
  84
  85 #define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b))
  86 #define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b))
  87
  88 __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
  89 __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
  90 __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
  91
  92 /*
  93   Original loop per 4 bytes:
  94     s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
  95     s1 += buf[i] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET;
  96
  97   SSE2/SSSE3 loop per 32 bytes:
  98     int16 t1[8];
  99     int16 t2[8];
 100     for (int j = 0; j < 8; j++) {
 101       t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
 102       t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
 103     }
 104     s2 += 32*s1 + (uint32)(
 105               28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6] +
 106               t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
 107           ) + 528*CHAR_OFFSET;
 108     s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]) +
 109           32*CHAR_OFFSET;
 110  */
 111 __attribute__ ((target("ssse3"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
 112 {
 113     if (len > 32) {
 114         int aligned = ((uintptr_t)buf & 15) == 0;
 115
 116         uint32 x[4] = {0};
 117         x[0] = *ps1;
 118         __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
 119         x[0] = *ps2;
 120         __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);
 121
 122         const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
 123         __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);
 124
 125         for (; i < (len-32); i+=32) {
 126             // Load ... 2*[int8*16]
 127             __m128i in8_1, in8_2;
 128             if (!aligned) {
 129                 // Synonymous with _mm_loadu_si128 on all but a handful of old CPUs
 130                 in8_1 = _mm_lddqu_si128((__m128i_u*)&buf[i]);
 131                 in8_2 = _mm_lddqu_si128((__m128i_u*)&buf[i + 16]);
 132             } else {
 133                 in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
 134                 in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
 135             }
 136
 137             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2], 1*buf[i+3]), ... 2*[int16*8]
 138             // Fastest, even though multiply by 1
 139             __m128i mul_one = _mm_set1_epi8(1);
 140             __m128i add16_1 = _mm_maddubs_epi16(mul_one, in8_1);
 141             __m128i add16_2 = _mm_maddubs_epi16(mul_one, in8_2);
 142
 143             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8]
 144             __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
 145             __m128i mul_add16_1 = _mm_maddubs_epi16(mul_const, in8_1);
 146             __m128i mul_add16_2 = _mm_maddubs_epi16(mul_const, in8_2);
 147
 148             // s2 += 32*s1
 149             ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
 150
 151             // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 152             // Shifting left, then shifting right again and shuffling (rather than just
 153             // shifting right as with mul32 below) to cheaply end up with the correct sign
 154             // extension as we go from int16 to int32.
 155             __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
 156             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
 157             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
 158             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
 159             sum_add32 = _mm_srai_epi32(sum_add32, 16);
 160             sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
 161
 162             // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 163             __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
 164             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
 165             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
 166             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
 167             sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
 168             sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);
 169
 170             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
 171             ss1 = _mm_add_epi32(ss1, sum_add32);
 172
 173             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
 174             ss2 = _mm_add_epi32(ss2, sum_mul_add32);
 175
 176             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
 177             // We could've combined this with generating sum_add32 above and
 178             // save an instruction but benchmarking shows that as being slower
 179             __m128i add16 = _mm_hadds_epi16(add16_1, add16_2);
 180
 181             // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
 182             __m128i mul32 = _mm_madd_epi16(add16, mul_t1);
 183
 184             // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
 185             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
 186             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));
 187
 188             // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
 189             ss2 = _mm_add_epi32(ss2, mul32);
 190
 191 #if CHAR_OFFSET != 0
 192             // s1 += 32*CHAR_OFFSET
 193             __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
 194             ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
 195
 196             // s2 += 528*CHAR_OFFSET
 197             char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
 198             ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
 199 #endif
 200         }
 201
 202         _mm_store_si128((__m128i_u*)x, ss1);
 203         *ps1 = x[0];
 204         _mm_store_si128((__m128i_u*)x, ss2);
 205         *ps2 = x[0];
 206     }
 207     return i;
 208 }
 209
 210 /*
 211   Same as SSSE3 version, but using macros defined above to emulate SSSE3 calls that are not available with SSE2.
 212   For GCC-only the SSE2 and SSSE3 versions could be a single function calling other functions with the right
 213   target attributes to emulate SSSE3 calls on SSE2 if needed, but clang doesn't inline those properly leading
 214   to a near 50% performance drop.
 215  */
 216 __attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
 217 {
 218     if (len > 32) {
 219         int aligned = ((uintptr_t)buf & 15) == 0;
 220
 221         uint32 x[4] = {0};
 222         x[0] = *ps1;
 223         __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
 224         x[0] = *ps2;
 225         __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);
 226
 227         const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
 228         __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);
 229
 230         for (; i < (len-32); i+=32) {
 231             // Load ... 2*[int8*16]
 232             __m128i in8_1, in8_2;
 233             if (!aligned) {
 234                 in8_1 = _mm_loadu_si128((__m128i_u*)&buf[i]);
 235                 in8_2 = _mm_loadu_si128((__m128i_u*)&buf[i + 16]);
 236             } else {
 237                 in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
 238                 in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
 239             }
 240
 241             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2], 1*buf[i+3]), ... 2*[int16*8]
 242             // Fastest, even though multiply by 1
 243             __m128i mul_one = _mm_set1_epi8(1);
 244             __m128i add16_1 = SSE2_MADDUBS_EPI16(mul_one, in8_1);
 245             __m128i add16_2 = SSE2_MADDUBS_EPI16(mul_one, in8_2);
 246
 247             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8]
 248             __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
 249             __m128i mul_add16_1 = SSE2_MADDUBS_EPI16(mul_const, in8_1);
 250             __m128i mul_add16_2 = SSE2_MADDUBS_EPI16(mul_const, in8_2);
 251
 252             // s2 += 32*s1
 253             ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
 254
 255             // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 256             // Shifting left, then shifting right again and shuffling (rather than just
 257             // shifting right as with mul32 below) to cheaply end up with the correct sign
 258             // extension as we go from int16 to int32.
 259             __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
 260             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
 261             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
 262             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
 263             sum_add32 = _mm_srai_epi32(sum_add32, 16);
 264             sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
 265
 266             // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 267             __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
 268             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
 269             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
 270             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
 271             sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
 272             sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);
 273
 274             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
 275             ss1 = _mm_add_epi32(ss1, sum_add32);
 276
 277             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
 278             ss2 = _mm_add_epi32(ss2, sum_mul_add32);
 279
 280             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
 281             // We could've combined this with generating sum_add32 above and
 282             // save an instruction but benchmarking shows that as being slower
 283             __m128i add16 = SSE2_HADDS_EPI16(add16_1, add16_2);
 284
 285             // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
 286             __m128i mul32 = _mm_madd_epi16(add16, mul_t1);
 287
 288             // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
 289             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
 290             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));
 291
 292             // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
 293             ss2 = _mm_add_epi32(ss2, mul32);
 294
 295 #if CHAR_OFFSET != 0
 296             // s1 += 32*CHAR_OFFSET
 297             __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
 298             ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
 299
 300             // s2 += 528*CHAR_OFFSET
 301             char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
 302             ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
 303 #endif
 304         }
 305
 306         _mm_store_si128((__m128i_u*)x, ss1);
 307         *ps1 = x[0];
 308         _mm_store_si128((__m128i_u*)x, ss2);
 309         *ps2 = x[0];
 310     }
 311     return i;
 312 }
 313
 314 /*
 315   AVX2 loop per 64 bytes:
 316     int16 t1[16];
 317     int16 t2[16];
 318     for (int j = 0; j < 16; j++) {
 319       t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
 320       t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
 321     }
 322     s2 += 64*s1 + (uint32)(
 323               60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14] +
 324               t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
 325           ) + 2080*CHAR_OFFSET;
 326     s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]) +
 327           64*CHAR_OFFSET;
 328  */
 329 __attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
 330 {
 331     if (len > 64) {
 332         // Instructions reshuffled compared to SSE2 for slightly better performance
 333         int aligned = ((uintptr_t)buf & 31) == 0;
 334
 335         uint32 x[8] = {0};
 336         x[0] = *ps1;
 337         __m256i ss1 = _mm256_lddqu_si256((__m256i_u*)x);
 338         x[0] = *ps2;
 339         __m256i ss2 = _mm256_lddqu_si256((__m256i_u*)x);
 340
 341         // The order gets shuffled compared to SSE2
 342         const int16 mul_t1_buf[16] = {60, 56, 52, 48, 28, 24, 20, 16, 44, 40, 36, 32, 12, 8, 4, 0};
 343         __m256i mul_t1 = _mm256_lddqu_si256((__m256i_u*)mul_t1_buf);
 344
 345         for (; i < (len-64); i+=64) {
 346             // Load ... 2*[int8*32]
 347             __m256i in8_1, in8_2;
 348             if (!aligned) {
 349                 in8_1 = _mm256_lddqu_si256((__m256i_u*)&buf[i]);
 350                 in8_2 = _mm256_lddqu_si256((__m256i_u*)&buf[i + 32]);
 351             } else {
 352                 in8_1 = _mm256_load_si256((__m256i_u*)&buf[i]);
 353                 in8_2 = _mm256_load_si256((__m256i_u*)&buf[i + 32]);
 354             }
 355
 356             // Prefetch for next loops. This has no observable effect on the
 357             // tested AMD but makes as much as 20% difference on the Intel.
 358             // Curiously that same Intel sees no benefit from this with SSE2
 359             // or SSSE3.
 360             _mm_prefetch(&buf[i + 64], _MM_HINT_T0);
 361             _mm_prefetch(&buf[i + 96], _MM_HINT_T0);
 362             _mm_prefetch(&buf[i + 128], _MM_HINT_T0);
 363             _mm_prefetch(&buf[i + 160], _MM_HINT_T0);
 364
 365             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2], 1*buf[i+3]), ... 2*[int16*16]
 366             // Fastest, even though multiply by 1
 367             __m256i mul_one = _mm256_set1_epi8(1);
 368             __m256i add16_1 = _mm256_maddubs_epi16(mul_one, in8_1);
 369             __m256i add16_2 = _mm256_maddubs_epi16(mul_one, in8_2);
 370
 371             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*16]
 372             __m256i mul_const = _mm256_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
 373             __m256i mul_add16_1 = _mm256_maddubs_epi16(mul_const, in8_1);
 374             __m256i mul_add16_2 = _mm256_maddubs_epi16(mul_const, in8_2);
 375
 376             // s2 += 64*s1
 377             ss2 = _mm256_add_epi32(ss2, _mm256_slli_epi32(ss1, 6));
 378
 379             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*16]
 380             __m256i add16 = _mm256_hadds_epi16(add16_1, add16_2);
 381
 382             // [t1[0], t1[1], ...] -> [t1[0]*60 + t1[1]*56, ...] [int32*8]
 383             __m256i mul32 = _mm256_madd_epi16(add16, mul_t1);
 384
 385             // [sum(t1[0]..t1[15]), X, X, X, X, X, X, X] [int32*8]
 386             __m256i sum_add32 = _mm256_add_epi16(add16_1, add16_2);
 387             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_permute4x64_epi64(sum_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
 388             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 2));
 389             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 4));
 390             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 8));
 391             sum_add32 = _mm256_srai_epi32(sum_add32, 16);
 392             sum_add32 = _mm256_shuffle_epi32(sum_add32, 3);
 393
 394             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]
 395             ss1 = _mm256_add_epi32(ss1, sum_add32);
 396
 397             // [sum(t2[0]..t2[15]), X, X, X, X, X, X, X] [int32*8]
 398             __m256i sum_mul_add32 = _mm256_add_epi16(mul_add16_1, mul_add16_2);
 399             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_permute4x64_epi64(sum_mul_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
 400             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 2));
 401             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 4));
 402             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 8));
 403             sum_mul_add32 = _mm256_srai_epi32(sum_mul_add32, 16);
 404             sum_mul_add32 = _mm256_shuffle_epi32(sum_mul_add32, 3);
 405
 406             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
 407             ss2 = _mm256_add_epi32(ss2, sum_mul_add32);
 408
 409             // [sum(mul32), X, X, X, X, X, X, X] [int32*8]
 410             mul32 = _mm256_add_epi32(mul32, _mm256_permute2x128_si256(mul32, mul32, 1));
 411             mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 4));
 412             mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 8));
 413
 414             // s2 += 60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14]
 415             ss2 = _mm256_add_epi32(ss2, mul32);
 416
 417 #if CHAR_OFFSET != 0
 418             // s1 += 64*CHAR_OFFSET
 419             __m256i char_offset_multiplier = _mm256_set1_epi32(64 * CHAR_OFFSET);
 420             ss1 = _mm256_add_epi32(ss1, char_offset_multiplier);
 421
 422             // s2 += 2080*CHAR_OFFSET
 423             char_offset_multiplier = _mm256_set1_epi32(2080 * CHAR_OFFSET);
 424             ss2 = _mm256_add_epi32(ss2, char_offset_multiplier);
 425 #endif
 426         }
 427
 428         _mm256_store_si256((__m256i_u*)x, ss1);
 429         *ps1 = x[0];
 430         _mm256_store_si256((__m256i_u*)x, ss2);
 431         *ps2 = x[0];
 432     }
 433     return i;
 434 }
 435
 436 static int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
 437 {
 438     uint32 s1 = *ps1;
 439     uint32 s2 = *ps2;
 440     for (; i < (len-4); i+=4) {
 441         s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
 442         s1 += (buf[i+0] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET);
 443     }
 444     for (; i < len; i++) {
 445         s1 += (buf[i]+CHAR_OFFSET); s2 += s1;
 446     }
 447     *ps1 = s1;
 448     *ps2 = s2;
 449     return i;
 450 }
 451
 452 /* With GCC 10 putting this implementation inside 'extern "C"' causes an
 453    assembler error. That worked fine on GCC 5-9 and clang 6-10...
 454   */
 455 static inline uint32 get_checksum1_cpp(char *buf1, int32 len)
 456 {
 457     int32 i = 0;
 458     uint32 s1 = 0;
 459     uint32 s2 = 0;
 460
 461     // multiples of 64 bytes using AVX2 (if available)
 462     i = get_checksum1_avx2_64((schar*)buf1, len, i, &s1, &s2);
 463
 464     // multiples of 32 bytes using SSSE3 (if available)
 465     i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2);
 466
 467     // multiples of 32 bytes using SSE2 (if available)
 468     i = get_checksum1_sse2_32((schar*)buf1, len, i, &s1, &s2);
 469
 470     // whatever is left
 471     i = get_checksum1_default_1((schar*)buf1, len, i, &s1, &s2);
 472
 473     return (s1 & 0xffff) + (s2 << 16);
 474 }
 475
 476 extern "C" {
 477
 478 uint32 get_checksum1(char *buf1, int32 len)
 479 {
 480     return get_checksum1_cpp(buf1, len);
 481 }
 482
 483 } // extern "C"
 484
 485 #ifdef BENCHMARK_SIMD_CHECKSUM1
 486 #pragma clang optimize off
 487 #pragma GCC push_options
 488 #pragma GCC optimize ("O0")
 489
 490 #define ROUNDS 1024
 491 #define BLOCK_LEN 1024*1024
 492
 493 #ifndef CLOCK_MONOTONIC_RAW
 494 #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
 495 #endif
 496
 497 static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
 498     struct timespec start, end;
 499     uint64_t us;
 500     uint32_t cs, s1, s2;
 501     int i, next;
 502
 503     clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 504     for (i = 0; i < ROUNDS; i++) {
 505         s1 = s2 = 0;
 506         next = func((schar*)buf, len, 0, &s1, &s2);
 507         get_checksum1_default_1((schar*)buf, len, next, &s1, &s2);
 508     }
 509     clock_gettime(CLOCK_MONOTONIC_RAW, &end);
 510     us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
 511     cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
 512     printf("%-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
 513 }
 514
 515 static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
 516     uint32 cs = get_checksum1((char*)buf, len);
 517     *ps1 = cs & 0xffff;
 518     *ps2 = cs >> 16;
 519     return len;
 520 }
 521
 522 int main() {
 523     int i;
 524     unsigned char* buf = (unsigned char*)malloc(BLOCK_LEN);
 525     for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
 526
 527     benchmark("Auto", get_checksum1_auto, (schar*)buf, BLOCK_LEN);
 528     benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN);
 529     benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN);
 530     benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN);
 531     benchmark("AVX2", get_checksum1_avx2_64, (schar*)buf, BLOCK_LEN);
 532
 533     free(buf);
 534     return 0;
 535 }
 536
 537 #pragma GCC pop_options
 538 #pragma clang optimize on
 539 #endif /* BENCHMARK_SIMD_CHECKSUM1 */
 540
 541 #endif /* HAVE_SIMD */
 542 #endif /* __cplusplus */
 543 #endif /* __x86_64__ */