/*
 * SSE2/SSSE3/AVX2-optimized routines to support checksumming of bytes.
 *
 * Copyright (C) 1996 Andrew Tridgell
 * Copyright (C) 1996 Paul Mackerras
 * Copyright (C) 2004-2020 Wayne Davison
 * Copyright (C) 2020 Jorrit Jongma
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, visit the http://fsf.org website.
 */
/*
 * Optimization target for get_checksum1() was the Intel Atom D2700, the
 * slowest CPU in the test set and the most likely to be CPU-limited during
 * transfers. The combination of intrinsics was chosen specifically for the
 * most gain on that CPU; other combinations were occasionally slightly
 * faster on the others.
 *
 * While on more modern CPUs transfers are less likely to be CPU-limited
 * (at least by this specific function), lower CPU usage is always better.
 * Improvements may still be seen when matching chunks from NVMe storage
 * even on newer CPUs.
 *
 * Benchmarks (in MB/s)            C    SSE2   SSSE3    AVX2
 * - Intel Atom D2700            550     750    1000     N/A
 * - Intel i7-7700hq            1850    2550    4050    6200
 * - AMD ThreadRipper 2950x     2900    5600    8950    8100
 *
 * Curiously the AMD is slower with AVX2 than with SSSE3, while the Intel is
 * significantly faster. AVX2 is kept because it's more likely to relieve
 * the bottleneck on the slower CPU.
 *
 * This optimization for get_checksum1() is intentionally limited to x86-64,
 * as no 32-bit CPU was available for testing. As 32-bit CPUs only have half
 * the available xmm registers, this optimized version may not be faster than
 * the pure C version anyway. Note that all x86-64 CPUs support at least SSE2.
 *
 * This file is compiled using GCC 4.8+'s C++ front end to allow the use of
 * the target attribute, selecting the fastest code path based on runtime
 * detection of CPU capabilities.
 */
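
/*
 * A minimal sketch of that mechanism (a hypothetical function, not part of
 * this file): declaring several definitions of one function with different
 * target attributes makes GCC emit each variant plus an ifunc resolver that
 * picks the best match for the running CPU.
 *
 *   __attribute__ ((target("default"))) static int fn() { return 0; }
 *   __attribute__ ((target("ssse3")))   static int fn() { return 1; }
 *   __attribute__ ((target("avx2")))    static int fn() { return 2; }
 *   // fn() transparently resolves to the most capable variant at load time
 */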

#ifdef __x86_64__
#ifdef __cplusplus

#include "rsync.h"

#ifdef HAVE_SIMD

#include <immintrin.h>

/* Compatibility functions to let our SSSE3 algorithm run on SSE2 */

__attribute__ ((target("sse2"))) static inline __m128i sse_interleave_odd_epi16(__m128i a, __m128i b) {
    return _mm_packs_epi32(
        _mm_srai_epi32(a, 16),
        _mm_srai_epi32(b, 16)
    );
}

__attribute__ ((target("sse2"))) static inline __m128i sse_interleave_even_epi16(__m128i a, __m128i b) {
    return sse_interleave_odd_epi16(
        _mm_slli_si128(a, 2),
        _mm_slli_si128(b, 2)
    );
}

__attribute__ ((target("sse2"))) static inline __m128i sse_mulu_odd_epi8(__m128i a, __m128i b) {
    return _mm_mullo_epi16(
        _mm_srli_epi16(a, 8),
        _mm_srai_epi16(b, 8)
    );
}

__attribute__ ((target("sse2"))) static inline __m128i sse_mulu_even_epi8(__m128i a, __m128i b) {
    return _mm_mullo_epi16(
        _mm_and_si128(a, _mm_set1_epi16(0xFF)),
        _mm_srai_epi16(_mm_slli_si128(b, 1), 8)
    );
}

__attribute__ ((target("sse2"))) static inline __m128i sse_hadds_epi16(__m128i a, __m128i b) {
    return _mm_adds_epi16(
        sse_interleave_even_epi16(a, b),
        sse_interleave_odd_epi16(a, b)
    );
}

__attribute__ ((target("ssse3"))) static inline __m128i sse_hadds_epi16(__m128i a, __m128i b) {
    return _mm_hadds_epi16(a, b);
}

__attribute__ ((target("sse2"))) static inline __m128i sse_maddubs_epi16(__m128i a, __m128i b) {
    return _mm_adds_epi16(
        sse_mulu_even_epi8(a, b),
        sse_mulu_odd_epi8(a, b)
    );
}

__attribute__ ((target("ssse3"))) static inline __m128i sse_maddubs_epi16(__m128i a, __m128i b) {
    return _mm_maddubs_epi16(a, b);
}
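
/*
 * For reference, the scalar semantics of the two primitives emulated above,
 * per output int16 lane j (a sketch of the Intel definitions):
 *   hadds:   r[j] = sat16(x[2j] + x[2j+1]), over the inputs a,b concatenated
 *   maddubs: r[j] = sat16(a[2j]*b[2j] + a[2j+1]*b[2j+1]),
 *            with a read as unsigned bytes and b as signed bytes.
 */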

/* These don't actually get called, but we need to define them. */
__attribute__ ((target("default"))) static inline __m128i sse_interleave_odd_epi16(__m128i a, __m128i b) { return a; }
__attribute__ ((target("default"))) static inline __m128i sse_interleave_even_epi16(__m128i a, __m128i b) { return a; }
__attribute__ ((target("default"))) static inline __m128i sse_mulu_odd_epi8(__m128i a, __m128i b) { return a; }
__attribute__ ((target("default"))) static inline __m128i sse_mulu_even_epi8(__m128i a, __m128i b) { return a; }
__attribute__ ((target("default"))) static inline __m128i sse_hadds_epi16(__m128i a, __m128i b) { return a; }
__attribute__ ((target("default"))) static inline __m128i sse_maddubs_epi16(__m128i a, __m128i b) { return a; }

/*
  Original loop per 4 bytes:
    s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
    s1 += buf[i] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET;

  SSE2/SSSE3 loop per 32 bytes:
    int16 t1[8];
    int16 t2[8];
    for (int j = 0; j < 8; j++) {
      t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
      t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
    }
    s2 += 32*s1 + (uint32)(
              28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6] +
              t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
          ) + 528*CHAR_OFFSET;
    s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]) +
          32*CHAR_OFFSET;
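
  The CHAR_OFFSET constants follow from unrolling the per-byte recurrence
  (s1 += buf[i]+CHAR_OFFSET; s2 += s1;) 32 times: s1 gains 32*CHAR_OFFSET,
  and s2 gains (1+2+...+32)*CHAR_OFFSET = 528*CHAR_OFFSET.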

  Both the sse2 and ssse3 targets must be specified here or we lose a lot of
  performance, possibly due to not unrolling+inlining the called targeted
  functions.
 */
__attribute__ ((target("sse2", "ssse3"))) static int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
    if (len > 32) {
        int aligned = ((uintptr_t)buf & 15) == 0;

        uint32 x[4] = {0};
        x[0] = *ps1;
        __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
        x[0] = *ps2;
        __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);

        const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
        __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);

        for (; i < (len-32); i+=32) {
            // Load ... 2*[int8*16]
            // SSSE3 has _mm_lddqu_si128, but this requires another
            // target function for each of the SSE2 and SSSE3 loads. For
            // reasons unknown (to me) we lose about 10% performance on some
            // CPUs if we do that right here. We just use _mm_loadu_si128, as
            // for all but a handful of specific old CPUs the two are
            // synonymous, and take the 1-5% hit on those specific CPUs where
            // it isn't.
            __m128i in8_1, in8_2;
            if (!aligned) {
                in8_1 = _mm_loadu_si128((__m128i_u*)&buf[i]);
                in8_2 = _mm_loadu_si128((__m128i_u*)&buf[i + 16]);
            } else {
                in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
                in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
            }

            // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
            // Fastest, even though it multiplies by 1
            __m128i mul_one = _mm_set1_epi8(1);
            __m128i add16_1 = sse_maddubs_epi16(mul_one, in8_1);
            __m128i add16_2 = sse_maddubs_epi16(mul_one, in8_2);

            // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
            __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
            __m128i mul_add16_1 = sse_maddubs_epi16(mul_const, in8_1);
            __m128i mul_add16_2 = sse_maddubs_epi16(mul_const, in8_2);

            // s2 += 32*s1
            ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));

            // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
            // Shifting left, then shifting right again and shuffling (rather than just
            // shifting right as with mul32 below) to cheaply end up with the correct sign
            // extension as we go from int16 to int32.
            __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
            sum_add32 = _mm_srai_epi32(sum_add32, 16);
            sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
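            // A worked example of the reduction above, on int16 lanes
            // [a0..a7]: after the first shift-add lane j holds a[j-1]+a[j],
            // after the second a[j-3]+..+a[j], after the third a[j-7]+..+a[j],
            // so lane 7 holds the full sum. The arithmetic shift then leaves
            // that sum, sign-extended, in int32 element 3, and the shuffle
            // moves it to element 0.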

            // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
            __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
            sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
            sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);

            // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
            ss1 = _mm_add_epi32(ss1, sum_add32);

            // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
            ss2 = _mm_add_epi32(ss2, sum_mul_add32);

            // [t1[0] + t1[1], t1[2] + t1[3], ...] [int16*8]
            // We could've combined this with generating sum_add32 above and
            // saved an instruction, but benchmarking shows that to be slower
            __m128i add16 = sse_hadds_epi16(add16_1, add16_2);

            // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
            __m128i mul32 = _mm_madd_epi16(add16, mul_t1);

            // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
            mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
            mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));

            // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
            ss2 = _mm_add_epi32(ss2, mul32);

#if CHAR_OFFSET != 0
            // s1 += 32*CHAR_OFFSET
            __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
            ss1 = _mm_add_epi32(ss1, char_offset_multiplier);

            // s2 += 528*CHAR_OFFSET
            char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
            ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
#endif
        }

        _mm_storeu_si128((__m128i_u*)x, ss1);
        *ps1 = x[0];
        _mm_storeu_si128((__m128i_u*)x, ss2);
        *ps2 = x[0];
    }
    return i;
}

/*
  AVX2 loop per 64 bytes:
    int16 t1[16];
    int16 t2[16];
    for (int j = 0; j < 16; j++) {
      t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
      t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
    }
    s2 += 64*s1 + (uint32)(
              60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] +
              28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14] +
              t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] +
              t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
          ) + 2080*CHAR_OFFSET;
    s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] +
                   t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]) +
          64*CHAR_OFFSET;
 */
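
/*
  As in the 32-byte loop, the CHAR_OFFSET constants come from unrolling the
  per-byte recurrence 64 times: s1 gains 64*CHAR_OFFSET and s2 gains
  (1+2+...+64)*CHAR_OFFSET = 2080*CHAR_OFFSET.
 */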
__attribute__ ((target("avx2"))) static int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
    if (len > 64) {
        // Instructions are reshuffled compared to the SSE2 version for
        // slightly better performance
        int aligned = ((uintptr_t)buf & 31) == 0;

        uint32 x[8] = {0};
        x[0] = *ps1;
        __m256i ss1 = _mm256_lddqu_si256((__m256i_u*)x);
        x[0] = *ps2;
        __m256i ss2 = _mm256_lddqu_si256((__m256i_u*)x);

        // The coefficient order is shuffled compared to the SSE2 version:
        // _mm256_hadds_epi16 works within each 128-bit lane, so the
        // multipliers must follow the lane-interleaved order of add16 below.
        const int16 mul_t1_buf[16] = {60, 56, 52, 48, 28, 24, 20, 16, 44, 40, 36, 32, 12, 8, 4, 0};
        __m256i mul_t1 = _mm256_lddqu_si256((__m256i_u*)mul_t1_buf);

        for (; i < (len-64); i+=64) {
            // Load ... 2*[int8*32]
            __m256i in8_1, in8_2;
            if (!aligned) {
                in8_1 = _mm256_lddqu_si256((__m256i_u*)&buf[i]);
                in8_2 = _mm256_lddqu_si256((__m256i_u*)&buf[i + 32]);
            } else {
                in8_1 = _mm256_load_si256((__m256i_u*)&buf[i]);
                in8_2 = _mm256_load_si256((__m256i_u*)&buf[i + 32]);
            }

            // Prefetch for the next loops. This has no observable effect on
            // the tested AMD but makes as much as 20% difference on the
            // Intel. Curiously that same Intel sees no benefit from this
            // with the SSE2 code.
            _mm_prefetch(&buf[i + 64], _MM_HINT_T0);
            _mm_prefetch(&buf[i + 96], _MM_HINT_T0);
            _mm_prefetch(&buf[i + 128], _MM_HINT_T0);
            _mm_prefetch(&buf[i + 160], _MM_HINT_T0);

            // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*16]
            // Fastest, even though it multiplies by 1
            __m256i mul_one = _mm256_set1_epi8(1);
            __m256i add16_1 = _mm256_maddubs_epi16(mul_one, in8_1);
            __m256i add16_2 = _mm256_maddubs_epi16(mul_one, in8_2);

            // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + 1*buf[i+3]), ... 2*[int16*16]
            __m256i mul_const = _mm256_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
            __m256i mul_add16_1 = _mm256_maddubs_epi16(mul_const, in8_1);
            __m256i mul_add16_2 = _mm256_maddubs_epi16(mul_const, in8_2);

            // s2 += 64*s1
            ss2 = _mm256_add_epi32(ss2, _mm256_slli_epi32(ss1, 6));

            // [t1[0] + t1[1], t1[2] + t1[3], ...] [int16*16]
            __m256i add16 = _mm256_hadds_epi16(add16_1, add16_2);

            // [t1[0], t1[1], ...] -> [t1[0]*60 + t1[1]*56, ...] [int32*8]
            __m256i mul32 = _mm256_madd_epi16(add16, mul_t1);

            // [sum(t1[0]..t1[15]), X, X, X, X, X, X, X] [int32*8]
            __m256i sum_add32 = _mm256_add_epi16(add16_1, add16_2);
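            // _mm256_permute4x64_epi64 with control 2 + (3 << 2) + (0 << 4) +
            // (1 << 6) (qword order 2,3,0,1) swaps the 128-bit halves, so the
            // in-lane shift-adds below also accumulate across both lanes.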
            sum_add32 = _mm256_add_epi16(sum_add32, _mm256_permute4x64_epi64(sum_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
            sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 2));
            sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 4));
            sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 8));
            sum_add32 = _mm256_srai_epi32(sum_add32, 16);
            sum_add32 = _mm256_shuffle_epi32(sum_add32, 3);

            // s1 += t1[0] + t1[1] + ... + t1[15]
            ss1 = _mm256_add_epi32(ss1, sum_add32);

            // [sum(t2[0]..t2[15]), X, X, X, X, X, X, X] [int32*8]
            __m256i sum_mul_add32 = _mm256_add_epi16(mul_add16_1, mul_add16_2);
            sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_permute4x64_epi64(sum_mul_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
            sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 2));
            sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 4));
            sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 8));
            sum_mul_add32 = _mm256_srai_epi32(sum_mul_add32, 16);
            sum_mul_add32 = _mm256_shuffle_epi32(sum_mul_add32, 3);

            // s2 += t2[0] + t2[1] + ... + t2[15]
            ss2 = _mm256_add_epi32(ss2, sum_mul_add32);

            // [sum(mul32), X, X, X, X, X, X, X] [int32*8]
            mul32 = _mm256_add_epi32(mul32, _mm256_permute2x128_si256(mul32, mul32, 1));
            mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 4));
            mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 8));

            // s2 += 60*t1[0] + 56*t1[1] + ... + 4*t1[14]
            ss2 = _mm256_add_epi32(ss2, mul32);

#if CHAR_OFFSET != 0
            // s1 += 64*CHAR_OFFSET
            __m256i char_offset_multiplier = _mm256_set1_epi32(64 * CHAR_OFFSET);
            ss1 = _mm256_add_epi32(ss1, char_offset_multiplier);

            // s2 += 2080*CHAR_OFFSET
            char_offset_multiplier = _mm256_set1_epi32(2080 * CHAR_OFFSET);
            ss2 = _mm256_add_epi32(ss2, char_offset_multiplier);
#endif
        }

        _mm256_storeu_si256((__m256i_u*)x, ss1);
        *ps1 = x[0];
        _mm256_storeu_si256((__m256i_u*)x, ss2);
        *ps2 = x[0];
    }
    return i;
}

__attribute__ ((target("default"))) static int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
    return i;
}

__attribute__ ((target("default"))) static int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
    return i;
}

static inline int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
    uint32 s1 = *ps1;
    uint32 s2 = *ps2;
    for (; i < (len-4); i+=4) {
        s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
        s1 += (buf[i+0] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET);
    }
    for (; i < len; i++) {
        s1 += (buf[i]+CHAR_OFFSET); s2 += s1;
    }
    *ps1 = s1;
    *ps2 = s2;
    return i;
}

uint32 get_checksum1(char *buf1, int32 len) {
    int32 i = 0;
    uint32 s1 = 0;
    uint32 s2 = 0;

    // multiples of 64 bytes using AVX2 (if available)
    i = get_checksum1_avx2_64((schar*)buf1, len, i, &s1, &s2);

    // multiples of 32 bytes using SSE2/SSSE3 (if available)
    i = get_checksum1_sse2_32((schar*)buf1, len, i, &s1, &s2);

    // whatever bytes remain
    i = get_checksum1_default_1((schar*)buf1, len, i, &s1, &s2);

    return (s1 & 0xffff) + (s2 << 16);
}
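
/*
 * A hypothetical sanity check of the composition above (not part of rsync):
 * with CHAR_OFFSET == 0, a two-byte buffer {1, 2} gives s1 = 1+2 = 3 and
 * s2 = 1 + (1+2) = 4, so get_checksum1 returns (3 & 0xffff) + (4 << 16).
 */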

#endif /* HAVE_SIMD */
#endif /* __cplusplus */
#endif /* __x86_64__ */