YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/gutil/strings/fastmem.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2008 Google Inc. All Rights Reserved.
2
//
3
// The following only applies to changes made to this file as part of YugaByte development.
4
//
5
// Portions Copyright (c) YugaByte, Inc.
6
//
7
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
8
// in compliance with the License.  You may obtain a copy of the License at
9
//
10
// http://www.apache.org/licenses/LICENSE-2.0
11
//
12
// Unless required by applicable law or agreed to in writing, software distributed under the License
13
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
14
// or implied.  See the License for the specific language governing permissions and limitations
15
// under the License.
16
//
17
// Fast memory copying and comparison routines.
18
//   strings::fastmemcmp_inlined() replaces memcmp()
19
//   strings::memcpy_inlined() replaces memcpy()
20
//   strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0
21
//
22
// strings::*_inlined() routines are inline versions of the
23
// routines exported by this module.  Sometimes using the inlined
24
// versions is faster.  Measure before using the inlined versions.
25
//
26
// Performance measurement:
27
//   strings::fastmemcmp_inlined
28
//     Analysis: memcmp, fastmemcmp_inlined, fastmemcmp
29
//     2012-01-30
30
31
#ifndef YB_GUTIL_STRINGS_FASTMEM_H
32
#define YB_GUTIL_STRINGS_FASTMEM_H
33
34
#include <stddef.h>
35
#include <stdint.h>
36
#include <stdio.h>
37
#include <string.h>
38
39
#include "yb/gutil/integral_types.h"
40
#include "yb/gutil/port.h"
41
42
namespace strings {
43
44
// Return true if the n bytes at a equal the n bytes at b.
45
// The regions are allowed to overlap.
46
//
47
// The performance is similar to the performance memcmp(), but faster for
48
// moderately-sized inputs, or inputs that share a common prefix and differ
49
// somewhere in their last 8 bytes. Further optimizations can be added later
50
// if it makes sense to do so.
51
1.40G
inline bool memeq(const void* a_v, const void* b_v, size_t n) {
52
1.40G
  const uint8_t *a = reinterpret_cast<const uint8_t *>(a_v);
53
1.40G
  const uint8_t *b = reinterpret_cast<const uint8_t *>(b_v);
54
55
1.40G
  size_t n_rounded_down = n & ~static_cast<size_t>(7);
56
1.40G
  if (PREDICT_FALSE(n_rounded_down == 0)) {  // n <= 7
57
180M
    return memcmp(a, b, n) == 0;
58
180M
  }
59
  // n >= 8
60
1.22G
  uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
61
1.22G
  uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8);
62
1.22G
  if ((u | v) != 0) {  // The first or last 8 bytes differ.
63
243M
    return false;
64
243M
  }
65
986M
  a += 8;
66
986M
  b += 8;
67
986M
  n = n_rounded_down - 8;
68
986M
  if (n > 128) {
69
    // As of 2012, memcmp on x86-64 uses a big unrolled loop with SSE2
70
    // instructions, and while we could try to do something faster, it
71
    // doesn't seem worth pursuing.
72
8.09M
    return memcmp(a, b, n) == 0;
73
8.09M
  }
74
1.75G
  for (; n >= 16; n -= 16) {
75
773M
    uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
76
773M
    uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8);
77
773M
    if ((x | y) != 0) {
78
377k
      return false;
79
377k
    }
80
773M
    a += 16;
81
773M
    b += 16;
82
773M
  }
83
  // n must be 0 or 8 now because it was a multiple of 8 at the top of the loop.
84
977M
  return n == 0 || UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b);
85
978M
}
86
87
14.5G
inline int fastmemcmp_inlined(const void *a_void, const void *b_void, size_t n) {
88
14.5G
  const uint8_t *a = reinterpret_cast<const uint8_t *>(a_void);
89
14.5G
  const uint8_t *b = reinterpret_cast<const uint8_t *>(b_void);
90
91
14.5G
  if (n >= 64) {
92
411M
    return memcmp(a, b, n);
93
411M
  }
94
14.1G
  const void* a_limit = a + n;
95
14.1G
  const size_t sizeof_uint64 = sizeof(uint64);
96
28.1G
  while (a + sizeof_uint64 <= a_limit &&
97
25.1G
         UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b)) {
98
14.0G
    a += sizeof_uint64;
99
14.0G
    b += sizeof_uint64;
100
14.0G
  }
101
14.1G
  const size_t sizeof_uint32 = sizeof(uint32);
102
14.1G
  if (a + sizeof_uint32 <= a_limit &&
103
12.7G
      UNALIGNED_LOAD32(a) == UNALIGNED_LOAD32(b)) {
104
4.95G
    a += sizeof_uint32;
105
4.95G
    b += sizeof_uint32;
106
4.95G
  }
107
31.3G
  while (a < a_limit) {
108
30.8G
    int d = static_cast<uint32>(*a++) - static_cast<uint32>(*b++);
109
30.8G
    if (d) return d;
110
30.8G
  }
111
466M
  return 0;
112
14.1G
}
113
114
// The standard memcpy operation is slow for variable small sizes.
115
// This implementation inlines the optimal realization for sizes 1 to 16.
116
// To avoid code bloat don't use it in case of not performance-critical spots,
117
// nor when you don't expect very frequent values of size <= 16.
118
103M
inline void memcpy_inlined(void *dst, const void *src, size_t size) {
119
  // Compiler inlines code with minimal amount of data movement when third
120
  // parameter of memcpy is a constant.
121
103M
  switch (size) {
122
0
    case  1: memcpy(dst, src, 1); break;
123
0
    case  2: memcpy(dst, src, 2); break;
124
0
    case  3: memcpy(dst, src, 3); break;
125
0
    case  4: memcpy(dst, src, 4); break;
126
2.31M
    case  5: memcpy(dst, src, 5); break;
127
4.43M
    case  6: memcpy(dst, src, 6); break;
128
2.15M
    case  7: memcpy(dst, src, 7); break;
129
45.5M
    case  8: memcpy(dst, src, 8); break;
130
5.74M
    case  9: memcpy(dst, src, 9); break;
131
2.75M
    case 10: memcpy(dst, src, 10); break;
132
6.19M
    case 11: memcpy(dst, src, 11); break;
133
817k
    case 12: memcpy(dst, src, 12); break;
134
2.15M
    case 13: memcpy(dst, src, 13); break;
135
631k
    case 14: memcpy(dst, src, 14); break;
136
3.24M
    case 15: memcpy(dst, src, 15); break;
137
713k
    case 16: memcpy(dst, src, 16); break;
138
27.0M
    default: memcpy(dst, src, size); break;
139
103M
  }
140
103M
}
141
142
96.6M
inline size_t MemoryDifferencePos(const void *a_void, const void *b_void, size_t n) {
143
96.6M
  constexpr size_t kUInt64Size = sizeof(uint64_t);
144
96.6M
  constexpr size_t kUInt32Size = sizeof(uint32_t);
145
146
96.6M
  const uint8_t *a = reinterpret_cast<const uint8_t*>(a_void);
147
96.6M
  const uint8_t *b = reinterpret_cast<const uint8_t*>(b_void);
148
149
96.6M
  const uint8_t* a_limit = a + n;
150
151
247M
  while (a + kUInt64Size <= a_limit && UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b)) {
152
150M
    a += kUInt64Size;
153
150M
    b += kUInt64Size;
154
150M
  }
155
156
96.6M
  if (a + kUInt32Size <= a_limit && UNALIGNED_LOAD32(a) == UNALIGNED_LOAD32(b)) {
157
23.5M
    a += kUInt32Size;
158
23.5M
    b += kUInt32Size;
159
23.5M
  }
160
161
234M
  while (a < a_limit) {
162
232M
    if (*a != *b) {
163
94.2M
      return a - reinterpret_cast<const uint8_t*>(a_void);
164
94.2M
    }
165
138M
    ++a;
166
138M
    ++b;
167
138M
  }
168
169
2.45M
  return n;
170
96.6M
}
171
172
}  // namespace strings
173
174
#endif // YB_GUTIL_STRINGS_FASTMEM_H