YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/gutil/strings/fastmem.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2008 Google Inc. All Rights Reserved.
2
//
3
// The following only applies to changes made to this file as part of YugaByte development.
4
//
5
// Portions Copyright (c) YugaByte, Inc.
6
//
7
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
8
// in compliance with the License.  You may obtain a copy of the License at
9
//
10
// http://www.apache.org/licenses/LICENSE-2.0
11
//
12
// Unless required by applicable law or agreed to in writing, software distributed under the License
13
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
14
// or implied.  See the License for the specific language governing permissions and limitations
15
// under the License.
16
//
17
// Fast memory copying and comparison routines.
18
//   strings::fastmemcmp_inlined() replaces memcmp()
19
//   strings::memcpy_inlined() replaces memcpy()
20
//   strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0
21
//
22
// strings::*_inlined() routines are inline versions of the
23
// routines exported by this module.  Sometimes using the inlined
24
// versions is faster.  Measure before using the inlined versions.
25
//
26
// Performance measurement:
27
//   strings::fastmemcmp_inlined
28
//     Analysis: memcmp, fastmemcmp_inlined, fastmemcmp
29
//     2012-01-30
30
31
#ifndef YB_GUTIL_STRINGS_FASTMEM_H
32
#define YB_GUTIL_STRINGS_FASTMEM_H
33
34
#include <stddef.h>
35
#include <stdint.h>
36
#include <stdio.h>
37
#include <string.h>
38
39
#include "yb/gutil/integral_types.h"
40
#include "yb/gutil/port.h"
41
42
namespace strings {
43
44
// Return true if the n bytes at a equal the n bytes at b.
45
// The regions are allowed to overlap.
46
//
47
// The performance is similar to the performance memcmp(), but faster for
48
// moderately-sized inputs, or inputs that share a common prefix and differ
49
// somewhere in their last 8 bytes. Further optimizations can be added later
50
// if it makes sense to do so.
51
3.40G
inline bool memeq(const void* a_v, const void* b_v, size_t n) {
52
3.40G
  const uint8_t *a = reinterpret_cast<const uint8_t *>(a_v);
53
3.40G
  const uint8_t *b = reinterpret_cast<const uint8_t *>(b_v);
54
55
3.40G
  size_t n_rounded_down = n & ~static_cast<size_t>(7);
56
3.40G
  if (PREDICT_FALSE(n_rounded_down == 0)) {  // n <= 7
57
369M
    return memcmp(a, b, n) == 0;
58
369M
  }
59
  // n >= 8
60
3.03G
  uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
61
3.03G
  uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8);
62
3.03G
  if ((u | v) != 0) {  // The first or last 8 bytes differ.
63
403M
    return false;
64
403M
  }
65
2.62G
  a += 8;
66
2.62G
  b += 8;
67
2.62G
  n = n_rounded_down - 8;
68
2.62G
  if (n > 128) {
69
    // As of 2012, memcmp on x86-64 uses a big unrolled loop with SSE2
70
    // instructions, and while we could try to do something faster, it
71
    // doesn't seem worth pursuing.
72
14.9M
    return memcmp(a, b, n) == 0;
73
14.9M
  }
74
4.97G
  
for (; 2.61G
n >= 16;
n -= 162.36G
) {
75
2.36G
    uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
76
2.36G
    uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8);
77
2.36G
    if ((x | y) != 0) {
78
2.10M
      return false;
79
2.10M
    }
80
2.36G
    a += 16;
81
2.36G
    b += 16;
82
2.36G
  }
83
  // n must be 0 or 8 now because it was a multiple of 8 at the top of the loop.
84
2.61G
  return n == 0 || 
UNALIGNED_LOAD641.00G
(a) == 1.00G
UNALIGNED_LOAD641.00G
(b);
85
2.61G
}
86
87
36.8G
inline int fastmemcmp_inlined(const void *a_void, const void *b_void, size_t n) {
88
36.8G
  const uint8_t *a = reinterpret_cast<const uint8_t *>(a_void);
89
36.8G
  const uint8_t *b = reinterpret_cast<const uint8_t *>(b_void);
90
91
36.8G
  if (n >= 64) {
92
1.17G
    return memcmp(a, b, n);
93
1.17G
  }
94
35.6G
  const void* a_limit = a + n;
95
35.6G
  const size_t sizeof_uint64 = sizeof(uint64);
96
73.8G
  while (a + sizeof_uint64 <= a_limit &&
97
73.8G
         
UNALIGNED_LOAD6467.9G
(a) == 67.9G
UNALIGNED_LOAD6467.9G
(b)) {
98
38.2G
    a += sizeof_uint64;
99
38.2G
    b += sizeof_uint64;
100
38.2G
  }
101
35.6G
  const size_t sizeof_uint32 = sizeof(uint32);
102
35.6G
  if (a + sizeof_uint32 <= a_limit &&
103
35.6G
      
UNALIGNED_LOAD3233.6G
(a) == 33.6G
UNALIGNED_LOAD3233.6G
(b)) {
104
10.9G
    a += sizeof_uint32;
105
10.9G
    b += sizeof_uint32;
106
10.9G
  }
107
79.7G
  while (a < a_limit) {
108
79.1G
    int d = static_cast<uint32>(*a++) - static_cast<uint32>(*b++);
109
79.1G
    if (d) 
return d35.1G
;
110
79.1G
  }
111
559M
  return 0;
112
35.6G
}
113
114
// The standard memcpy operation is slow for variable small sizes.
115
// This implementation inlines the optimal realization for sizes 1 to 16.
116
// To avoid code bloat don't use it in case of not performance-critical spots,
117
// nor when you don't expect very frequent values of size <= 16.
118
247M
inline void memcpy_inlined(void *dst, const void *src, size_t size) {
119
  // Compiler inlines code with minimal amount of data movement when third
120
  // parameter of memcpy is a constant.
121
247M
  switch (size) {
122
0
    case  1: memcpy(dst, src, 1); break;
123
0
    case  2: memcpy(dst, src, 2); break;
124
0
    case  3: memcpy(dst, src, 3); break;
125
0
    case  4: memcpy(dst, src, 4); break;
126
5.32M
    case  5: memcpy(dst, src, 5); break;
127
11.2M
    case  6: memcpy(dst, src, 6); break;
128
3.63M
    case  7: memcpy(dst, src, 7); break;
129
120M
    case  8: memcpy(dst, src, 8); break;
130
11.7M
    case  9: memcpy(dst, src, 9); break;
131
5.50M
    case 10: memcpy(dst, src, 10); break;
132
16.2M
    case 11: memcpy(dst, src, 11); break;
133
1.75M
    case 12: memcpy(dst, src, 12); break;
134
2.96M
    case 13: memcpy(dst, src, 13); break;
135
1.51M
    case 14: memcpy(dst, src, 14); break;
136
6.39M
    case 15: memcpy(dst, src, 15); break;
137
1.07M
    case 16: memcpy(dst, src, 16); break;
138
59.1M
    default: memcpy(dst, src, size); break;
139
247M
  }
140
247M
}
141
142
187M
inline size_t MemoryDifferencePos(const void *a_void, const void *b_void, size_t n) {
143
187M
  constexpr size_t kUInt64Size = sizeof(uint64_t);
144
187M
  constexpr size_t kUInt32Size = sizeof(uint32_t);
145
146
187M
  const uint8_t *a = reinterpret_cast<const uint8_t*>(a_void);
147
187M
  const uint8_t *b = reinterpret_cast<const uint8_t*>(b_void);
148
149
187M
  const uint8_t* a_limit = a + n;
150
151
561M
  while (a + kUInt64Size <= a_limit && 
UNALIGNED_LOAD64541M
(a) == 541M
UNALIGNED_LOAD64541M
(b)) {
152
373M
    a += kUInt64Size;
153
373M
    b += kUInt64Size;
154
373M
  }
155
156
187M
  if (a + kUInt32Size <= a_limit && 
UNALIGNED_LOAD32182M
(a) == 182M
UNALIGNED_LOAD32182M
(b)) {
157
71.8M
    a += kUInt32Size;
158
71.8M
    b += kUInt32Size;
159
71.8M
  }
160
161
443M
  while (a < a_limit) {
162
437M
    if (*a != *b) {
163
182M
      return a - reinterpret_cast<const uint8_t*>(a_void);
164
182M
    }
165
255M
    ++a;
166
255M
    ++b;
167
255M
  }
168
169
5.62M
  return n;
170
187M
}
171
172
}  // namespace strings
173
174
#endif // YB_GUTIL_STRINGS_FASTMEM_H