/Users/deen/code/yugabyte-db/src/yb/gutil/strings/fastmem.h
Line | Count | Source |
1 | | // Copyright 2008 Google Inc. All Rights Reserved. |
2 | | // |
3 | | // The following only applies to changes made to this file as part of YugaByte development. |
4 | | // |
5 | | // Portions Copyright (c) YugaByte, Inc. |
6 | | // |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
8 | | // in compliance with the License. You may obtain a copy of the License at |
9 | | // |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // |
12 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
13 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
14 | | // or implied. See the License for the specific language governing permissions and limitations |
15 | | // under the License. |
16 | | // |
17 | | // Fast memory copying and comparison routines. |
18 | | // strings::fastmemcmp_inlined() replaces memcmp() |
19 | | // strings::memcpy_inlined() replaces memcpy() |
20 | | // strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0 |
21 | | // |
22 | | // strings::*_inlined() routines are inline versions of the |
23 | | // routines exported by this module. Sometimes using the inlined |
24 | | // versions is faster. Measure before using the inlined versions. |
25 | | // |
26 | | // Performance measurement: |
27 | | // strings::fastmemcmp_inlined |
28 | | // Analysis: memcmp, fastmemcmp_inlined, fastmemcmp |
29 | | // 2012-01-30 |
30 | | |
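A minimal usage sketch of the drop-in replacements described in the comment above; KeyEquals and CompareKeys are hypothetical callers, not part of this header.

#include <algorithm>
#include <string>

#include "yb/gutil/strings/fastmem.h"

// Equality: strings::memeq(a, b, n) stands in for memcmp(a, b, n) == 0.
bool KeyEquals(const std::string& a, const std::string& b) {
  return a.size() == b.size() && strings::memeq(a.data(), b.data(), a.size());
}

// Three-way comparison: strings::fastmemcmp_inlined stands in for memcmp.
int CompareKeys(const std::string& a, const std::string& b) {
  const size_t common = std::min(a.size(), b.size());
  const int c = strings::fastmemcmp_inlined(a.data(), b.data(), common);
  if (c != 0) return c;
  return a.size() == b.size() ? 0 : (a.size() < b.size() ? -1 : 1);
}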
31 | | #ifndef YB_GUTIL_STRINGS_FASTMEM_H |
32 | | #define YB_GUTIL_STRINGS_FASTMEM_H |
33 | | |
34 | | #include <stddef.h> |
35 | | #include <stdint.h> |
36 | | #include <stdio.h> |
37 | | #include <string.h> |
38 | | |
39 | | #include "yb/gutil/integral_types.h" |
40 | | #include "yb/gutil/port.h" |
41 | | |
42 | | namespace strings { |
43 | | |
44 | | // Return true if the n bytes at a equal the n bytes at b. |
45 | | // The regions are allowed to overlap. |
46 | | // |
47 | | // The performance is similar to the performance of memcmp(), but faster for |
48 | | // moderately-sized inputs, or inputs that share a common prefix and differ |
49 | | // somewhere in their last 8 bytes. Further optimizations can be added later |
50 | | // if it makes sense to do so. |
51 | 1.40G | inline bool memeq(const void* a_v, const void* b_v, size_t n) { |
52 | 1.40G | const uint8_t *a = reinterpret_cast<const uint8_t *>(a_v); |
53 | 1.40G | const uint8_t *b = reinterpret_cast<const uint8_t *>(b_v); |
54 | | |
55 | 1.40G | size_t n_rounded_down = n & ~static_cast<size_t>(7); |
56 | 1.40G | if (PREDICT_FALSE(n_rounded_down == 0)) { // n <= 7 |
57 | 180M | return memcmp(a, b, n) == 0; |
58 | 180M | } |
59 | | // n >= 8 |
60 | 1.22G | uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); |
61 | 1.22G | uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8); |
62 | 1.22G | if ((u | v) != 0) { // The first or last 8 bytes differ. |
63 | 243M | return false; |
64 | 243M | } |
65 | 986M | a += 8; |
66 | 986M | b += 8; |
67 | 986M | n = n_rounded_down - 8; |
68 | 986M | if (n > 128) { |
69 | | // As of 2012, memcmp on x86-64 uses a big unrolled loop with SSE2 |
70 | | // instructions, and while we could try to do something faster, it |
71 | | // doesn't seem worth pursuing. |
72 | 8.09M | return memcmp(a, b, n) == 0; |
73 | 8.09M | } |
74 | 1.75G | for (; n >= 16; n -= 16) { |
75 | 773M | uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); |
76 | 773M | uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8); |
77 | 773M | if ((x | y) != 0) { |
78 | 377k | return false; |
79 | 377k | } |
80 | 773M | a += 16; |
81 | 773M | b += 16; |
82 | 773M | } |
83 | | // n must be 0 or 8 now because it was a multiple of 8 at the top of the loop. |
84 | 977M | return n == 0 || UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b); |
85 | 978M | } |
86 | | |
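A small, self-contained check of memeq()'s contract, written only as an illustration (not a test that exists in the repo): short inputs fall through to memcmp(), a difference in the last 8 bytes is caught early, and overlapping regions are permitted.

#include <cassert>

#include "yb/gutil/strings/fastmem.h"

int main() {
  const char x[] = "abcdefghijklmnop";   // 16 bytes + NUL
  const char y[] = "abcdefghijklmnoq";   // differs from x only in byte 15
  assert(strings::memeq(x, x, 16));      // identical regions
  assert(!strings::memeq(x, y, 16));     // caught by the "last 8 bytes" load
  assert(strings::memeq(x, y, 3));       // n <= 7 path delegates to memcmp
  const char r[] = "aaaaaaaaaaaaaaaa";
  assert(strings::memeq(r, r + 1, 8));   // overlapping regions are allowed
  return 0;
}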
87 | 14.5G | inline int fastmemcmp_inlined(const void *a_void, const void *b_void, size_t n) { |
88 | 14.5G | const uint8_t *a = reinterpret_cast<const uint8_t *>(a_void); |
89 | 14.5G | const uint8_t *b = reinterpret_cast<const uint8_t *>(b_void); |
90 | | |
91 | 14.5G | if (n >= 64) { |
92 | 411M | return memcmp(a, b, n); |
93 | 411M | } |
94 | 14.1G | const void* a_limit = a + n; |
95 | 14.1G | const size_t sizeof_uint64 = sizeof(uint64); |
96 | 28.1G | while (a + sizeof_uint64 <= a_limit && |
97 | 25.1G | UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b)) { |
98 | 14.0G | a += sizeof_uint64; |
99 | 14.0G | b += sizeof_uint64; |
100 | 14.0G | } |
101 | 14.1G | const size_t sizeof_uint32 = sizeof(uint32); |
102 | 14.1G | if (a + sizeof_uint32 <= a_limit && |
103 | 12.7G | UNALIGNED_LOAD32(a) == UNALIGNED_LOAD32(b)) { |
104 | 4.95G | a += sizeof_uint32; |
105 | 4.95G | b += sizeof_uint32; |
106 | 4.95G | } |
107 | 31.3G | while (a < a_limit) { |
108 | 30.8G | int d = static_cast<uint32>(*a++) - static_cast<uint32>(*b++); |
109 | 30.8G | if (d) return d; |
110 | 30.8G | } |
111 | 466M | return 0; |
112 | 14.1G | } |
113 | | |
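One way the inlined comparator might be used (SortFixedWidthKeys is a hypothetical helper, assuming every key holds at least width bytes): as a strict-weak-ordering predicate over fixed-width keys.

#include <algorithm>
#include <string>
#include <vector>

#include "yb/gutil/strings/fastmem.h"

// Sort fixed-width keys; the caller guarantees each key has >= width bytes.
void SortFixedWidthKeys(std::vector<std::string>* keys, size_t width) {
  std::sort(keys->begin(), keys->end(),
            [width](const std::string& a, const std::string& b) {
              return strings::fastmemcmp_inlined(a.data(), b.data(), width) < 0;
            });
}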
114 | | // The standard memcpy operation is slow for small, variable sizes. |
115 | | // This implementation inlines the optimal copy for sizes 1 to 16. |
116 | | // To avoid code bloat, don't use it in spots that are not performance-critical, |
117 | | // or when sizes <= 16 are not expected to be very frequent. |
118 | 103M | inline void memcpy_inlined(void *dst, const void *src, size_t size) { |
119 | | // The compiler inlines the copy with a minimal amount of data movement when |
120 | | // the third parameter of memcpy is a compile-time constant. |
121 | 103M | switch (size) { |
122 | 0 | case 1: memcpy(dst, src, 1); break; |
123 | 0 | case 2: memcpy(dst, src, 2); break; |
124 | 0 | case 3: memcpy(dst, src, 3); break; |
125 | 0 | case 4: memcpy(dst, src, 4); break; |
126 | 2.31M | case 5: memcpy(dst, src, 5); break; |
127 | 4.43M | case 6: memcpy(dst, src, 6); break; |
128 | 2.15M | case 7: memcpy(dst, src, 7); break; |
129 | 45.5M | case 8: memcpy(dst, src, 8); break; |
130 | 5.74M | case 9: memcpy(dst, src, 9); break; |
131 | 2.75M | case 10: memcpy(dst, src, 10); break; |
132 | 6.19M | case 11: memcpy(dst, src, 11); break; |
133 | 817k | case 12: memcpy(dst, src, 12); break; |
134 | 2.15M | case 13: memcpy(dst, src, 13); break; |
135 | 631k | case 14: memcpy(dst, src, 14); break; |
136 | 3.24M | case 15: memcpy(dst, src, 15); break; |
137 | 713k | case 16: memcpy(dst, src, 16); break; |
138 | 27.0M | default: memcpy(dst, src, size); break; |
139 | 103M | } |
140 | 103M | } |
141 | | |
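A sketch of the kind of hot path memcpy_inlined() is aimed at (AppendField is a hypothetical helper): copies whose size is known only at run time but is usually 16 bytes or less.

#include <cstddef>
#include <cstdint>
#include <vector>

#include "yb/gutil/strings/fastmem.h"

// Append a short, variable-length field to a flat output buffer.
void AppendField(std::vector<uint8_t>* out, const void* data, size_t size) {
  const size_t old_size = out->size();
  out->resize(old_size + size);
  strings::memcpy_inlined(out->data() + old_size, data, size);
}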
142 | 96.6M | inline size_t MemoryDifferencePos(const void *a_void, const void *b_void, size_t n) { |
143 | 96.6M | constexpr size_t kUInt64Size = sizeof(uint64_t); |
144 | 96.6M | constexpr size_t kUInt32Size = sizeof(uint32_t); |
145 | | |
146 | 96.6M | const uint8_t *a = reinterpret_cast<const uint8_t*>(a_void); |
147 | 96.6M | const uint8_t *b = reinterpret_cast<const uint8_t*>(b_void); |
148 | | |
149 | 96.6M | const uint8_t* a_limit = a + n; |
150 | | |
151 | 247M | while (a + kUInt64Size <= a_limit && UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b)) { |
152 | 150M | a += kUInt64Size; |
153 | 150M | b += kUInt64Size; |
154 | 150M | } |
155 | | |
156 | 96.6M | if (a + kUInt32Size <= a_limit && UNALIGNED_LOAD32(a) == UNALIGNED_LOAD32(b)) { |
157 | 23.5M | a += kUInt32Size; |
158 | 23.5M | b += kUInt32Size; |
159 | 23.5M | } |
160 | | |
161 | 234M | while (a < a_limit) { |
162 | 232M | if (*a != *b) { |
163 | 94.2M | return a - reinterpret_cast<const uint8_t*>(a_void); |
164 | 94.2M | } |
165 | 138M | ++a; |
166 | 138M | ++b; |
167 | 138M | } |
168 | | |
169 | 2.45M | return n; |
170 | 96.6M | } |
171 | | |
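MemoryDifferencePos() returns the index of the first differing byte, or n when the first n bytes match. A typical use (SharedPrefixLength is a hypothetical helper, not part of this header) is computing the length of the common prefix of two keys.

#include <algorithm>
#include <cstddef>
#include <string>

#include "yb/gutil/strings/fastmem.h"

// Length of the byte prefix shared by two keys.
size_t SharedPrefixLength(const std::string& a, const std::string& b) {
  const size_t n = std::min(a.size(), b.size());
  return strings::MemoryDifferencePos(a.data(), b.data(), n);
}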
172 | | } // namespace strings |
173 | | |
174 | | #endif // YB_GUTIL_STRINGS_FASTMEM_H |