/Users/deen/code/yugabyte-db/src/yb/gutil/strings/fastmem.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2008 Google Inc. All Rights Reserved. |
2 | | // |
3 | | // The following only applies to changes made to this file as part of YugaByte development. |
4 | | // |
5 | | // Portions Copyright (c) YugaByte, Inc. |
6 | | // |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
8 | | // in compliance with the License. You may obtain a copy of the License at |
9 | | // |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // |
12 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
13 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
14 | | // or implied. See the License for the specific language governing permissions and limitations |
15 | | // under the License. |
16 | | // |
17 | | // Fast memory copying and comparison routines. |
18 | | // strings::fastmemcmp_inlined() replaces memcmp() |
19 | | // strings::memcpy_inlined() replaces memcpy() |
20 | | // strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0 |
21 | | // |
22 | | // strings::*_inlined() routines are inline versions of the |
23 | | // routines exported by this module. Sometimes using the inlined |
24 | | // versions is faster. Measure before using the inlined versions. |
25 | | // |
26 | | // Performance measurement: |
27 | | // strings::fastmemcmp_inlined |
28 | | // Analysis: memcmp, fastmemcmp_inlined, fastmemcmp |
29 | | // 2012-01-30 |
30 | | |
31 | | #ifndef YB_GUTIL_STRINGS_FASTMEM_H |
32 | | #define YB_GUTIL_STRINGS_FASTMEM_H |
33 | | |
34 | | #include <stddef.h> |
35 | | #include <stdint.h> |
36 | | #include <stdio.h> |
37 | | #include <string.h> |
38 | | |
39 | | #include "yb/gutil/integral_types.h" |
40 | | #include "yb/gutil/port.h" |
41 | | |
42 | | namespace strings { |
43 | | |
44 | | // Return true if the n bytes at a equal the n bytes at b. |
45 | | // The regions are allowed to overlap. |
46 | | // |
47 | | // The performance is similar to the performance memcmp(), but faster for |
48 | | // moderately-sized inputs, or inputs that share a common prefix and differ |
49 | | // somewhere in their last 8 bytes. Further optimizations can be added later |
50 | | // if it makes sense to do so. |
51 | 3.40G | inline bool memeq(const void* a_v, const void* b_v, size_t n) { |
52 | 3.40G | const uint8_t *a = reinterpret_cast<const uint8_t *>(a_v); |
53 | 3.40G | const uint8_t *b = reinterpret_cast<const uint8_t *>(b_v); |
54 | | |
55 | 3.40G | size_t n_rounded_down = n & ~static_cast<size_t>(7); |
56 | 3.40G | if (PREDICT_FALSE(n_rounded_down == 0)) { // n <= 7 |
57 | 369M | return memcmp(a, b, n) == 0; |
58 | 369M | } |
59 | | // n >= 8 |
60 | 3.03G | uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); |
61 | 3.03G | uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8); |
62 | 3.03G | if ((u | v) != 0) { // The first or last 8 bytes differ. |
63 | 403M | return false; |
64 | 403M | } |
65 | 2.62G | a += 8; |
66 | 2.62G | b += 8; |
67 | 2.62G | n = n_rounded_down - 8; |
68 | 2.62G | if (n > 128) { |
69 | | // As of 2012, memcmp on x86-64 uses a big unrolled loop with SSE2 |
70 | | // instructions, and while we could try to do something faster, it |
71 | | // doesn't seem worth pursuing. |
72 | 14.9M | return memcmp(a, b, n) == 0; |
73 | 14.9M | } |
74 | 4.97G | for (; 2.61G n >= 16; n -= 162.36G ) { |
75 | 2.36G | uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); |
76 | 2.36G | uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8); |
77 | 2.36G | if ((x | y) != 0) { |
78 | 2.10M | return false; |
79 | 2.10M | } |
80 | 2.36G | a += 16; |
81 | 2.36G | b += 16; |
82 | 2.36G | } |
83 | | // n must be 0 or 8 now because it was a multiple of 8 at the top of the loop. |
84 | 2.61G | return n == 0 || UNALIGNED_LOAD641.00G (a) == 1.00G UNALIGNED_LOAD641.00G (b); |
85 | 2.61G | } |
86 | | |
87 | 36.8G | inline int fastmemcmp_inlined(const void *a_void, const void *b_void, size_t n) { |
88 | 36.8G | const uint8_t *a = reinterpret_cast<const uint8_t *>(a_void); |
89 | 36.8G | const uint8_t *b = reinterpret_cast<const uint8_t *>(b_void); |
90 | | |
91 | 36.8G | if (n >= 64) { |
92 | 1.17G | return memcmp(a, b, n); |
93 | 1.17G | } |
94 | 35.6G | const void* a_limit = a + n; |
95 | 35.6G | const size_t sizeof_uint64 = sizeof(uint64); |
96 | 73.8G | while (a + sizeof_uint64 <= a_limit && |
97 | 73.8G | UNALIGNED_LOAD6467.9G (a) == 67.9G UNALIGNED_LOAD6467.9G (b)) { |
98 | 38.2G | a += sizeof_uint64; |
99 | 38.2G | b += sizeof_uint64; |
100 | 38.2G | } |
101 | 35.6G | const size_t sizeof_uint32 = sizeof(uint32); |
102 | 35.6G | if (a + sizeof_uint32 <= a_limit && |
103 | 35.6G | UNALIGNED_LOAD3233.6G (a) == 33.6G UNALIGNED_LOAD3233.6G (b)) { |
104 | 10.9G | a += sizeof_uint32; |
105 | 10.9G | b += sizeof_uint32; |
106 | 10.9G | } |
107 | 79.7G | while (a < a_limit) { |
108 | 79.1G | int d = static_cast<uint32>(*a++) - static_cast<uint32>(*b++); |
109 | 79.1G | if (d) return d35.1G ; |
110 | 79.1G | } |
111 | 559M | return 0; |
112 | 35.6G | } |
113 | | |
// The standard memcpy operation is slow for variable small sizes.
// This implementation inlines the optimal realization for sizes 1 to 16.
// To avoid code bloat don't use it in case of not performance-critical spots,
// nor when you don't expect very frequent values of size <= 16.
inline void memcpy_inlined(void *dst, const void *src, size_t size) {
  // Compiler inlines code with minimal amount of data movement when third
  // parameter of memcpy is a constant, so dispatch each small size to a
  // constant-sized memcpy call.
  switch (size) {
    case 1: memcpy(dst, src, 1); break;
    case 2: memcpy(dst, src, 2); break;
    case 3: memcpy(dst, src, 3); break;
    case 4: memcpy(dst, src, 4); break;
    case 5: memcpy(dst, src, 5); break;
    case 6: memcpy(dst, src, 6); break;
    case 7: memcpy(dst, src, 7); break;
    case 8: memcpy(dst, src, 8); break;
    case 9: memcpy(dst, src, 9); break;
    case 10: memcpy(dst, src, 10); break;
    case 11: memcpy(dst, src, 11); break;
    case 12: memcpy(dst, src, 12); break;
    case 13: memcpy(dst, src, 13); break;
    case 14: memcpy(dst, src, 14); break;
    case 15: memcpy(dst, src, 15); break;
    case 16: memcpy(dst, src, 16); break;
    // Anything larger (or size 0) takes the ordinary library call.
    default: memcpy(dst, src, size); break;
  }
}
141 | | |
142 | 187M | inline size_t MemoryDifferencePos(const void *a_void, const void *b_void, size_t n) { |
143 | 187M | constexpr size_t kUInt64Size = sizeof(uint64_t); |
144 | 187M | constexpr size_t kUInt32Size = sizeof(uint32_t); |
145 | | |
146 | 187M | const uint8_t *a = reinterpret_cast<const uint8_t*>(a_void); |
147 | 187M | const uint8_t *b = reinterpret_cast<const uint8_t*>(b_void); |
148 | | |
149 | 187M | const uint8_t* a_limit = a + n; |
150 | | |
151 | 561M | while (a + kUInt64Size <= a_limit && UNALIGNED_LOAD64541M (a) == 541M UNALIGNED_LOAD64541M (b)) { |
152 | 373M | a += kUInt64Size; |
153 | 373M | b += kUInt64Size; |
154 | 373M | } |
155 | | |
156 | 187M | if (a + kUInt32Size <= a_limit && UNALIGNED_LOAD32182M (a) == 182M UNALIGNED_LOAD32182M (b)) { |
157 | 71.8M | a += kUInt32Size; |
158 | 71.8M | b += kUInt32Size; |
159 | 71.8M | } |
160 | | |
161 | 443M | while (a < a_limit) { |
162 | 437M | if (*a != *b) { |
163 | 182M | return a - reinterpret_cast<const uint8_t*>(a_void); |
164 | 182M | } |
165 | 255M | ++a; |
166 | 255M | ++b; |
167 | 255M | } |
168 | | |
169 | 5.62M | return n; |
170 | 187M | } |
171 | | |
172 | | } // namespace strings |
173 | | |
174 | | #endif // YB_GUTIL_STRINGS_FASTMEM_H |