/Users/deen/code/yugabyte-db/src/yb/gutil/strings/fastmem.h
Line | Count | Source |
1 | | // Copyright 2008 Google Inc. All Rights Reserved. |
2 | | // |
3 | | // The following only applies to changes made to this file as part of YugaByte development. |
4 | | // |
5 | | // Portions Copyright (c) YugaByte, Inc. |
6 | | // |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
8 | | // in compliance with the License. You may obtain a copy of the License at |
9 | | // |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // |
12 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
13 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
14 | | // or implied. See the License for the specific language governing permissions and limitations |
15 | | // under the License. |
16 | | // |
17 | | // Fast memory copying and comparison routines. |
18 | | // strings::fastmemcmp_inlined() replaces memcmp() |
19 | | // strings::memcpy_inlined() replaces memcpy() |
20 | | // strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0 |
21 | | // |
22 | | // strings::*_inlined() routines are inline versions of the |
23 | | // routines exported by this module. Sometimes using the inlined |
24 | | // versions is faster. Measure before using the inlined versions. |
25 | | // |
26 | | // Performance measurement: |
27 | | // strings::fastmemcmp_inlined |
28 | | // Analysis: memcmp, fastmemcmp_inlined, fastmemcmp |
29 | | // 2012-01-30 |
30 | | |
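A minimal usage sketch of the drop-in replacements described in the comment above; KeyEquals and CompareKeys are hypothetical callers, not part of this header.

#include <algorithm>
#include <string>

#include "yb/gutil/strings/fastmem.h"

// Equality: strings::memeq(a, b, n) stands in for memcmp(a, b, n) == 0.
bool KeyEquals(const std::string& a, const std::string& b) {
  return a.size() == b.size() && strings::memeq(a.data(), b.data(), a.size());
}

// Three-way comparison: strings::fastmemcmp_inlined stands in for memcmp.
int CompareKeys(const std::string& a, const std::string& b) {
  const size_t common = std::min(a.size(), b.size());
  const int c = strings::fastmemcmp_inlined(a.data(), b.data(), common);
  if (c != 0) return c;
  return a.size() == b.size() ? 0 : (a.size() < b.size() ? -1 : 1);
}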
31 | | #ifndef YB_GUTIL_STRINGS_FASTMEM_H |
32 | | #define YB_GUTIL_STRINGS_FASTMEM_H |
33 | | |
34 | | #include <stddef.h> |
35 | | #include <stdint.h> |
36 | | #include <stdio.h> |
37 | | #include <string.h> |
38 | | |
39 | | #include "yb/gutil/integral_types.h" |
40 | | #include "yb/gutil/port.h" |
41 | | |
42 | | namespace strings { |
43 | | |
44 | | // Return true if the n bytes at a equal the n bytes at b. |
45 | | // The regions are allowed to overlap. |
46 | | // |
47 | | // The performance is similar to the performance of memcmp(), but faster for |
48 | | // moderately-sized inputs, or inputs that share a common prefix and differ |
49 | | // somewhere in their last 8 bytes. Further optimizations can be added later |
50 | | // if it makes sense to do so. |
51 | 1.40G | inline bool memeq(const void* a_v, const void* b_v, size_t n) { |
52 | 1.40G | const uint8_t *a = reinterpret_cast<const uint8_t *>(a_v); |
53 | 1.40G | const uint8_t *b = reinterpret_cast<const uint8_t *>(b_v); |
54 | | |
55 | 1.40G | size_t n_rounded_down = n & ~static_cast<size_t>(7); |
56 | 1.40G | if (PREDICT_FALSE(n_rounded_down == 0)) { // n <= 7 |
57 | 180M | return memcmp(a, b, n) == 0; |
58 | 180M | } |
59 | | // n >= 8 |
60 | 1.22G | uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); |
61 | 1.22G | uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8); |
62 | 1.22G | if ((u | v) != 0) { // The first or last 8 bytes differ. |
63 | 243M | return false; |
64 | 243M | } |
65 | 986M | a += 8; |
66 | 986M | b += 8; |
67 | 986M | n = n_rounded_down - 8; |
68 | 986M | if (n > 128) { |
69 | | // As of 2012, memcmp on x86-64 uses a big unrolled loop with SSE2 |
70 | | // instructions, and while we could try to do something faster, it |
71 | | // doesn't seem worth pursuing. |
72 | 8.09M | return memcmp(a, b, n) == 0; |
73 | 8.09M | } |
74 | 1.75G | for (; n >= 16; n -= 16) { |
75 | 773M | uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); |
76 | 773M | uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8); |
77 | 773M | if ((x | y) != 0) { |
78 | 377k | return false; |
79 | 377k | } |
80 | 773M | a += 16; |
81 | 773M | b += 16; |
82 | 773M | } |
83 | | // n must be 0 or 8 now because it was a multiple of 8 at the top of the loop. |
84 | 977M | return n == 0 || UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b); |
85 | 978M | } |
86 | | |
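A small, self-contained check of memeq()'s contract, written only as an illustration (not a test that exists in the repo): short inputs fall through to memcmp(), a difference in the last 8 bytes is caught early, and overlapping regions are permitted.

#include <cassert>

#include "yb/gutil/strings/fastmem.h"

int main() {
  const char x[] = "abcdefghijklmnop";   // 16 bytes + NUL
  const char y[] = "abcdefghijklmnoq";   // differs from x only in byte 15
  assert(strings::memeq(x, x, 16));      // identical regions
  assert(!strings::memeq(x, y, 16));     // caught by the "last 8 bytes" load
  assert(strings::memeq(x, y, 3));       // n <= 7 path delegates to memcmp
  const char r[] = "aaaaaaaaaaaaaaaa";
  assert(strings::memeq(r, r + 1, 8));   // overlapping regions are allowed
  return 0;
}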
87 | 14.5G | inline int fastmemcmp_inlined(const void *a_void, const void *b_void, size_t n) { |
88 | 14.5G | const uint8_t *a = reinterpret_cast<const uint8_t *>(a_void); |
89 | 14.5G | const uint8_t *b = reinterpret_cast<const uint8_t *>(b_void); |
90 | | |
91 | 14.5G | if (n >= 64) { |
92 | 411M | return memcmp(a, b, n); |
93 | 411M | } |
94 | 14.1G | const void* a_limit = a + n; |
95 | 14.1G | const size_t sizeof_uint64 = sizeof(uint64); |
96 | 28.1G | while (a + sizeof_uint64 <= a_limit && |
97 | 25.1G | UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b)) { |
98 | 14.0G | a += sizeof_uint64; |
99 | 14.0G | b += sizeof_uint64; |
100 | 14.0G | } |
101 | 14.1G | const size_t sizeof_uint32 = sizeof(uint32); |
102 | 14.1G | if (a + sizeof_uint32 <= a_limit && |
103 | 12.7G | UNALIGNED_LOAD32(a) == UNALIGNED_LOAD32(b)) { |
104 | 4.95G | a += sizeof_uint32; |
105 | 4.95G | b += sizeof_uint32; |
106 | 4.95G | } |
107 | 31.3G | while (a < a_limit) { |
108 | 30.8G | int d = static_cast<uint32>(*a++) - static_cast<uint32>(*b++); |
109 | 30.8G | if (d) return d; |
110 | 30.8G | } |
111 | 466M | return 0; |
112 | 14.1G | } |
113 | | |
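One way the inlined comparator might be used (SortFixedWidthKeys is a hypothetical helper, assuming every key holds at least width bytes): as a strict-weak-ordering predicate over fixed-width keys.

#include <algorithm>
#include <string>
#include <vector>

#include "yb/gutil/strings/fastmem.h"

// Sort fixed-width keys; the caller guarantees each key has >= width bytes.
void SortFixedWidthKeys(std::vector<std::string>* keys, size_t width) {
  std::sort(keys->begin(), keys->end(),
            [width](const std::string& a, const std::string& b) {
              return strings::fastmemcmp_inlined(a.data(), b.data(), width) < 0;
            });
}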
114 | | // The standard memcpy operation is slow for small, variable sizes. |
115 | | // This implementation inlines the optimal copy for sizes 1 to 16. |
116 | | // To avoid code bloat, don't use it in spots that are not performance-critical, |
117 | | // or when sizes <= 16 are not expected to be very frequent. |
118 | 103M | inline void memcpy_inlined(void *dst, const void *src, size_t size) { |
119 | | // The compiler inlines the copy with a minimal amount of data movement when |
120 | | // the third parameter of memcpy is a compile-time constant. |
121 | 103M | switch (size) { |
122 | 0 | case 1: memcpy(dst, src, 1); break; |
123 | 0 | case 2: memcpy(dst, src, 2); break; |
124 | 0 | case 3: memcpy(dst, src, 3); break; |
125 | 0 | case 4: memcpy(dst, src, 4); break; |
126 | 2.31M | case 5: memcpy(dst, src, 5); break; |
127 | 4.43M | case 6: memcpy(dst, src, 6); break; |
128 | 2.15M | case 7: memcpy(dst, src, 7); break; |
129 | 45.5M | case 8: memcpy(dst, src, 8); break; |
130 | 5.74M | case 9: memcpy(dst, src, 9); break; |
131 | 2.75M | case 10: memcpy(dst, src, 10); break; |
132 | 6.19M | case 11: memcpy(dst, src, 11); break; |
133 | 817k | case 12: memcpy(dst, src, 12); break; |
134 | 2.15M | case 13: memcpy(dst, src, 13); break; |
135 | 631k | case 14: memcpy(dst, src, 14); break; |
136 | 3.24M | case 15: memcpy(dst, src, 15); break; |
137 | 713k | case 16: memcpy(dst, src, 16); break; |
138 | 27.0M | default: memcpy(dst, src, size); break; |
139 | 103M | } |
140 | 103M | } |
141 | | |
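A sketch of the kind of hot path memcpy_inlined() is aimed at (AppendField is a hypothetical helper): copies whose size is known only at run time but is usually 16 bytes or less.

#include <cstddef>
#include <cstdint>
#include <vector>

#include "yb/gutil/strings/fastmem.h"

// Append a short, variable-length field to a flat output buffer.
void AppendField(std::vector<uint8_t>* out, const void* data, size_t size) {
  const size_t old_size = out->size();
  out->resize(old_size + size);
  strings::memcpy_inlined(out->data() + old_size, data, size);
}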
142 | 96.6M | inline size_t MemoryDifferencePos(const void *a_void, const void *b_void, size_t n) { |
143 | 96.6M | constexpr size_t kUInt64Size = sizeof(uint64_t); |
144 | 96.6M | constexpr size_t kUInt32Size = sizeof(uint32_t); |
145 | | |
146 | 96.6M | const uint8_t *a = reinterpret_cast<const uint8_t*>(a_void); |
147 | 96.6M | const uint8_t *b = reinterpret_cast<const uint8_t*>(b_void); |
148 | | |
149 | 96.6M | const uint8_t* a_limit = a + n; |
150 | | |
151 | 247M | while (a + kUInt64Size <= a_limit && UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b)) { |
152 | 150M | a += kUInt64Size; |
153 | 150M | b += kUInt64Size; |
154 | 150M | } |
155 | | |
156 | 96.6M | if (a + kUInt32Size <= a_limit && UNALIGNED_LOAD32(a) == UNALIGNED_LOAD32(b)) { |
157 | 23.5M | a += kUInt32Size; |
158 | 23.5M | b += kUInt32Size; |
159 | 23.5M | } |
160 | | |
161 | 234M | while (a < a_limit) { |
162 | 232M | if (*a != *b) { |
163 | 94.2M | return a - reinterpret_cast<const uint8_t*>(a_void); |
164 | 94.2M | } |
165 | 138M | ++a; |
166 | 138M | ++b; |
167 | 138M | } |
168 | | |
169 | 2.45M | return n; |
170 | 96.6M | } |
171 | | |
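MemoryDifferencePos() returns the index of the first differing byte, or n when the first n bytes match. A typical use (SharedPrefixLength is a hypothetical helper, not part of this header) is computing the length of the common prefix of two keys.

#include <algorithm>
#include <cstddef>
#include <string>

#include "yb/gutil/strings/fastmem.h"

// Length of the byte prefix shared by two keys.
size_t SharedPrefixLength(const std::string& a, const std::string& b) {
  const size_t n = std::min(a.size(), b.size());
  return strings::MemoryDifferencePos(a.data(), b.data(), n);
}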
172 | | } // namespace strings |
173 | | |
174 | | #endif // YB_GUTIL_STRINGS_FASTMEM_H |