/Users/deen/code/yugabyte-db/src/yb/gutil/strings/fastmem.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2008 Google Inc. All Rights Reserved. |
2 | | // |
3 | | // The following only applies to changes made to this file as part of YugaByte development. |
4 | | // |
5 | | // Portions Copyright (c) YugaByte, Inc. |
6 | | // |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
8 | | // in compliance with the License. You may obtain a copy of the License at |
9 | | // |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // |
12 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
13 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
14 | | // or implied. See the License for the specific language governing permissions and limitations |
15 | | // under the License. |
16 | | // |
17 | | // Fast memory copying and comparison routines. |
18 | | // strings::fastmemcmp_inlined() replaces memcmp() |
19 | | // strings::memcpy_inlined() replaces memcpy() |
20 | | // strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0 |
21 | | // |
22 | | // strings::*_inlined() routines are inline versions of the |
23 | | // routines exported by this module. Sometimes using the inlined |
24 | | // versions is faster. Measure before using the inlined versions. |
25 | | // |
26 | | // Performance measurement: |
27 | | // strings::fastmemcmp_inlined |
28 | | // Analysis: memcmp, fastmemcmp_inlined, fastmemcmp |
29 | | // 2012-01-30 |
30 | | |
31 | | #ifndef YB_GUTIL_STRINGS_FASTMEM_H |
32 | | #define YB_GUTIL_STRINGS_FASTMEM_H |
33 | | |
34 | | #include <stddef.h> |
35 | | #include <stdint.h> |
36 | | #include <stdio.h> |
37 | | #include <string.h> |
38 | | |
39 | | #include "yb/gutil/integral_types.h" |
40 | | #include "yb/gutil/port.h" |
41 | | |
42 | | namespace strings { |
43 | | |
44 | | // Return true if the n bytes at a equal the n bytes at b. |
45 | | // The regions are allowed to overlap. |
46 | | // |
47 | | // The performance is similar to the performance memcmp(), but faster for |
48 | | // moderately-sized inputs, or inputs that share a common prefix and differ |
49 | | // somewhere in their last 8 bytes. Further optimizations can be added later |
50 | | // if it makes sense to do so. |
51 | 3.40G | inline bool memeq(const void* a_v, const void* b_v, size_t n) { |
52 | 3.40G | const uint8_t *a = reinterpret_cast<const uint8_t *>(a_v); |
53 | 3.40G | const uint8_t *b = reinterpret_cast<const uint8_t *>(b_v); |
54 | | |
55 | 3.40G | size_t n_rounded_down = n & ~static_cast<size_t>(7); |
56 | 3.40G | if (PREDICT_FALSE(n_rounded_down == 0)) { // n <= 7 |
57 | 369M | return memcmp(a, b, n) == 0; |
58 | 369M | } |
59 | | // n >= 8 |
60 | 3.03G | uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); |
61 | 3.03G | uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8); |
62 | 3.03G | if ((u | v) != 0) { // The first or last 8 bytes differ. |
63 | 403M | return false; |
64 | 403M | } |
65 | 2.62G | a += 8; |
66 | 2.62G | b += 8; |
67 | 2.62G | n = n_rounded_down - 8; |
68 | 2.62G | if (n > 128) { |
69 | | // As of 2012, memcmp on x86-64 uses a big unrolled loop with SSE2 |
70 | | // instructions, and while we could try to do something faster, it |
71 | | // doesn't seem worth pursuing. |
72 | 14.9M | return memcmp(a, b, n) == 0; |
73 | 14.9M | } |
74 | 4.97G | for (; 2.61G n >= 16; n -= 162.36G ) { |
75 | 2.36G | uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); |
76 | 2.36G | uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8); |
77 | 2.36G | if ((x | y) != 0) { |
78 | 2.10M | return false; |
79 | 2.10M | } |
80 | 2.36G | a += 16; |
81 | 2.36G | b += 16; |
82 | 2.36G | } |
83 | | // n must be 0 or 8 now because it was a multiple of 8 at the top of the loop. |
84 | 2.61G | return n == 0 || UNALIGNED_LOAD641.00G (a) == 1.00G UNALIGNED_LOAD641.00G (b); |
85 | 2.61G | } |
86 | | |
87 | 36.8G | inline int fastmemcmp_inlined(const void *a_void, const void *b_void, size_t n) { |
88 | 36.8G | const uint8_t *a = reinterpret_cast<const uint8_t *>(a_void); |
89 | 36.8G | const uint8_t *b = reinterpret_cast<const uint8_t *>(b_void); |
90 | | |
91 | 36.8G | if (n >= 64) { |
92 | 1.17G | return memcmp(a, b, n); |
93 | 1.17G | } |
94 | 35.6G | const void* a_limit = a + n; |
95 | 35.6G | const size_t sizeof_uint64 = sizeof(uint64); |
96 | 73.8G | while (a + sizeof_uint64 <= a_limit && |
97 | 73.8G | UNALIGNED_LOAD6467.9G (a) == 67.9G UNALIGNED_LOAD6467.9G (b)) { |
98 | 38.2G | a += sizeof_uint64; |
99 | 38.2G | b += sizeof_uint64; |
100 | 38.2G | } |
101 | 35.6G | const size_t sizeof_uint32 = sizeof(uint32); |
102 | 35.6G | if (a + sizeof_uint32 <= a_limit && |
103 | 35.6G | UNALIGNED_LOAD3233.6G (a) == 33.6G UNALIGNED_LOAD3233.6G (b)) { |
104 | 10.9G | a += sizeof_uint32; |
105 | 10.9G | b += sizeof_uint32; |
106 | 10.9G | } |
107 | 79.7G | while (a < a_limit) { |
108 | 79.1G | int d = static_cast<uint32>(*a++) - static_cast<uint32>(*b++); |
109 | 79.1G | if (d) return d35.1G ; |
110 | 79.1G | } |
111 | 559M | return 0; |
112 | 35.6G | } |
113 | | |
// The standard memcpy operation is slow for variable small sizes.
// This implementation inlines the optimal realization for sizes 1 to 16.
// To avoid code bloat don't use it in case of not performance-critical spots,
// nor when you don't expect very frequent values of size <= 16.
inline void memcpy_inlined(void *dst, const void *src, size_t size) {
  // Compiler inlines code with minimal amount of data movement when third
  // parameter of memcpy is a constant, so dispatch each small size to a
  // constant-sized memcpy call.
  switch (size) {
    case 1: memcpy(dst, src, 1); break;
    case 2: memcpy(dst, src, 2); break;
    case 3: memcpy(dst, src, 3); break;
    case 4: memcpy(dst, src, 4); break;
    case 5: memcpy(dst, src, 5); break;
    case 6: memcpy(dst, src, 6); break;
    case 7: memcpy(dst, src, 7); break;
    case 8: memcpy(dst, src, 8); break;
    case 9: memcpy(dst, src, 9); break;
    case 10: memcpy(dst, src, 10); break;
    case 11: memcpy(dst, src, 11); break;
    case 12: memcpy(dst, src, 12); break;
    case 13: memcpy(dst, src, 13); break;
    case 14: memcpy(dst, src, 14); break;
    case 15: memcpy(dst, src, 15); break;
    case 16: memcpy(dst, src, 16); break;
    // Anything larger (or size 0) takes the ordinary library call.
    default: memcpy(dst, src, size); break;
  }
}
141 | | |
142 | 187M | inline size_t MemoryDifferencePos(const void *a_void, const void *b_void, size_t n) { |
143 | 187M | constexpr size_t kUInt64Size = sizeof(uint64_t); |
144 | 187M | constexpr size_t kUInt32Size = sizeof(uint32_t); |
145 | | |
146 | 187M | const uint8_t *a = reinterpret_cast<const uint8_t*>(a_void); |
147 | 187M | const uint8_t *b = reinterpret_cast<const uint8_t*>(b_void); |
148 | | |
149 | 187M | const uint8_t* a_limit = a + n; |
150 | | |
151 | 561M | while (a + kUInt64Size <= a_limit && UNALIGNED_LOAD64541M (a) == 541M UNALIGNED_LOAD64541M (b)) { |
152 | 373M | a += kUInt64Size; |
153 | 373M | b += kUInt64Size; |
154 | 373M | } |
155 | | |
156 | 187M | if (a + kUInt32Size <= a_limit && UNALIGNED_LOAD32182M (a) == 182M UNALIGNED_LOAD32182M (b)) { |
157 | 71.8M | a += kUInt32Size; |
158 | 71.8M | b += kUInt32Size; |
159 | 71.8M | } |
160 | | |
161 | 443M | while (a < a_limit) { |
162 | 437M | if (*a != *b) { |
163 | 182M | return a - reinterpret_cast<const uint8_t*>(a_void); |
164 | 182M | } |
165 | 255M | ++a; |
166 | 255M | ++b; |
167 | 255M | } |
168 | | |
169 | 5.62M | return n; |
170 | 187M | } |
171 | | |
172 | | } // namespace strings |
173 | | |
174 | | #endif // YB_GUTIL_STRINGS_FASTMEM_H |