YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/gutil/strings/util.h
Line
Count
Source (jump to first uncovered line)
1
//
2
// Copyright 1999-2006 and onwards Google, Inc.
3
//
4
// Useful string functions and so forth.  This is a grab-bag file.
5
//
6
// You might also want to look at memutil.h, which holds mem*()
7
// equivalents of a lot of the str*() functions in string.h,
8
// eg memstr, mempbrk, etc.
9
//
10
// These functions work fine for UTF-8 strings as long as you can
11
// consider them to be just byte strings.  For example, due to the
12
// design of UTF-8 you do not need to worry about accidental matches,
13
// as long as all your inputs are valid UTF-8 (use \uHHHH, not \xHH or \oOOO).
14
//
15
// Caveats:
16
// * all the lengths in these routines refer to byte counts,
17
//   not character counts.
18
// * case-insensitivity in these routines assumes that all the letters
19
//   in question are in the range A-Z or a-z.
20
//
21
// If you need Unicode specific processing (for example being aware of
22
// Unicode character boundaries, or knowledge of Unicode casing rules,
23
// or various forms of equivalence and normalization), take a look at
24
// files in i18n/utf8.
25
26
//
27
// The following only applies to changes made to this file as part of YugaByte development.
28
//
29
// Portions Copyright (c) YugaByte, Inc.
30
//
31
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
32
// in compliance with the License.  You may obtain a copy of the License at
33
//
34
// http://www.apache.org/licenses/LICENSE-2.0
35
//
36
// Unless required by applicable law or agreed to in writing, software distributed under the License
37
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
38
// or implied.  See the License for the specific language governing permissions and limitations
39
// under the License.
40
//
41
#ifndef YB_GUTIL_STRINGS_UTIL_H
42
#define YB_GUTIL_STRINGS_UTIL_H
43
44
#include <stddef.h>
45
#include <stdio.h>
46
#include <stdlib.h>
47
#include <string.h>
48
#ifndef _MSC_VER
49
#include <strings.h>  // for strcasecmp, but msvc does not have this header
50
#endif
51
52
#include <functional>
53
using std::binary_function;
54
using std::less;
55
#include <string>
56
using std::string;
57
#include <vector>
58
using std::vector;
59
60
#include "yb/gutil/integral_types.h"
61
#include "yb/gutil/port.h"
62
#include "yb/gutil/strings/stringpiece.h"
63
64
// Newer functions.
65
66
namespace strings {
67
68
// Finds the next end-of-line sequence.
69
// An end-of-line sequence is one of:
70
//   \n    common on unix, including mac os x
71
//   \r    common on macos 9 and before
72
//   \r\n  common on windows
73
//
74
// Returns a GStringPiece that contains the end-of-line sequence (a pointer into
75
// the input, 1 or 2 characters long).
76
//
77
// If the input does not contain an end-of-line sequence, returns an empty
78
// GStringPiece located at the end of the input:
79
//    GStringPiece(sp.data() + sp.length(), 0).
80
81
GStringPiece FindEol(GStringPiece sp);
82
83
}  // namespace strings
84
85
// Older functions.
86
87
// Returns a strdup()-allocated copy of src when src is non-null and contains
// at least one character; returns NULL when src is null or empty.
inline char* strdup_nonempty(const char* src) {
  const bool has_content = (src != NULL) && (src[0] != '\0');
  return has_content ? strdup(src) : NULL;
}
93
94
// Finds the first occurrence of c within the first sz bytes of buf. The scan
// also stops at the first null character, so this is only suitable for
// null-terminated strings. Returns a pointer to the match, or NULL if c was
// not found before either limit was reached (searching for '\0' therefore
// always yields NULL).
// WARNING: Removes const-ness of string argument!
inline char* strnchr(const char* buf, char c, size_t sz) {
  // Scan by index instead of precomputing buf + sz: sz may legitimately be
  // larger than the underlying buffer (the terminator bounds the scan), and
  // forming a pointer that far past the end is undefined behavior.
  for (size_t i = 0; i < sz && buf[i] != '\0'; ++i) {
    if (buf[i] == c) {
      return const_cast<char*>(buf + i);
    }
  }
  return NULL;
}
109
110
// Finds the first occurrence of the null-terminated needle in at most the first
111
// haystack_len bytes of haystack. Returns NULL if needle is not found. Returns
112
// haystack if needle is empty.
113
// WARNING: Removes const-ness of string argument!
114
char* strnstr(const char* haystack, const char* needle, size_t haystack_len);
115
116
// Matches a prefix (which must be a char* literal!) against the beginning of
// str. Returns a pointer past the prefix, or NULL if the prefix wasn't matched.
// (Like the standard strncmp(), but for efficiency doesn't call strlen() on
// prefix, and returns a pointer rather than an int.)
//
// The ""'s catch people who don't pass in a literal for "prefix".
// str is parenthesized so that compound arguments (e.g. a conditional
// expression) keep their meaning; note that str is evaluated twice when the
// prefix matches.
#ifndef strprefix
#define strprefix(str, prefix) \
  (strncmp((str), prefix, sizeof("" prefix "")-1) == 0 ? \
      (str) + (sizeof("" prefix "")-1) :                 \
      NULL)
#endif
128
129
// Same as strprefix(), but matches a case-insensitive prefix (compares with
// strncasecmp()). prefix must be a char* literal; the ""'s enforce that.
// str is parenthesized so that compound arguments (e.g. a conditional
// expression) keep their meaning; note that str is evaluated twice when the
// prefix matches.
#ifndef strcaseprefix
#define strcaseprefix(str, prefix) \
  (strncasecmp((str), prefix, sizeof("" prefix "")-1) == 0 ? \
      (str) + (sizeof("" prefix "")-1) :                     \
      NULL)
#endif
137
138
// Matches a prefix (up to the first needle_size bytes of needle) in the first
139
// haystack_size bytes of haystack. Returns a pointer past the prefix, or NULL if
140
// the prefix wasn't matched. (Unlike strprefix(), prefix doesn't need to be a
141
// char* literal. Like the standard strncmp(), but also takes a haystack_size,
142
// and returns a pointer rather than an int.)
143
//
144
// Always returns either NULL or haystack + needle_size.
145
//
146
// Some windows header sometimes #defines strnprefix to something we
147
// don't want.
148
#ifdef strnprefix
149
#undef strnprefix
150
#endif
151
const char* strnprefix(const char* haystack, int haystack_size,
152
                       const char* needle, int needle_size);
153
154
// Matches a case-insensitive prefix (up to the first needle_size bytes of
155
// needle) in the first haystack_size bytes of haystack. Returns a pointer past
156
// the prefix, or NULL if the prefix wasn't matched.
157
//
158
// Always returns either NULL or haystack + needle_size.
159
const char* strncaseprefix(const char* haystack, int haystack_size,
160
                           const char* needle, int needle_size);
161
162
// Checks whether str begins with prefix (case-sensitive). On a match, returns
// a pointer to the first character of str after the prefix; otherwise returns
// NULL. Unlike strprefix() and strcaseprefix(), prefix may be any
// null-terminated string, not just a literal. The pointer type is a template
// parameter so that a const char* input yields a const char* result and a
// char* input yields a char* result.
template<class CharStar>
inline CharStar var_strprefix(CharStar str, const char* prefix) {
  const size_t prefix_len = strlen(prefix);
  if (strncmp(str, prefix, prefix_len) != 0) {
    return NULL;
  }
  return str + prefix_len;
}
171
172
// Case-insensitive counterpart of var_strprefix(): returns a pointer just past
// prefix when str begins with it (compared with strncasecmp()), or NULL when
// it does not. The result has the same pointer type as str.
template<class CharStar>
inline CharStar var_strcaseprefix(CharStar str, const char* prefix) {
  const size_t prefix_len = strlen(prefix);
  if (strncasecmp(str, prefix, prefix_len) != 0) {
    return NULL;
  }
  return str + prefix_len;
}
179
180
// Passes a non-null input straight through and substitutes the literal
// "(null)" for a NULL input. (Useful for logging.)
inline const char* GetPrintableString(const char* const in) {
  if (in != NULL) {
    return in;
  }
  return "(null)";
}
184
185
// Returns whether str begins with prefix.
186
inline bool HasPrefixString(const GStringPiece& str,
187
342k
                            const GStringPiece& prefix) {
188
342k
  return str.starts_with(prefix);
189
342k
}
190
191
// Returns whether str ends with suffix.
192
inline bool HasSuffixString(const GStringPiece& str,
193
133k
                            const GStringPiece& suffix) {
194
133k
  return str.ends_with(suffix);
195
133k
}
196
197
// Returns true if the string passed in matches the pattern. The pattern
198
// string can contain wildcards like * and ?
199
// The backslash character (\) is an escape character for * and ?
200
// We limit the patterns to having a max of 16 * or ? characters.
201
// ? matches 0 or 1 character, while * matches 0 or more characters.
202
bool MatchPattern(const GStringPiece& string,
203
                  const GStringPiece& pattern);
204
205
// Returns where suffix begins in str, or NULL if str doesn't end with suffix.
// Both arguments must be null-terminated. An empty suffix matches at the
// terminating null character of str.
inline char* strsuffix(char* str, const char* suffix) {
  const size_t lenstr = strlen(str);
  const size_t lensuffix = strlen(suffix);
  // Compare lengths before doing any pointer arithmetic: when suffix is longer
  // than str, str + lenstr - lensuffix would point before the start of str,
  // which is undefined behavior even if the pointer is never dereferenced.
  if (lensuffix > lenstr) {
    return NULL;
  }
  char* const tail = str + (lenstr - lensuffix);
  return strcmp(tail, suffix) == 0 ? tail : NULL;
}

// Const overload of strsuffix(): same search, but the result keeps the
// const-ness of str.
inline const char* strsuffix(const char* str, const char* suffix) {
  return const_cast<const char*>(strsuffix(const_cast<char*>(str), suffix));
}
220
221
// Same as strsuffix(), but matches a case-insensitive suffix.
char* strcasesuffix(char* str, const char* suffix);

// Const overload: forwards to the char* overload and returns its result as a
// const char*.
inline const char* strcasesuffix(const char* str, const char* suffix) {
  char* const mutable_str = const_cast<char*>(str);
  return strcasesuffix(mutable_str, suffix);
}
227
228
const char* strnsuffix(const char* haystack, int haystack_size,
229
                       const char* needle, int needle_size);
230
const char* strncasesuffix(const char* haystack, int haystack_size,
231
                           const char* needle, int needle_size);
232
233
// Returns the number of times c occurs in the null-terminated string buf.
// Returns 0 if buf is NULL.
inline ptrdiff_t strcount(const char* buf, char c) {
  if (buf == NULL) {
    return 0;
  }
  ptrdiff_t num = 0;
  for (; *buf != '\0'; ++buf) {
    if (*buf == c) {
      ++num;
    }
  }
  return num;
}

// Returns the number of times c occurs in the half-open range
// [buf_begin, buf_end). Returns 0 if buf_begin is NULL or the range is empty
// or reversed.
inline ptrdiff_t strcount(const char* buf_begin, const char* buf_end, char c) {
  if (buf_begin == NULL || buf_end <= buf_begin) {
    return 0;
  }
  ptrdiff_t num = 0;
  for (const char* cursor = buf_begin; cursor != buf_end; ++cursor) {
    if (*cursor == c) {
      ++num;
    }
  }
  return num;
}

// Returns the number of times c occurs in the len bytes starting at buf.
// Returns 0 if buf is NULL.
inline ptrdiff_t strcount(const char* buf, size_t len, char c) {
  // Reject NULL before forming buf + len: adding a non-zero offset to a null
  // pointer is undefined behavior.
  if (buf == NULL) {
    return 0;
  }
  return strcount(buf, buf + len, c);
}

// Returns the number of times c occurs in a C++ string. All buf.size() bytes
// are examined, including any embedded null characters.
inline ptrdiff_t strcount(const std::string& buf, char c) {
  return strcount(buf.data(), buf.size(), c);
}
269
270
// Returns a pointer to the nth occurrence of a character in a null-terminated
271
// string.
272
// WARNING: Removes const-ness of string argument!
273
char* strchrnth(const char* str, const char& c, int n);
274
275
// Returns a pointer to the nth occurrence of a character in a null-terminated
276
// string, or the last occurrence if it occurs fewer than n times.
277
// WARNING: Removes const-ness of string argument!
278
char* AdjustedLastPos(const char* str, char separator, int n);
279
280
// STL-compatible function objects for char* string keys:
281
282
// Compares two char* strings for equality. (Works with NULL, which compares
// equal only to another NULL). Useful in hash tables:
//    hash_map<const char*, Value, hash<const char*>, streq> ht;
//
// The argument/result typedefs are declared directly rather than inherited
// from std::binary_function, which was deprecated in C++11 and removed in
// C++17.
struct streq {
  typedef const char* first_argument_type;
  typedef const char* second_argument_type;
  typedef bool result_type;

  bool operator()(const char* s1, const char* s2) const {
    // If either side is NULL, the strings are equal only when both are NULL.
    if (s1 == NULL || s2 == NULL) {
      return s1 == s2;
    }
    // Compare the first characters before paying for a strcmp() call.
    return *s1 == *s2 && strcmp(s1, s2) == 0;
  }
};
291
292
// Compares two char* strings. (Works with NULL, which compares greater than any
// non-NULL). Useful in maps:
//    map<const char*, Value, strlt> m;
//
// The argument/result typedefs are declared directly rather than inherited
// from std::binary_function, which was deprecated in C++11 and removed in
// C++17.
struct strlt {
  typedef const char* first_argument_type;
  typedef const char* second_argument_type;
  typedef bool result_type;

  bool operator()(const char* s1, const char* s2) const {
    if (s1 == s2) {
      return false;  // Same pointer (including both NULL): not less-than.
    }
    if (s2 == NULL) {
      return true;   // Any non-NULL string sorts before NULL.
    }
    if (s1 == NULL) {
      return false;  // NULL sorts after every non-NULL string.
    }
    return strcmp(s1, s2) < 0;
  }
};
300
301
// Returns whether str has only Ascii characters (as defined by ascii_isascii()
302
// in strings/ascii_ctype.h).
303
bool IsAscii(const char* str, size_t len);
304
0
inline bool IsAscii(const GStringPiece& str) {
305
0
  return IsAscii(str.data(), str.size());
306
0
}
307
308
// Returns the immediate lexicographically-following string. This is useful to
309
// turn an inclusive range into something that can be used with Bigtable's
310
// SetLimitRow():
311
//
312
//     // Inclusive range [min_element, max_element].
313
//     string min_element = ...;
314
//     string max_element = ...;
315
//
316
//     // Equivalent range [range_start, range_end).
317
//     string range_start = min_element;
318
//     string range_end = ImmediateSuccessor(max_element);
319
//
320
// WARNING: Returns the input string with a '\0' appended; if you call c_str()
321
// on the result, it will compare equal to s.
322
//
323
// WARNING: Transforms "" -> "\0"; this doesn't account for Bigtable's special
324
// treatment of "" as infinity.
325
string ImmediateSuccessor(const GStringPiece& s);
326
327
// Copies at most n-1 bytes from src to dest, and returns dest. If n >= 1, null
// terminates dest; otherwise, returns dest unchanged. Unlike strncpy(), only
// puts one null character at the end of dest.
//
// At most the first n-1 bytes of src are read, so src does not need to be
// null-terminated as long as it holds at least n-1 bytes.
inline char* safestrncpy(char* dest, const char* src, size_t n) {
  if (n < 1) return dest;

  // Avoid using non-ANSI memccpy(), which is also deprecated in MSVC.
  // The last slot of dest is reserved for the terminator, so stop copying (and
  // stop reading src) after n-1 bytes.
  const size_t max_copy = n - 1;
  size_t copied = 0;
  while (copied < max_copy && src[copied] != '\0') {
    dest[copied] = src[copied];
    ++copied;
  }

  dest[copied] = '\0';
  return dest;
}
342
343
namespace strings {
344
345
// BSD-style safe and consistent string copy functions.
346
// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|.
347
// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as
348
// long as |dst_size| is not 0.  Returns the length of |src| in characters.
349
// If the return value is >= dst_size, then the output was truncated.
350
// NOTE: All sizes are in number of characters, NOT in bytes.
351
size_t strlcpy(char* dst, const char* src, size_t dst_size);
352
353
} // namespace strings
354
355
// Replaces the first occurrence (if replace_all is false) or all occurrences
356
// (if replace_all is true) of oldsub in s with newsub. In the second version,
357
// *res must be distinct from all the other arguments.
358
string StringReplace(const GStringPiece& s, const GStringPiece& oldsub,
359
                     const GStringPiece& newsub, bool replace_all);
360
void StringReplace(const GStringPiece& s, const GStringPiece& oldsub,
361
                   const GStringPiece& newsub, bool replace_all,
362
                   string* res);
363
364
// Replaces all occurrences of substring in s with replacement. Returns the
365
// number of instances replaced. s must be distinct from the other arguments.
366
//
367
// Less flexible, but faster, than RE::GlobalReplace().
368
int GlobalReplaceSubstring(const GStringPiece& substring,
369
                           const GStringPiece& replacement,
370
                           string* s);
371
372
// Removes v[i] for every element i in indices. Does *not* preserve the order of
373
// v. indices must be sorted in strict increasing order (no duplicates). Runs in
374
// O(indices.size()).
375
void RemoveStrings(vector<string>* v, const vector<size_t>& indices);
376
377
// Case-insensitive strstr(); use system strcasestr() instead.
378
// WARNING: Removes const-ness of string argument!
379
char* gstrcasestr(const char* haystack, const char* needle);
380
381
// Finds (case insensitively) the first occurrence of (null terminated) needle
382
// in at most the first len bytes of haystack. Returns a pointer into haystack,
383
// or NULL if needle wasn't found.
384
// WARNING: Removes const-ness of haystack!
385
const char* gstrncasestr(const char* haystack, const char* needle, size_t len);
386
char* gstrncasestr(char* haystack, const char* needle, size_t len);
387
388
// Finds (case insensitively), in str (which is a list of tokens separated by
389
// non_alpha), a token prefix and a token suffix. Returns a pointer into str of
390
// the position of prefix, or NULL if not found.
391
// WARNING: Removes const-ness of string argument!
392
char* gstrncasestr_split(const char* str,
393
                         const char* prefix, char non_alpha,
394
                         const char* suffix,
395
                         size_t n);
396
397
// Finds (case insensitively) needle in haystack, paying attention only to
398
// alphanumerics in either string. Returns a pointer into haystack, or NULL if
399
// not found.
400
// Example: strcasestr_alnum("This is a longer test string", "IS-A-LONGER")
401
// returns a pointer to "is a longer".
402
// WARNING: Removes const-ness of string argument!
403
char* strcasestr_alnum(const char* haystack, const char* needle);
404
405
// Returns the number of times substring appears in text.
406
// Note: Runs in O(text.length() * substring.length()). Do *not* use on long
407
// strings.
408
int CountSubstring(GStringPiece text, GStringPiece substring);
409
410
// Finds, in haystack (which is a list of tokens separated by delim), a token
411
// equal to needle. Returns a pointer into haystack, or NULL if not found (or
412
// either needle or haystack is empty).
413
const char* strstr_delimited(const char* haystack,
414
                             const char* needle,
415
                             char delim);
416
417
// Gets the next token from string *stringp, where tokens are strings separated
418
// by characters from delim.
419
char* gstrsep(char** stringp, const char* delim);
420
421
// Appends GStringPiece(data, len) to *s.
422
void FastStringAppend(string* s, const char* data, size_t len);
423
424
// Returns a duplicate of the_string, with memory allocated by new[].
425
char* strdup_with_new(const char* the_string);
426
427
// Returns a duplicate of up to the first max_length bytes of the_string, with
428
// memory allocated by new[].
429
char* strndup_with_new(const char* the_string, size_t max_length);
430
431
// Finds, in the_string, the first "word" (consecutive !ascii_isspace()
// characters). Returns pointer to the beginning of the word, and sets *end_ptr
// to the character after the word (which may be space or '\0'); returns NULL
// (and *end_ptr is undefined) if no next word found.
// end_ptr must not be NULL.
const char* ScanForFirstWord(const char* the_string, const char** end_ptr);

// Non-const overload: forwards to the const overload and casts the const-ness
// of the result back off, so callers scanning a mutable buffer get mutable
// pointers.
inline char* ScanForFirstWord(char* the_string, char** end_ptr) {
  // implicit_cast<> would be the more appropriate way to add const here, but
  // const_cast<> is used so that "base/casts.h" does not have to be included.
  const char* const const_string = const_cast<const char*>(the_string);
  const char** const const_end_ptr = const_cast<const char**>(end_ptr);
  return const_cast<char*>(ScanForFirstWord(const_string, const_end_ptr));
}
444
445
// For the following functions, an "identifier" is a letter or underscore,
446
// followed by letters, underscores, or digits.
447
448
// Returns a pointer past the end of the "identifier" (a letter or underscore,
// followed by letters, underscores, or digits) beginning at str, or NULL if
// str doesn't start with an identifier.
const char* AdvanceIdentifier(const char* str);

// Non-const overload: forwards to the const overload and casts the const-ness
// of the result back off.
inline char* AdvanceIdentifier(char* str) {
  // implicit_cast<> would be the more appropriate way to add const here, but
  // const_cast<> is used so that "base/casts.h" does not have to be included.
  const char* const const_str = const_cast<const char*>(str);
  return const_cast<char*>(AdvanceIdentifier(const_str));
}
456
457
// Returns whether str is an "identifier" (see above).
458
bool IsIdentifier(const char* str);
459
460
// Finds the first tag and value in a string of tag/value pairs.
461
//
462
// The first pair begins after the first occurrence of attribute_separator (or
463
// string_terminal, if not '\0'); tag_value_separator separates the tag and
464
// value; and the value ends before the following occurrence of
465
// attribute_separator (or string_terminal, if not '\0').
466
//
467
// Returns true (and populates tag, tag_len, value, and value_len) if a
468
// tag/value pair is found; returns false otherwise.
469
bool FindTagValuePair(const char* in_str, char tag_value_separator,
470
                      char attribute_separator, char string_terminal,
471
                      char** tag, size_t* tag_len,
472
                      char** value, size_t* value_len);
473
474
// Inserts separator after every interval characters in *s (but never appends to
475
// the end of the original *s).
476
void UniformInsertString(string* s, int interval, const char* separator);
477
478
// Inserts separator into s at each specified index. indices must be sorted in
479
// ascending order.
480
void InsertString(
481
    string* s, const vector<uint32>& indices, char const* separator);
482
483
// Finds the nth occurrence of c in s; returns the index in s of that
484
// occurrence, or string::npos if fewer than n occurrences.
485
size_t FindNth(GStringPiece s, char c, size_t n);
486
487
// Finds the nth-to-last occurrence of c in s; returns the index in s of that
488
// occurrence, or string::npos if fewer than n occurrences.
489
size_t ReverseFindNth(GStringPiece s, char c, size_t n);
490
491
// Returns whether s contains only whitespace characters (including the case
492
// where s is empty).
493
bool OnlyWhitespace(const GStringPiece& s);
494
495
// Formats a string in the same fashion as snprintf(), but returns either the
496
// number of characters written, or zero if not enough space was available.
497
// (snprintf() returns the number of characters that would have been written if
498
// enough space had been available.)
499
//
500
// A drop-in replacement for the safe_snprintf() macro.
501
int SafeSnprintf(char* str, size_t size, const char* format, ...)
502
    PRINTF_ATTRIBUTE(3, 4);
503
504
// Reads a line (terminated by delim) from file into *str. Reads delim from
505
// file, but doesn't copy it into *str. Returns true if read a delim-terminated
506
// line, or false on end-of-file or error.
507
bool GetlineFromStdioFile(FILE* file, string* str, char delim);
508
509
#endif  // YB_GUTIL_STRINGS_UTIL_H