YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/gutil/strings/escaping.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2006 Google Inc. All Rights Reserved.
2
// Authors: Numerous. Principal maintainers are csilvers and zunger.
3
//
4
// This is a grab-bag file for string utilities involved in escaping and
5
// unescaping strings in various ways. Who knew there were so many?
6
//
7
// NOTE: Although the functions declared here have been imported into
8
// the global namespace, the using statements are slated for removal.
9
// Do not refer to these symbols without properly namespace-qualifying
10
// them with "strings::". Of course you may also use "using" statements
11
// within a .cc file.
12
//
13
// There are more escaping functions in:
14
//   webutil/html/tagutils.h (Escaping strings for HTML, PRE, JavaScript, etc.)
15
//   webutil/url/url.h (Escaping for URL's, both RFC-2396 and other methods)
16
//   template/template_modifiers.h (All sorts of stuff)
17
//   util/regex/re2/re2.h (Escaping for literals within regular expressions
18
//                         - see RE2::QuoteMeta).
19
// And probably many more places, as well.
20
21
//
22
// The following only applies to changes made to this file as part of YugaByte development.
23
//
24
// Portions Copyright (c) YugaByte, Inc.
25
//
26
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
27
// in compliance with the License.  You may obtain a copy of the License at
28
//
29
// http://www.apache.org/licenses/LICENSE-2.0
30
//
31
// Unless required by applicable law or agreed to in writing, software distributed under the License
32
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
33
// or implied.  See the License for the specific language governing permissions and limitations
34
// under the License.
35
//
36
#ifndef YB_GUTIL_STRINGS_ESCAPING_H
37
#define YB_GUTIL_STRINGS_ESCAPING_H
38
39
#include <stddef.h>
40
41
#include <string>
42
#include <vector>
43
44
#include <glog/logging.h>
45
46
#include "yb/gutil/strings/ascii_ctype.h"
47
#include "yb/gutil/strings/charset.h"
48
#include "yb/gutil/strings/stringpiece.h"
49
50
using std::string;
51
using std::vector;
52
53
54
55
namespace strings {
56
57
// ----------------------------------------------------------------------
58
// EscapeStrForCSV()
59
//    Escapes the quotes in 'src' by doubling them. This is necessary
60
//    for generating CSV files (see SplitCSVLine).
61
//    Returns the number of characters written into dest (not counting
62
//    the \0) or -1 if there was insufficient space.
63
//
64
//    Example: [some "string" to test] --> [some ""string"" to test]
65
// ----------------------------------------------------------------------
66
size_t EscapeStrForCSV(const char* src, char* dest, size_t dest_len);
67
68
// ----------------------------------------------------------------------
69
// UnescapeCEscapeSequences()
70
//    Copies "source" to "dest", rewriting C-style escape sequences
71
//    -- '\n', '\r', '\\', '\ooo', etc -- to their ASCII
72
//    equivalents.  "dest" must be sufficiently large to hold all
73
//    the characters in the rewritten string (i.e. at least as large
74
//    as strlen(source) + 1 should be safe, since the replacements
75
//    are always shorter than the original escaped sequences).  It's
76
//    safe for source and dest to be the same.  RETURNS the length
77
//    of dest.
78
//
79
//    It allows hex sequences \xhh, or generally \xhhhhh with an
80
//    arbitrary number of hex digits, but all of them together must
81
//    specify a value of a single byte (e.g. \x0045 is equivalent
82
//    to \x45, and \x1234 is erroneous). If the value is too large,
83
//    it is truncated to 8 bits and an error is set. This is also
84
//    true of octal values that exceed 0xff.
85
//
86
//    It also allows escape sequences of the form \uhhhh (exactly four
87
//    hex digits, upper or lower case) or \Uhhhhhhhh (exactly eight
88
//    hex digits, upper or lower case) to specify a Unicode code
89
//    point. The dest array will contain the UTF8-encoded version of
90
//    that code-point (e.g., if source contains \u2019, then dest will
91
//    contain the three bytes 0xE2, 0x80, and 0x99). For the inverse
92
//    transformation, use UniLib::UTF8EscapeString
93
//    (util/utf8/public/unilib.h), not CEscapeString.
94
//
95
//    Errors: In the first form of the call, errors are reported with
96
//    LOG(ERROR). The same is true for the second form of the call if
97
//    the pointer to the string vector is NULL; otherwise, error
98
//    messages are stored in the vector. In either case, the effect on
99
//    the dest array is not defined, but rest of the source will be
100
//    processed.
101
//
102
//    *** DEPRECATED: Use CUnescape() in new code ***
103
//    ----------------------------------------------------------------------
104
size_t UnescapeCEscapeSequences(const char* source, char* dest);
105
size_t UnescapeCEscapeSequences(const char* source, char* dest, vector<string>* errors);
106
107
// ----------------------------------------------------------------------
108
// UnescapeCEscapeString()
109
//    This does the same thing as UnescapeCEscapeSequences, but creates
110
//    a new string. The caller does not need to worry about allocating
111
//    a dest buffer. This should be used for non performance critical
112
//    tasks such as printing debug messages. It is safe for src and dest
113
//    to be the same.
114
//
115
//    The second call stores its errors in a supplied string vector.
116
//    If the string vector pointer is NULL, it reports the errors with LOG().
117
//
118
//    In the first and second calls, the length of dest is returned. In the
119
//    third call, the new string is returned.
120
//
121
//    *** DEPRECATED: Use CUnescape() in new code ***
122
// ----------------------------------------------------------------------
123
size_t UnescapeCEscapeString(const string& src, string* dest);
124
size_t UnescapeCEscapeString(const string& src, string* dest, vector<string>* errors);
125
string UnescapeCEscapeString(const string& src);
126
127
// ----------------------------------------------------------------------
128
// CUnescape()
129
//    Copies "source" to "dest", rewriting C-style escape sequences
130
//    -- '\n', '\r', '\\', '\ooo', etc -- to their ASCII
131
//    equivalents.  "dest" must be sufficiently large to hold all
132
//    the characters in the rewritten string (i.e. at least as large
133
//    as source.size() should be safe, since the replacements
134
//    are never longer than the original escaped sequences).  It's
135
//    safe for source and dest to be the same.  RETURNS true if
136
//    conversion was successful, false otherwise. Stores the size of
137
//    the result in 'dest_len'.
138
//
139
//    It allows hex sequences \xhh, or generally \xhhhhh with an
140
//    arbitrary number of hex digits, but all of them together must
141
//    specify a value of a single byte (e.g. \x0045 is equivalent
142
//    to \x45, and \x1234 is erroneous). If the value is too large,
143
//    an error is set. This is also true of octal values that exceed 0xff.
144
//
145
//    It also allows escape sequences of the form \uhhhh (exactly four
146
//    hex digits, upper or lower case) or \Uhhhhhhhh (exactly eight
147
//    hex digits, upper or lower case) to specify a Unicode code
148
//    point. The dest array will contain the UTF8-encoded version of
149
//    that code-point (e.g., if source contains \u2019, then dest will
150
//    contain the three bytes 0xE2, 0x80, and 0x99). For the inverse
151
//    transformation, use UniLib::UTF8EscapeString
152
//    (util/utf8/public/unilib.h), not CEscapeString.
153
//
154
//    Errors: Sets the description of the first encountered error in
155
//    'error'. To disable error reporting, set 'error' to NULL.
156
// ----------------------------------------------------------------------
157
bool CUnescape(const GStringPiece& source, char* dest, size_t* dest_len, string* error);
158
159
bool CUnescape(const GStringPiece& source, string* dest, string* error);
160
161
// A version with no error reporting.
162
0
inline bool CUnescape(const GStringPiece& source, string* dest) {
163
0
  return CUnescape(source, dest, NULL);
164
0
}
165
166
// ----------------------------------------------------------------------
167
// CUnescapeForNullTerminatedString()
168
//
169
// This has the same behavior as CUnescape, except that each octal, hex,
170
// or Unicode escape sequence that resolves to a null character ('\0')
171
// is left in its original escaped form.  The result is a
172
// display-formatted string that can be interpreted as a null-terminated
173
// const char* and will not be cut short if it contains embedded null
174
// characters.
175
//
176
// ----------------------------------------------------------------------
177
178
bool CUnescapeForNullTerminatedString(const GStringPiece& source,
179
                                      char* dest,
180
                                      size_t* dest_len,
181
                                      string* error);
182
183
bool CUnescapeForNullTerminatedString(const GStringPiece& source,
184
                                      string* dest,
185
                                      string* error);
186
187
// A version with no error reporting.
188
inline bool CUnescapeForNullTerminatedString(const GStringPiece& source,
189
0
                                             string* dest) {
190
0
  return CUnescapeForNullTerminatedString(source, dest, NULL);
191
0
}
192
193
// ----------------------------------------------------------------------
194
// CEscapeString()
195
// CHexEscapeString()
196
// Utf8SafeCEscapeString()
197
// Utf8SafeCHexEscapeString()
198
//    Copies 'src' to 'dest', escaping dangerous characters using
199
//    C-style escape sequences. This is very useful for preparing query
200
//    flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
201
//    hexadecimal rather than octal sequences. The 'Utf8Safe' version
202
//    doesn't touch UTF-8 bytes.
203
//    Returns the number of bytes written to 'dest' (not including the \0)
204
//    or -1 if there was insufficient space.
205
//
206
//    Currently only \n, \r, \t, ", ', \ and !ascii_isprint() chars are escaped.
207
// ----------------------------------------------------------------------
208
size_t CEscapeString(const char* src, size_t src_len, char* dest, size_t dest_len);
209
size_t CHexEscapeString(const char* src, size_t src_len, char* dest, size_t dest_len);
210
size_t Utf8SafeCEscapeString(const char* src, size_t src_len, char* dest, size_t dest_len);
211
size_t Utf8SafeCHexEscapeString(const char* src, size_t src_len, char* dest, size_t dest_len);
212
213
// ----------------------------------------------------------------------
214
// CEscape()
215
// CHexEscape()
216
// Utf8SafeCEscape()
217
// Utf8SafeCHexEscape()
218
//    More convenient form of CEscapeString: returns result as a "string".
219
//    This version is slower than CEscapeString() because it does more
220
//    allocation.  However, it is much more convenient to use in
221
//    non-speed-critical code like logging messages etc.
222
// ----------------------------------------------------------------------
223
string CEscape(const GStringPiece& src);
224
string CHexEscape(const GStringPiece& src);
225
string Utf8SafeCEscape(const GStringPiece& src);
226
string Utf8SafeCHexEscape(const GStringPiece& src);
227
228
// ----------------------------------------------------------------------
229
// BackslashEscape()
230
//    Given a string and a list of characters to escape, replace any
231
//    instance of one of those characters with \ + that character. For
232
//    example, when exporting maps to /varz, label values need to have
233
//    all dots escaped. Appends the result to dest.
234
// BackslashUnescape()
235
//    Replace \ + any of the indicated "unescape me" characters with just
236
//    that character. Appends the result to dest.
237
//
238
//    IMPORTANT:
239
//    This function does not escape \ by default, so if you do not include
240
//    it in the chars to escape you will most certainly get an undesirable
241
//    result. That is, it won't be a reversible operation:
242
//      string src = "foo\\:bar";
243
//      BackslashUnescape(BackslashEscape(src, ":"), ":") == "foo\\\\:bar"
244
//    On the other hand, for all strings "src", the following is true:
245
//      BackslashUnescape(BackslashEscape(src, ":\\"), ":\\") == src
246
// ----------------------------------------------------------------------
247
void BackslashEscape(const GStringPiece& src,
248
                     const strings::CharSet& to_escape,
249
                     string* dest);
250
void BackslashUnescape(const GStringPiece& src,
251
                       const strings::CharSet& to_unescape,
252
                       string* dest);
253
254
// Value-returning form of BackslashEscape(): runs the appending overload on a
// fresh, empty string and returns that string.
inline string BackslashEscape(const GStringPiece& src,
                              const strings::CharSet& to_escape) {
  string escaped;
  BackslashEscape(src, to_escape, &escaped);
  return escaped;
}
260
261
// Value-returning form of BackslashUnescape(): runs the appending overload on
// a fresh, empty string and returns that string.
inline string BackslashUnescape(const GStringPiece& src,
                                const strings::CharSet& to_unescape) {
  string unescaped;
  BackslashUnescape(src, to_unescape, &unescaped);
  return unescaped;
}
267
268
// ----------------------------------------------------------------------
269
// QuotedPrintableUnescape()
270
//    Check out http://www.cis.ohio-state.edu/htbin/rfc/rfc2045.html for
271
//    more details, only briefly implemented. But from the web...
272
//    Quoted-printable is an encoding method defined in the MIME
273
//    standard. It is used primarily to encode 8-bit text (such as text
274
//    that includes foreign characters) into 7-bit US ASCII, creating a
275
//    document that is mostly readable by humans, even in its encoded
276
//    form. All MIME compliant applications can decode quoted-printable
277
//    text, though they may not necessarily be able to properly display the
278
//    document as it was originally intended. As quoted-printable encoding
279
//    is implemented most commonly, printable ASCII characters (values 33
280
//    through 126, excluding 61), tabs and spaces that do not appear at the
281
//    end of lines, and end-of-line characters are not encoded. Other
282
//    characters are represented by an equal sign (=) immediately followed
283
//    by that character's hexadecimal value. Lines that are longer than 76
284
//    characters are shortened by line breaks, with the equal sign marking
285
//    where the breaks occurred.
286
//
287
//    Note that QuotedPrintableUnescape is different from 'Q'-encoding as
288
//    defined in rfc2047. In particular, this does not treat '_'s as spaces.
289
//
290
//    See QEncodingUnescape().
291
//
292
//    Copies "src" to "dest", rewriting quoted printable escape sequences
293
//    =XX to their ASCII equivalents. src is not null terminated, instead
294
//    specify len. I recommend that slen<szdest, but we honor szdest
295
//    anyway.
296
//    RETURNS the length of dest.
297
// ----------------------------------------------------------------------
298
size_t QuotedPrintableUnescape(const char* src, size_t slen, char* dest, size_t szdest);
299
300
// ----------------------------------------------------------------------
301
// QEncodingUnescape()
302
//    This is very similar to QuotedPrintableUnescape except that we convert
303
//    '_'s into spaces. (See RFC 2047)
304
//    http://www.faqs.org/rfcs/rfc2047.html.
305
//
306
//    Copies "src" to "dest", rewriting q-encoding escape sequences
307
//    =XX to their ASCII equivalents. src is not null terminated, instead
308
//    specify len. I recommend that slen<szdest, but we honor szdest
309
//    anyway.
310
//    RETURNS the length of dest.
311
// ----------------------------------------------------------------------
312
size_t QEncodingUnescape(const char* src, size_t slen, char* dest, size_t szdest);
313
314
// ----------------------------------------------------------------------
315
// Base64Unescape()
316
// WebSafeBase64Unescape()
317
//    Copies "src" to "dest", where src is in base64 and is written to its
318
//    ASCII equivalents. src is not null terminated, instead specify len.
319
//    I recommend that slen<szdest, but we honor szdest anyway.
320
//    RETURNS the length of dest, or -1 if src contains invalid chars.
321
//    The WebSafe variation uses '-' instead of '+' and '_' instead of '/'.
322
//    The variations that store into a string clear the string first, and
323
//    return false (with dest empty) if src contains invalid chars; for
324
//    these versions src and dest must be different strings.
325
// ----------------------------------------------------------------------
326
size_t Base64Unescape(const char* src, size_t slen, char* dest, size_t szdest);
327
bool Base64Unescape(const char* src, size_t slen, string* dest);
328
1
inline bool Base64Unescape(const string& src, string* dest) {
329
1
  return Base64Unescape(src.data(), src.size(), dest);
330
1
}
331
332
size_t WebSafeBase64Unescape(const char* src, size_t slen, char* dest, size_t szdest);
333
bool WebSafeBase64Unescape(const char* src, size_t slen, string* dest);
334
0
inline bool WebSafeBase64Unescape(const string& src, string* dest) {
335
0
  return WebSafeBase64Unescape(src.data(), src.size(), dest);
336
0
}
337
338
// Return the length to use for the output buffer given to the base64 escape
339
// routines. Make sure to use the same value for do_padding in both.
340
// This function may return incorrect results if given input_len values that
341
// are extremely high, which should happen rarely.
342
size_t CalculateBase64EscapedLen(size_t input_len, bool do_padding);
343
// Use this version when calling Base64Escape without a do_padding arg.
344
size_t CalculateBase64EscapedLen(size_t input_len);
345
346
// ----------------------------------------------------------------------
347
// Base64Escape()
348
// WebSafeBase64Escape()
349
//    Encode "src" to "dest" using base64 encoding.
350
//    src is not null terminated, instead specify len.
351
//    'dest' should have at least CalculateBase64EscapedLen() length.
352
//    RETURNS the length of dest.
353
//    The WebSafe variation uses '-' instead of '+' and '_' instead of '/'
354
//    so that we can place the output in the URL or cookies without having
355
//    to escape them.  It also has an extra parameter "do_padding",
356
//    which when set to false will prevent padding with "=".
357
// ----------------------------------------------------------------------
358
size_t Base64Escape(const unsigned char* src, size_t slen, char* dest, size_t szdest);
359
size_t WebSafeBase64Escape(
360
    const unsigned char* src, size_t slen, char* dest, size_t szdest, bool do_padding);
361
// Encode src into dest with padding.
362
void Base64Escape(const string& src, string* dest);
363
// Encode src into dest web-safely without padding.
364
void WebSafeBase64Escape(const string& src, string* dest);
365
// Encode src into dest web-safely with padding.
366
void WebSafeBase64EscapeWithPadding(const string& src, string* dest);
367
368
void Base64Escape(const unsigned char* src, size_t szsrc, string* dest, bool do_padding);
369
void WebSafeBase64Escape(const unsigned char* src, size_t szsrc, string* dest, bool do_padding);
370
371
// ----------------------------------------------------------------------
372
// Base32Unescape()
373
//    Copies "src" to "dest", where src is in base32 and is written to its
374
//    ASCII equivalents. src is not null terminated, instead specify len.
375
//    RETURNS the length of dest, or -1 if src contains invalid chars.
376
// ----------------------------------------------------------------------
377
size_t Base32Unescape(const char* src, size_t slen, char* dest, size_t szdest);
378
bool Base32Unescape(const char* src, size_t slen, string* dest);
379
0
inline bool Base32Unescape(const string& src, string* dest) {
380
0
  return Base32Unescape(src.data(), src.size(), dest);
381
0
}
382
383
// ----------------------------------------------------------------------
384
// Base32Escape()
385
//    Encode "src" to "dest" using base32 encoding.
386
//    src is not null terminated, instead specify len.
387
//    'dest' should have at least CalculateBase32EscapedLen() length.
388
//    RETURNS the length of dest. RETURNS 0 if szsrc is zero, or szdest is
389
//    too small to fit the fully encoded result.  'dest' is padded with '='.
390
//
391
//    Note that this is "Base 32 Encoding" from RFC 4648 section 6.
392
// ----------------------------------------------------------------------
393
size_t Base32Escape(const unsigned char* src, size_t szsrc, char* dest, size_t szdest);
394
bool Base32Escape(const string& src, string* dest);
395
396
// ----------------------------------------------------------------------
397
// Base32HexEscape()
398
//    Encode "src" to "dest" using base32hex encoding.
399
//    src is not null terminated, instead specify len.
400
//    'dest' should have at least CalculateBase32EscapedLen() length.
401
//    RETURNS the length of dest. RETURNS 0 if szsrc is zero, or szdest is
402
//    too small to fit the fully encoded result.  'dest' is padded with '='.
403
//
404
//    Note that this is "Base 32 Encoding with Extended Hex Alphabet"
405
//    from RFC 4648 section 7.
406
// ----------------------------------------------------------------------
407
size_t Base32HexEscape(const unsigned char* src, size_t szsrc, char* dest, size_t szdest);
408
bool Base32HexEscape(const string& src, string* dest);
409
410
// Return the length to use for the output buffer given to the base32 escape
411
// routines.  This function may return incorrect results if given input_len
412
// values that are extremely high, which should happen rarely.
413
size_t CalculateBase32EscapedLen(size_t input_len);
414
415
// ----------------------------------------------------------------------
416
// EightBase32DigitsToTenHexDigits()
417
// TenHexDigitsToEightBase32Digits()
418
//    Convert base32 to and from hex.
419
//
420
//   for EightBase32DigitsToTenHexDigits():
421
//     *in must point to 8 base32 digits.
422
//     *out must point to 10 bytes.
423
//
424
//   for TenHexDigitsToEightBase32Digits():
425
//     *in must point to 10 hex digits.
426
//     *out must point to 8 bytes.
427
//
428
//   Note that the Base64 functions above are different. They convert base64
429
//   to and from binary data. We convert to and from string representations
430
//   of hex. They deal with arbitrary lengths and we deal with single,
431
//   whole base32 quanta.
432
//
433
//   See RFC3548 at http://www.ietf.org/rfc/rfc3548.txt
434
//   for details on base32.
435
// ----------------------------------------------------------------------
436
void EightBase32DigitsToTenHexDigits(const char* in, char* out);
437
void TenHexDigitsToEightBase32Digits(const char* in, char* out);
438
439
// ----------------------------------------------------------------------
440
// EightBase32DigitsToFiveBytes()
441
// FiveBytesToEightBase32Digits()
442
//   Convert base32 to and from binary
443
//
444
//   for EightBase32DigitsToFiveBytes():
445
//     *in must point to 8 base32 digits.
446
//     *out must point to 5 bytes.
447
//
448
//   for FiveBytesToEightBase32Digits():
449
//     *in must point to 5 bytes.
450
//     *out must point to 8 bytes.
451
//
452
//   Note that the Base64 functions above are different.  They deal with
453
//   arbitrary lengths and we deal with single, whole base32 quanta.
454
// ----------------------------------------------------------------------
455
void EightBase32DigitsToFiveBytes(const char* in, unsigned char* bytes_out);
456
void FiveBytesToEightBase32Digits(const unsigned char* in_bytes, char* out);
457
458
// ----------------------------------------------------------------------
459
// EscapeFileName()
460
// UnescapeFileName()
461
//   Utility functions to (un)escape strings to make them suitable for use in
462
//   filenames. Characters not in [a-zA-Z0-9-_.] will be escaped into %XX.
463
//   E.g: "Hello, world!" will be escaped as "Hello%2c%20world%21"
464
//
465
//   NB that this function escapes slashes, so the output will be a flat
466
//   filename and will not keep the directory structure. Slashes are replaced
467
//   with '~', instead of a %XX sequence to make it easier for people to
468
//   understand the escaped form when the original string is a file path.
469
//
470
//   WARNING: filenames produced by these functions may not be compatible with
471
//   Colossus FS. In particular, the '%' character has a special meaning in
472
//   CFS.
473
//
474
//   The versions that receive a string for the output will append to it.
475
// ----------------------------------------------------------------------
476
void EscapeFileName(const GStringPiece& src, string* dst);
477
void UnescapeFileName(const GStringPiece& src, string* dst);
478
0
// Value-returning form of EscapeFileName(): runs the appending overload on a
// fresh, empty string and returns that string.
inline string EscapeFileName(const GStringPiece& src) {
  string escaped;
  EscapeFileName(src, &escaped);
  return escaped;
}
483
0
// Value-returning form of UnescapeFileName(): runs the appending overload on
// a fresh, empty string and returns that string.
inline string UnescapeFileName(const GStringPiece& src) {
  string unescaped;
  UnescapeFileName(src, &unescaped);
  return unescaped;
}
488
489
// ----------------------------------------------------------------------
490
// Here are a couple utility methods to change ints to hex chars & back
491
// ----------------------------------------------------------------------
492
493
0
inline int int_to_hex_digit(int i) {
494
0
  DCHECK((i >= 0) && (i <= 15));
495
0
  return ((i < 10) ? (i + '0') : ((i - 10) + 'A'));
496
0
}
497
498
0
inline int int_to_lower_hex_digit(int i) {
499
0
  DCHECK((i >= 0) && (i <= 15));
500
0
  return (i < 10) ? (i + '0') : ((i - 10) + 'a');
501
0
}
502
503
0
inline int hex_digit_to_int(char c) {
504
0
  /* Assume ASCII. */
505
0
  DCHECK('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61);
506
0
  DCHECK(ascii_isxdigit(c));
507
0
  int x = static_cast<unsigned char>(c);
508
0
  if (x > '9') {
509
0
    x += 9;
510
0
  }
511
0
  return x & 0xf;
512
0
}
513
514
// ----------------------------------------------------------------------
515
// a2b_hex()
516
//  Description: Ascii-to-Binary hex conversion.  This converts
517
//         2*'num' hexadecimal characters to 'num' binary data.
518
//        Return value: 'num' bytes of binary data (via the 'to' argument)
519
// ----------------------------------------------------------------------
520
void a2b_hex(const char* from, unsigned char* to, size_t num);
521
void a2b_hex(const char* from, char* to, size_t num);
522
void a2b_hex(const char* from, string* to, size_t num);
523
string a2b_hex(const string& a);
524
525
// ----------------------------------------------------------------------
526
// a2b_bin()
527
//  Description: Ascii-to-Binary binary conversion.  This converts
528
//        a.size() binary characters (ascii '0' or '1') to
529
//        ceil(a.size()/8) bytes of binary data.  The first character is
530
//        considered the most significant if byte_order_msb is set.  a is
531
//        considered to be padded with trailing 0s if its size is not a
532
//        multiple of 8.
533
//        Return value: ceil(a.size()/8) bytes of binary data
534
// ----------------------------------------------------------------------
535
string a2b_bin(const string& a, bool byte_order_msb);
536
537
// ----------------------------------------------------------------------
538
// b2a_hex()
539
//  Description: Binary-to-Ascii hex conversion.  This converts
540
//   'num' bytes of binary to a 2*'num'-character hexadecimal representation
541
//    Return value: 2*'num' characters of ascii text (via the 'to' argument)
542
// ----------------------------------------------------------------------
543
void b2a_hex(const unsigned char* from, char* to, size_t num);
544
void b2a_hex(const unsigned char* from, string* to, size_t num);
545
546
// ----------------------------------------------------------------------
547
// b2a_hex()
548
//  Description: Binary-to-Ascii hex conversion.  This converts
549
//   'num' bytes of binary to a 2*'num'-character hexadecimal representation
550
//    Return value: 2*'num' characters of ascii string
551
// ----------------------------------------------------------------------
552
string b2a_hex(const char* from, size_t num);
553
string b2a_hex(const GStringPiece& b);
554
555
// ----------------------------------------------------------------------
556
// b2a_bin()
557
//  Description: Binary-to-Ascii binary conversion.  This converts
558
//   b.size() bytes of binary to a 8*b.size() character representation
559
//   (ascii '0' or '1').  The highest order bit in each byte is returned
560
//   first in the string if byte_order_msb is set.
561
//   Return value: 8*b.size() characters of ascii text
562
// ----------------------------------------------------------------------
563
string b2a_bin(const string& b, bool byte_order_msb);
564
565
// ----------------------------------------------------------------------
566
// ShellEscape
567
//   Make a shell command argument from a string.
568
//   Returns a Bourne shell string literal such that, once the shell finishes
569
//   expanding the argument, the argument passed on to the program being
570
//   run will be the same as whatever you passed in.
571
//   NOTE: This is "ported" from python2.2's commands.mkarg(); it should be
572
//         safe for Bourne shell syntax (i.e. sh, bash), but mileage may vary
573
//         with other shells.
574
// ----------------------------------------------------------------------
575
string ShellEscape(GStringPiece src);
576
577
// Runs ShellEscape() on the arguments, concatenates them with a space, and
578
// returns the resulting string.
579
template <class InputIterator>
580
string ShellEscapeCommandLine(InputIterator begin, const InputIterator& end) {
581
  string result;
582
  for (; begin != end; ++begin) {
583
    if (!result.empty()) result.append(" ");
584
    result.append(ShellEscape(*begin));
585
  }
586
  return result;
587
}
588
589
// Reads at most bytes_to_read from binary_string and writes it to
590
// ascii_string in lower case hex.
591
void ByteStringToAscii(const string& binary_string, size_t bytes_to_read,
592
                       string* ascii_string);
593
594
// Value-returning form of ByteStringToAscii(): passes a fresh, empty string
// to the three-argument overload as its output and returns that string.
inline string ByteStringToAscii(const string& binary_string,
                                size_t bytes_to_read) {
  string ascii;
  ByteStringToAscii(binary_string, bytes_to_read, &ascii);
  return ascii;
}
600
601
// Converts the hex from ascii_string into binary data and
602
// writes the binary data into binary_string.
603
// Empty input successfully converts to empty output.
604
// Returns false and may modify output if it is
605
// unable to parse the hex string.
606
bool ByteStringFromAscii(const string& ascii_string, string* binary_string);
607
608
// Clean up a multi-line string to conform to Unix line endings.
609
// Reads from src and appends to dst, so usually dst should be empty.
610
// If there is no line ending at the end of a non-empty string, it can
611
// be added automatically.
612
//
613
// Four different types of input are correctly handled:
614
//
615
//   - Unix/Linux files: line ending is LF, pass through unchanged
616
//
617
//   - DOS/Windows files: line ending is CRLF: convert to LF
618
//
619
//   - Legacy Mac files: line ending is CR: convert to LF
620
//
621
//   - Garbled files: random line endings, convert gracefully
622
//                    lonely CR, lonely LF, CRLF: convert to LF
623
//
624
//   @param src The multi-line string to convert
625
//   @param dst The converted string is appended to this string
626
//   @param auto_end_last_line Automatically terminate the last line
627
//
628
//   Limitations:
629
//
630
//     This does not do the right thing for CRCRLF files created by
631
//     broken programs that do another Unix->DOS conversion on files
632
//     that are already in CRLF format.
633
void CleanStringLineEndings(const string& src, string* dst,
634
                            bool auto_end_last_line);
635
636
// Same as above, but transforms the argument in place.
637
void CleanStringLineEndings(string* str, bool auto_end_last_line);
638
639
}  // namespace strings
640
641
// The following functions used to be defined in strutil.h in the top-level
642
// namespace, so we alias them here. Do not add new functions here.
643
//
644
//             Talk to him if you want to help.
645
//
646
// DEPRECATED(mec): Using these names in the global namespace is deprecated.
647
// Use the strings:: names.
648
649
using strings::EscapeStrForCSV;
650
using strings::UnescapeCEscapeSequences;
651
using strings::UnescapeCEscapeString;
652
using strings::CEscapeString;
653
using strings::CHexEscapeString;
654
using strings::CEscape;
655
using strings::CHexEscape;
656
using strings::BackslashEscape;
657
using strings::BackslashUnescape;
658
using strings::QuotedPrintableUnescape;
659
using strings::QEncodingUnescape;
660
using strings::Base64Unescape;
661
using strings::WebSafeBase64Unescape;
662
using strings::CalculateBase64EscapedLen;
663
using strings::Base64Escape;
664
using strings::WebSafeBase64Escape;
665
using strings::WebSafeBase64EscapeWithPadding;
666
using strings::Base32Escape;
667
using strings::Base32HexEscape;
668
using strings::CalculateBase32EscapedLen;
669
using strings::EightBase32DigitsToTenHexDigits;
670
using strings::TenHexDigitsToEightBase32Digits;
671
using strings::EightBase32DigitsToFiveBytes;
672
using strings::FiveBytesToEightBase32Digits;
673
using strings::int_to_hex_digit;
674
using strings::int_to_lower_hex_digit;
675
using strings::hex_digit_to_int;
676
using strings::a2b_hex;
677
using strings::a2b_bin;
678
using strings::b2a_hex;
679
using strings::b2a_bin;
680
using strings::ShellEscape;
681
using strings::ShellEscapeCommandLine;
682
using strings::ByteStringFromAscii;
683
using strings::ByteStringToAscii;
684
using strings::CleanStringLineEndings;
685
686
#endif  // YB_GUTIL_STRINGS_ESCAPING_H