YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/gutil/strings/escaping.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2008 Google Inc. All Rights Reserved.
2
// Authors: Numerous. See the .h for contact people.
3
//
4
// The following only applies to changes made to this file as part of YugaByte development.
5
//
6
// Portions Copyright (c) YugaByte, Inc.
7
//
8
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
9
// in compliance with the License.  You may obtain a copy of the License at
10
//
11
// http://www.apache.org/licenses/LICENSE-2.0
12
//
13
// Unless required by applicable law or agreed to in writing, software distributed under the License
14
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
15
// or implied.  See the License for the specific language governing permissions and limitations
16
// under the License.
17
//
18
19
#include "yb/gutil/strings/escaping.h"
20
21
#include <assert.h>
22
#include <stdio.h>
23
#include <string.h>
24
25
#include <limits>
26
#include <memory>
27
#include <vector>
28
29
#include "yb/gutil/charmap.h"
30
#include "yb/gutil/integral_types.h"
31
#include "yb/gutil/port.h"
32
#include "yb/gutil/stl_util.h"
33
#include "yb/gutil/strings/join.h"
34
#include "yb/gutil/utf/utf.h"  // for runetochar
35
36
using std::numeric_limits;
37
using std::vector;
38
39
40
namespace strings {
41
42
// These are used for the leave_nulls_escaped argument to CUnescapeInternal().
43
// Named values for the leave_nulls_escaped argument of CUnescapeInternal().
// They are compile-time constants: nothing is meant to modify them at runtime.
static constexpr bool kUnescapeNulls = false;
static constexpr bool kLeaveNullsEscaped = true;
45
46
// ----------------------------------------------------------------------
47
// EscapeStrForCSV()
48
//    Escapes the quotes in 'src' by doubling them. This is necessary
49
//    for generating CSV files (see SplitCSVLine).
50
//    Returns the number of characters written into dest (not counting
51
//    the \0) or -1 if there was insufficient space. Dest could end up
52
//    twice as long as src.
53
//
54
//    Example: [some "string" to test] --> [some ""string"" to test]
55
// ----------------------------------------------------------------------
56
0
// Copies the NUL-terminated 'src' into 'dest', writing every double quote
// twice, and NUL-terminates the result.
//
// Returns the number of characters stored (terminator excluded). When 'dest'
// (capacity 'dest_len') is too small the failure value -1 is returned, which
// in this unsigned return type is static_cast<size_t>(-1); 'dest' is then
// left partially written and not terminated.
size_t EscapeStrForCSV(const char* src, char* dest, size_t dest_len) {
  size_t written = 0;

  for (const char* in = src; ; ++in) {
    const char c = *in;

    // End of input: succeed only if the terminator still fits.
    if (c == '\0' && written < dest_len) {
      dest[written] = '\0';
      return written;
    }

    // Conservatively require room for two characters, since a quote expands
    // to two and a terminator must follow any single character anyway.
    if (written + 1 >= dest_len) {
      return static_cast<size_t>(-1);
    }

    if (c == '"') {
      dest[written++] = '"';
    }
    dest[written++] = c;
  }
}
74
75
// ----------------------------------------------------------------------
76
// UnescapeCEscapeSequences()
77
//    This does all the unescaping that C does: \ooo, \r, \n, etc
78
//    Returns length of resulting string.
79
//    The implementation of \x parses any positive number of hex digits,
80
//    but it is an error if the value requires more than 8 bits, and the
81
//    result is truncated to 8 bits. The same is true for octals.
82
//
83
//    The second call stores its errors in a supplied string vector.
84
//    If the string vector pointer is NULL, it reports the errors with LOG().
85
//
86
//    *** DEPRECATED: Use CUnescape() in new code ***
87
//
88
//    NOTE: any changes to this function must also be reflected in the newer
89
//    CUnescape().
90
// ----------------------------------------------------------------------
91
92
0
// True when character 'c' is one of '0'..'7'. The argument is expanded twice,
// so it must not have side effects (pass p[1], not *++p).
#define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
93
94
0
size_t UnescapeCEscapeSequences(const char* source, char* dest) {
95
0
  return UnescapeCEscapeSequences(source, dest, nullptr);
96
0
}
97
98
0
size_t UnescapeCEscapeSequences(const char* source, char* dest, vector<string> *errors) {
99
0
  char* d = dest;
100
0
  const char* p = source;
101
102
  // Small optimization for case where source = dest and there's no escaping
103
0
  while ( p == d && *p != '\0' && *p != '\\' )
104
0
    p++, d++;
105
106
0
  while (*p != '\0') {
107
0
    if (*p != '\\') {
108
0
      *d++ = *p++;
109
0
    } else {
110
0
      switch ( *++p ) {                    // skip past the '\\'
111
0
        case '\0':
112
0
          LOG_STRING(ERROR, errors) << "String cannot end with \\";
113
0
          *d = '\0';
114
0
          return d - dest;   // we're done with p
115
0
        case 'a':  *d++ = '\a';  break;
116
0
        case 'b':  *d++ = '\b';  break;
117
0
        case 'f':  *d++ = '\f';  break;
118
0
        case 'n':  *d++ = '\n';  break;
119
0
        case 'r':  *d++ = '\r';  break;
120
0
        case 't':  *d++ = '\t';  break;
121
0
        case 'v':  *d++ = '\v';  break;
122
0
        case '\\': *d++ = '\\';  break;
123
0
        case '?':  *d++ = '\?';  break;    // \?  Who knew?
124
0
        case '\'': *d++ = '\'';  break;
125
0
        case '"':  *d++ = '\"';  break;
126
0
        case '0': case '1': case '2': case '3':  // octal digit: 1 to 3 digits
127
0
        case '4': case '5': case '6': case '7': {
128
0
          const char *octal_start = p;
129
0
          unsigned int ch = *p - '0';
130
0
          if ( IS_OCTAL_DIGIT(p[1]) )
131
0
            ch = ch * 8 + *++p - '0';
132
0
          if ( IS_OCTAL_DIGIT(p[1]) )      // safe (and easy) to do this twice
133
0
            ch = ch * 8 + *++p - '0';      // now points at last digit
134
0
          if (ch > 0xFF)
135
0
            LOG_STRING(ERROR, errors) << "Value of " <<
136
0
              "\\" << string(octal_start, p+1-octal_start) <<
137
0
              " exceeds 8 bits";
138
0
          *d++ = ch;
139
0
          break;
140
0
        }
141
0
        case 'x': case 'X': {
142
0
          if (!ascii_isxdigit(p[1])) {
143
0
            if (p[1] == '\0') {
144
0
              LOG_STRING(ERROR, errors) << "String cannot end with \\x";
145
0
            } else {
146
0
              LOG_STRING(ERROR, errors) <<
147
0
                "\\x cannot be followed by a non-hex digit: \\" << *p << p[1];
148
0
            }
149
0
            break;
150
0
          }
151
0
          unsigned int ch = 0;
152
0
          const char *hex_start = p;
153
0
          while (ascii_isxdigit(p[1]))  // arbitrarily many hex digits
154
0
            ch = (ch << 4) + hex_digit_to_int(*++p);
155
0
          if (ch > 0xFF)
156
0
            LOG_STRING(ERROR, errors) << "Value of " <<
157
0
              "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits";
158
0
          *d++ = ch;
159
0
          break;
160
0
        }
161
0
        case 'u': {
162
          // \uhhhh => convert 4 hex digits to UTF-8
163
0
          char32 rune = 0;
164
0
          const char *hex_start = p;
165
0
          for (int i = 0; i < 4; ++i) {
166
0
            if (ascii_isxdigit(p[1])) {  // Look one char ahead.
167
0
              rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
168
0
            } else {
169
0
              LOG_STRING(ERROR, errors)
170
0
                << "\\u must be followed by 4 hex digits: \\"
171
0
                <<  string(hex_start, p+1-hex_start);
172
0
              break;
173
0
            }
174
0
          }
175
0
          d += runetochar(d, &rune);
176
0
          break;
177
0
        }
178
0
        case 'U': {
179
          // \Uhhhhhhhh => convert 8 hex digits to UTF-8
180
0
          char32 rune = 0;
181
0
          const char *hex_start = p;
182
0
          for (int i = 0; i < 8; ++i) {
183
0
            if (ascii_isxdigit(p[1])) {  // Look one char ahead.
184
              // Don't change rune until we're sure this
185
              // is within the Unicode limit, but do advance p.
186
0
              char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
187
0
              if (newrune > 0x10FFFF) {
188
0
                LOG_STRING(ERROR, errors)
189
0
                  << "Value of \\"
190
0
                  << string(hex_start, p + 1 - hex_start)
191
0
                  << " exceeds Unicode limit (0x10FFFF)";
192
0
                break;
193
0
              } else {
194
0
                rune = newrune;
195
0
              }
196
0
            } else {
197
0
              LOG_STRING(ERROR, errors)
198
0
                << "\\U must be followed by 8 hex digits: \\"
199
0
                <<  string(hex_start, p+1-hex_start);
200
0
              break;
201
0
            }
202
0
          }
203
0
          d += runetochar(d, &rune);
204
0
          break;
205
0
        }
206
0
        default:
207
0
          LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
208
0
      }
209
0
      p++;                                 // read past letter we escaped
210
0
    }
211
0
  }
212
0
  *d = '\0';
213
0
  return d - dest;
214
0
}
215
216
// ----------------------------------------------------------------------
217
// UnescapeCEscapeString()
218
//    This does the same thing as UnescapeCEscapeSequences, but creates
219
//    a new string. The caller does not need to worry about allocating
220
//    a dest buffer. This should be used for non performance critical
221
//    tasks such as printing debug messages. It is safe for src and dest
222
//    to be the same.
223
//
224
//    The second call stores its errors in a supplied string vector.
225
//    If the string vector pointer is NULL, it reports the errors with LOG().
226
//
227
//    In the first and second calls, the length of dest is returned. In the
228
//    third call, the new string is returned.
229
//
230
//    *** DEPRECATED: Use CUnescape() in new code ***
231
//
232
// ----------------------------------------------------------------------
233
0
size_t UnescapeCEscapeString(const string& src, string* dest) {
234
0
  return UnescapeCEscapeString(src, dest, nullptr);
235
0
}
236
237
0
size_t UnescapeCEscapeString(const string& src, string* dest, vector<string> *errors) {
238
0
  CHECK(dest);
239
0
  dest->resize(src.size() + 1);
240
0
  auto len = UnescapeCEscapeSequences(src.c_str(), const_cast<char*>(dest->data()), errors);
241
0
  dest->resize(len);
242
0
  return len;
243
0
}
244
245
0
// Returns the unescaped form of 'src' as a new string. Errors are not
// collected (a null error vector is passed down).
string UnescapeCEscapeString(const string& src) {
  // Scratch space: input length plus the terminator the worker appends.
  const size_t capacity = src.size() + 1;
  std::unique_ptr<char[]> scratch(new char[capacity]);
  char* const out = scratch.get();

  const auto out_len = UnescapeCEscapeSequences(src.c_str(), out, nullptr);
  return string(out, out_len);
}
250
251
// ----------------------------------------------------------------------
252
// CUnescapeInternal()
253
//    Implements both CUnescape() and CUnescapeForNullTerminatedString().
254
//
255
//    Unescapes C escape sequences and is the reverse of CEscape().
256
//
257
//    If 'source' is valid, stores the unescaped string and its size in
258
//    'dest' and 'dest_len' respectively, and returns true. Otherwise
259
//    returns false and optionally stores the error description in
260
//    'error'. Set 'error' to NULL to disable error reporting.
261
//
262
//    'dest' should point to a buffer that is at least as big as 'source'.
263
//    'source' and 'dest' may be the same.
264
//
265
//     NOTE: any changes to this function must also be reflected in the older
266
//     UnescapeCEscapeSequences().
267
// ----------------------------------------------------------------------
268
static bool CUnescapeInternal(const GStringPiece& source,
269
                              bool leave_nulls_escaped,
270
                              char* dest,
271
                              size_t* dest_len,
272
0
                              string* error) {
273
0
  char* d = dest;
274
0
  const char* p = source.data();
275
0
  const char* end = source.end();
276
0
  const char* last_byte = end - 1;
277
278
  // Small optimization for case where source = dest and there's no escaping
279
0
  while (p == d && p < end && *p != '\\')
280
0
    p++, d++;
281
282
0
  while (p < end) {
283
0
    if (*p != '\\') {
284
0
      *d++ = *p++;
285
0
    } else {
286
0
      if (++p > last_byte) {       // skip past the '\\'
287
0
        if (error) *error = "String cannot end with \\";
288
0
        return false;
289
0
      }
290
0
      switch (*p) {
291
0
        case 'a':  *d++ = '\a';  break;
292
0
        case 'b':  *d++ = '\b';  break;
293
0
        case 'f':  *d++ = '\f';  break;
294
0
        case 'n':  *d++ = '\n';  break;
295
0
        case 'r':  *d++ = '\r';  break;
296
0
        case 't':  *d++ = '\t';  break;
297
0
        case 'v':  *d++ = '\v';  break;
298
0
        case '\\': *d++ = '\\';  break;
299
0
        case '?':  *d++ = '\?';  break;    // \?  Who knew?
300
0
        case '\'': *d++ = '\'';  break;
301
0
        case '"':  *d++ = '\"';  break;
302
0
        case '0': case '1': case '2': case '3':  // octal digit: 1 to 3 digits
303
0
        case '4': case '5': case '6': case '7': {
304
0
          const char *octal_start = p;
305
0
          unsigned int ch = *p - '0';
306
0
          if (p < last_byte && IS_OCTAL_DIGIT(p[1]))
307
0
            ch = ch * 8 + *++p - '0';
308
0
          if (p < last_byte && IS_OCTAL_DIGIT(p[1]))
309
0
            ch = ch * 8 + *++p - '0';      // now points at last digit
310
0
          if (ch > 0xff) {
311
0
            if (error) {
312
0
              *error = "Value of \\" +
313
0
                  string(octal_start, p + 1 - octal_start) +
314
0
                  " exceeds 0xff";
315
0
            }
316
0
            return false;
317
0
          }
318
0
          if ((ch == 0) && leave_nulls_escaped) {
319
            // Copy the escape sequence for the null character
320
0
            const size_t octal_size = p + 1 - octal_start;
321
0
            *d++ = '\\';
322
0
            memcpy(d, octal_start, octal_size);
323
0
            d += octal_size;
324
0
            break;
325
0
          }
326
0
          *d++ = ch;
327
0
          break;
328
0
        }
329
0
        case 'x': case 'X': {
330
0
          if (p >= last_byte) {
331
0
            if (error) *error = "String cannot end with \\x";
332
0
            return false;
333
0
          } else if (!ascii_isxdigit(p[1])) {
334
0
            if (error) *error = "\\x cannot be followed by a non-hex digit";
335
0
            return false;
336
0
          }
337
0
          unsigned int ch = 0;
338
0
          const char *hex_start = p;
339
0
          while (p < last_byte && ascii_isxdigit(p[1]))
340
            // Arbitrarily many hex digits
341
0
            ch = (ch << 4) + hex_digit_to_int(*++p);
342
0
          if (ch > 0xFF) {
343
0
            if (error) {
344
0
              *error = "Value of \\" + string(hex_start, p + 1 - hex_start) +
345
0
                  " exceeds 0xff";
346
0
            }
347
0
            return false;
348
0
          }
349
0
          if ((ch == 0) && leave_nulls_escaped) {
350
            // Copy the escape sequence for the null character
351
0
            const size_t hex_size = p + 1 - hex_start;
352
0
            *d++ = '\\';
353
0
            memcpy(d, hex_start, hex_size);
354
0
            d += hex_size;
355
0
            break;
356
0
          }
357
0
          *d++ = ch;
358
0
          break;
359
0
        }
360
0
        case 'u': {
361
          // \uhhhh => convert 4 hex digits to UTF-8
362
0
          char32 rune = 0;
363
0
          const char *hex_start = p;
364
0
          if (p + 4 >= end) {
365
0
            if (error) {
366
0
              *error = "\\u must be followed by 4 hex digits: \\" +
367
0
                  string(hex_start, p + 1 - hex_start);
368
0
            }
369
0
            return false;
370
0
          }
371
0
          for (int i = 0; i < 4; ++i) {
372
            // Look one char ahead.
373
0
            if (ascii_isxdigit(p[1])) {
374
0
              rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
375
0
            } else {
376
0
              if (error) {
377
0
                *error = "\\u must be followed by 4 hex digits: \\" +
378
0
                    string(hex_start, p + 1 - hex_start);
379
0
              }
380
0
              return false;
381
0
            }
382
0
          }
383
0
          if ((rune == 0) && leave_nulls_escaped) {
384
            // Copy the escape sequence for the null character
385
0
            *d++ = '\\';
386
0
            memcpy(d, hex_start, 5);  // u0000
387
0
            d += 5;
388
0
            break;
389
0
          }
390
0
          d += runetochar(d, &rune);
391
0
          break;
392
0
        }
393
0
        case 'U': {
394
          // \Uhhhhhhhh => convert 8 hex digits to UTF-8
395
0
          char32 rune = 0;
396
0
          const char *hex_start = p;
397
0
          if (p + 8 >= end) {
398
0
            if (error) {
399
0
              *error = "\\U must be followed by 8 hex digits: \\" +
400
0
                  string(hex_start, p + 1 - hex_start);
401
0
            }
402
0
            return false;
403
0
          }
404
0
          for (int i = 0; i < 8; ++i) {
405
            // Look one char ahead.
406
0
            if (ascii_isxdigit(p[1])) {
407
              // Don't change rune until we're sure this
408
              // is within the Unicode limit, but do advance p.
409
0
              char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
410
0
              if (newrune > 0x10FFFF) {
411
0
                if (error) {
412
0
                  *error = "Value of \\" +
413
0
                      string(hex_start, p + 1 - hex_start) +
414
0
                      " exceeds Unicode limit (0x10FFFF)";
415
0
                }
416
0
                return false;
417
0
              } else {
418
0
                rune = newrune;
419
0
              }
420
0
            } else {
421
0
              if (error) {
422
0
                *error = "\\U must be followed by 8 hex digits: \\" +
423
0
                    string(hex_start, p + 1 - hex_start);
424
0
              }
425
0
              return false;
426
0
            }
427
0
          }
428
0
          if ((rune == 0) && leave_nulls_escaped) {
429
            // Copy the escape sequence for the null character
430
0
            *d++ = '\\';
431
0
            memcpy(d, hex_start, 9);  // U00000000
432
0
            d += 9;
433
0
            break;
434
0
          }
435
0
          d += runetochar(d, &rune);
436
0
          break;
437
0
        }
438
0
        default: {
439
0
          if (error) *error = string("Unknown escape sequence: \\") + *p;
440
0
          return false;
441
0
        }
442
0
      }
443
0
      p++;                                 // read past letter we escaped
444
0
    }
445
0
  }
446
0
  *dest_len = d - dest;
447
0
  return true;
448
0
}
449
450
// ----------------------------------------------------------------------
451
// CUnescapeInternal()
452
//
453
//    Same as above but uses a C++ string for output. 'source' and 'dest'
454
//    may be the same.
455
// ----------------------------------------------------------------------
456
bool CUnescapeInternal(const GStringPiece& source,
457
                       bool leave_nulls_escaped,
458
                       string* dest,
459
0
                       string* error) {
460
0
  dest->resize(source.size());
461
0
  size_t dest_size;
462
0
  if (!CUnescapeInternal(source,
463
0
                         leave_nulls_escaped,
464
0
                         const_cast<char*>(dest->data()),
465
0
                         &dest_size,
466
0
                         error)) {
467
0
    return false;
468
0
  }
469
0
  dest->resize(dest_size);
470
0
  return true;
471
0
}
472
473
// ----------------------------------------------------------------------
474
// CUnescape()
475
//
476
// See CUnescapeInternal() for implementation details.
477
// ----------------------------------------------------------------------
478
0
bool CUnescape(const GStringPiece& source, char* dest, size_t* dest_len, string* error) {
479
0
  return CUnescapeInternal(source, kUnescapeNulls, dest, dest_len, error);
480
0
}
481
482
0
bool CUnescape(const GStringPiece& source, string* dest, string* error) {
483
0
  return CUnescapeInternal(source, kUnescapeNulls, dest, error);
484
0
}
485
486
// ----------------------------------------------------------------------
487
// CUnescapeForNullTerminatedString()
488
//
489
// See CUnescapeInternal() for implementation details.
490
// ----------------------------------------------------------------------
491
bool CUnescapeForNullTerminatedString(const GStringPiece& source,
492
                                      char* dest,
493
                                      size_t* dest_len,
494
0
                                      string* error) {
495
0
  return CUnescapeInternal(source, kLeaveNullsEscaped, dest, dest_len, error);
496
0
}
497
498
bool CUnescapeForNullTerminatedString(const GStringPiece& source,
499
                                      string* dest,
500
0
                                      string* error) {
501
0
  return CUnescapeInternal(source, kLeaveNullsEscaped, dest, error);
502
0
}
503
504
// ----------------------------------------------------------------------
505
// CEscapeString()
506
// CHexEscapeString()
507
// Utf8SafeCEscapeString()
508
// Utf8SafeCHexEscapeString()
509
//    Copies 'src' to 'dest', escaping dangerous characters using
510
//    C-style escape sequences. This is very useful for preparing query
511
//    flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
512
//    hexadecimal rather than octal sequences. The 'Utf8Safe' version doesn't
513
//    touch UTF-8 bytes.
514
//    Returns the number of bytes written to 'dest' (not including the \0)
515
//    or -1 if there was insufficient space.
516
//
517
//    Currently only \n, \r, \t, ", ', \ and !ascii_isprint() chars are escaped.
518
// ----------------------------------------------------------------------
519
size_t CEscapeInternal(
520
5
    const char* src, size_t src_len, char* dest, size_t dest_len, bool use_hex, bool utf8_safe) {
521
5
  const char* src_end = src + src_len;
522
5
  size_t used = 0;
523
5
  bool last_hex_escape = false;  // true if last output char was \xNN
524
525
56
  for (; src < src_end; 
src++51
) {
526
51
    if (dest_len - used < 2)   // Need space for two letter escape
527
0
      return -1;
528
529
51
    bool is_hex_escape = false;
530
51
    unsigned char cur = *src;
531
51
    switch (cur) {
532
0
      case '\n': dest[used++] = '\\'; dest[used++] = 'n';  break;
533
0
      case '\r': dest[used++] = '\\'; dest[used++] = 'r';  break;
534
0
      case '\t': dest[used++] = '\\'; dest[used++] = 't';  break;
535
0
      case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
536
0
      case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
537
0
      case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
538
51
      default:
539
        // Note that if we emit \xNN and the src character after that is a hex
540
        // digit then that digit must be escaped too to prevent it being
541
        // interpreted as part of the character code by C.
542
51
        if ((!utf8_safe || 
cur < 0x8040
) &&
543
51
            
(49
!ascii_isprint(cur)49
||
544
49
             (last_hex_escape && 
ascii_isxdigit(cur)0
))) {
545
0
          if (dest_len - used < 4)  // need space for 4 letter escape
546
0
            return -1;
547
0
          snprintf(dest + used, dest_len - used, (use_hex ? "\\x%02x" : "\\%03o"), cur);
548
0
          is_hex_escape = use_hex;
549
0
          used += 4;
550
51
        } else {
551
51
          dest[used++] = cur;
552
51
          break;
553
51
        }
554
51
    }
555
51
    last_hex_escape = is_hex_escape;
556
51
  }
557
558
5
  if (dest_len - used < 1)   // make sure that there is room for \0
559
0
    return -1;
560
561
5
  dest[used] = '\0';   // doesn't count towards return value though
562
5
  return used;
563
5
}
564
565
0
size_t CEscapeString(const char* src, size_t src_len, char* dest, size_t dest_len) {
566
0
  return CEscapeInternal(src, src_len, dest, dest_len, false, false);
567
0
}
568
569
0
size_t CHexEscapeString(const char* src, size_t src_len, char* dest, size_t dest_len) {
570
0
  return CEscapeInternal(src, src_len, dest, dest_len, true, false);
571
0
}
572
573
0
size_t Utf8SafeCEscapeString(const char* src, size_t src_len, char* dest, size_t dest_len) {
574
0
  return CEscapeInternal(src, src_len, dest, dest_len, false, true);
575
0
}
576
577
0
size_t Utf8SafeCHexEscapeString(const char* src, size_t src_len, char* dest, size_t dest_len) {
578
0
  return CEscapeInternal(src, src_len, dest, dest_len, true, true);
579
0
}
580
581
// ----------------------------------------------------------------------
582
// CEscape()
583
// CHexEscape()
584
// Utf8SafeCEscape()
585
// Utf8SafeCHexEscape()
586
//    Copies 'src' to result, escaping dangerous characters using
587
//    C-style escape sequences. This is very useful for preparing query
588
//    flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
589
//    hexadecimal rather than octal sequences. The 'Utf8Safe' version
590
//    doesn't touch UTF-8 bytes.
591
//
592
//    Currently only \n, \r, \t, ", ', \ and !ascii_isprint() chars are escaped.
593
// ----------------------------------------------------------------------
594
0
// Shared implementation of CEscape(), CHexEscape(), Utf8SafeCEscape() and
// Utf8SafeCHexEscape(): runs CEscapeInternal() over 'src' with the given
// flags and returns the escaped text as a new string.
static string CEscapeToNewString(const GStringPiece& src, bool use_hex, bool utf8_safe) {
  // Maximum possible expansion: every byte becomes a 4-character escape, plus
  // one byte for the terminator CEscapeInternal() appends.
  const size_t dest_length = src.size() * 4 + 1;
  std::unique_ptr<char[]> dest(new char[dest_length]);
  const size_t len = CEscapeInternal(src.data(), src.size(),
                                     dest.get(), dest_length, use_hex, utf8_safe);
  // CEscapeInternal() reports failure as (size_t)-1. The length is unsigned,
  // so a ">= 0" check could never fire; compare against the sentinel instead.
  DCHECK_NE(len, static_cast<size_t>(-1));
  return string(dest.get(), len);
}

// Octal escapes; bytes >= 0x80 are escaped too.
string CEscape(const GStringPiece& src) {
  return CEscapeToNewString(src, /* use_hex = */ false, /* utf8_safe = */ false);
}

// Hexadecimal escapes; bytes >= 0x80 are escaped too.
string CHexEscape(const GStringPiece& src) {
  return CEscapeToNewString(src, /* use_hex = */ true, /* utf8_safe = */ false);
}

// Octal escapes; bytes >= 0x80 are copied through unchanged.
string Utf8SafeCEscape(const GStringPiece& src) {
  return CEscapeToNewString(src, /* use_hex = */ false, /* utf8_safe = */ true);
}

// Hexadecimal escapes; bytes >= 0x80 are copied through unchanged.
string Utf8SafeCHexEscape(const GStringPiece& src) {
  return CEscapeToNewString(src, /* use_hex = */ true, /* utf8_safe = */ true);
}
629
630
// ----------------------------------------------------------------------
631
// BackslashEscape and BackslashUnescape
632
// ----------------------------------------------------------------------
633
void BackslashEscape(const GStringPiece& src,
634
                     const strings::CharSet& to_escape,
635
0
                     string* dest) {
636
0
  dest->reserve(dest->size() + src.size());
637
0
  for (const char *p = src.data(), *end = src.data() + src.size();
638
0
       p != end; ) {
639
    // Advance to next character we need to escape, or to end of source
640
0
    const char* next = p;
641
0
    while (next < end && !to_escape.Test(*next)) {
642
0
      next++;
643
0
    }
644
    // Append the whole run of non-escaped chars
645
0
    dest->append(p, next - p);
646
0
    if (next == end) break;
647
    // Char at *next needs to be escaped.  Append backslash followed by *next
648
0
    char c[2];
649
0
    c[0] = '\\';
650
0
    c[1] = *next;
651
0
    dest->append(c, 2);
652
0
    p = next + 1;
653
0
  }
654
0
}
655
656
void BackslashUnescape(const GStringPiece& src,
657
                       const strings::CharSet& to_unescape,
658
0
                       string* dest) {
659
0
  dest->reserve(dest->size() + src.size());
660
0
  bool escaped = false;
661
0
  for (const char* p = src.data(), *end = src.data() + src.size();
662
0
       p != end; ++p) {
663
0
    if (escaped) {
664
0
      if (!to_unescape.Test(*p)) {
665
        // Keep the backslash
666
0
        dest->push_back('\\');
667
0
      }
668
0
      dest->push_back(*p);
669
0
      escaped = false;
670
0
    } else if (*p == '\\') {
671
0
      escaped = true;
672
0
    } else {
673
0
      dest->push_back(*p);
674
0
    }
675
0
  }
676
0
}
677
678
// ----------------------------------------------------------------------
679
// int QuotedPrintableUnescape()
680
//
681
// Check out http://www.cis.ohio-state.edu/htbin/rfc/rfc2045.html for
682
// more details, only briefly implemented. But from the web...
683
// Quoted-printable is an encoding method defined in the MIME
684
// standard. It is used primarily to encode 8-bit text (such as text
685
// that includes foreign characters) into 7-bit US ASCII, creating a
686
// document that is mostly readable by humans, even in its encoded
687
// form. All MIME compliant applications can decode quoted-printable
688
// text, though they may not necessarily be able to properly display the
689
// document as it was originally intended. As quoted-printable encoding
690
// is implemented most commonly, printable ASCII characters (values 33
691
// through 126, excluding 61), tabs and spaces that do not appear at the
692
// end of lines, and end-of-line characters are not encoded. Other
693
// characters are represented by an equal sign (=) immediately followed
694
// by that character's hexadecimal value. Lines that are longer than 76
695
// characters are shortened by line breaks, with the equal sign marking
696
// where the breaks occurred.
697
//
698
// Note that QuotedPrintableUnescape is different from 'Q'-encoding as
699
// defined in rfc2047. In particular, This does not treat '_'s as spaces.
700
// See QEncodingUnescape().
701
// ----------------------------------------------------------------------
702
703
0
size_t QuotedPrintableUnescape(const char *source, size_t slen, char *dest, size_t szdest) {
704
0
  char* d = dest;
705
0
  const char* p = source;
706
707
0
  while ( p < source+slen && *p != '\0' && d < dest+szdest ) {
708
0
    switch (*p) {
709
0
      case '=':
710
        // If it's valid, convert to hex and insert or remove line-wrap.
711
        // In the case of line-wrap removal, we allow LF as well as CRLF.
712
0
        if ( p < source + slen - 1 ) {
713
0
          if ( p[1] == '\n' ) {
714
0
            p++;
715
0
          } else if ( p < source + slen - 2 ) {
716
0
            if ( ascii_isxdigit(p[1]) && ascii_isxdigit(p[2]) ) {
717
0
              *d++ = hex_digit_to_int(p[1])*16 + hex_digit_to_int(p[2]);
718
0
              p += 2;
719
0
            } else if ( p[1] == '\r' && p[2] == '\n' ) {
720
0
              p += 2;
721
0
            }
722
0
          }
723
0
        }
724
0
        p++;
725
0
        break;
726
0
      default:
727
0
        *d++ = *p++;
728
0
        break;
729
0
    }
730
0
  }
731
0
  return (d-dest);
732
0
}
733
734
// ----------------------------------------------------------------------
735
// size_t QEncodingUnescape()
736
//
737
// This is very similar to QuotedPrintableUnescape except that we convert
738
// '_'s into spaces. (See RFC 2047)
739
// ----------------------------------------------------------------------
740
0
size_t QEncodingUnescape(const char *source, size_t slen, char *dest, size_t szdest) {
741
0
  char* d = dest;
742
0
  const char* p = source;
743
744
0
  while ( p < source+slen && *p != '\0' && d < dest+szdest ) {
745
0
    switch (*p) {
746
0
      case '=':
747
        // If it's valid, convert to hex and insert or remove line-wrap.
748
        // In the case of line-wrap removal, the assumption is that this
749
        // is an RFC-compliant message with lines terminated by CRLF.
750
0
        if (p < source+slen-2) {
751
0
          if ( ascii_isxdigit(p[1]) && ascii_isxdigit(p[2]) ) {
752
0
            *d++ = hex_digit_to_int(p[1])*16 + hex_digit_to_int(p[2]);
753
0
            p += 2;
754
0
          } else if ( p[1] == '\r' && p[2] == '\n' ) {
755
0
            p += 2;
756
0
          }
757
0
        }
758
0
        p++;
759
0
        break;
760
0
      case '_':   // According to rfc2047, _'s are to be treated as spaces
761
0
        *d++ = ' ';
762
0
        p++;
763
0
        break;
764
0
      default:
765
0
        *d++ = *p++;
766
0
        break;
767
0
    }
768
0
  }
769
0
  return (d-dest);
770
0
}
771
772
1
// Returns the number of characters needed to base64-encode 'input_len' bytes.
// When 'do_padding' is true the result includes the trailing '=' characters
// that round the output up to a multiple of four.
size_t CalculateBase64EscapedLen(size_t input_len, bool do_padding) {
  // Base64 encodes three bytes of input at a time. If the input is not
  // divisible by three, we pad as appropriate.
  //
  // (from http://www.ietf.org/rfc/rfc3548.txt)
  // Special processing is performed if fewer than 24 bits are available
  // at the end of the data being encoded.  A full encoding quantum is
  // always completed at the end of a quantity.  When fewer than 24 input
  // bits are available in an input group, zero bits are added (on the
  // right) to form an integral number of 6-bit groups.  Padding at the
  // end of the data is performed using the '=' character.  Since all base
  // 64 input is an integral number of octets, only the following cases
  // can arise:

  // Base64 encodes each three bytes of input into four bytes of output.
  auto len = (input_len / 3) * 4;

  switch (input_len % 3) {
    case 0:
      // (from http://www.ietf.org/rfc/rfc3548.txt)
      // (1) the final quantum of encoding input is an integral multiple of 24
      // bits; here, the final unit of encoded output will be an integral
      // multiple of 4 characters with no "=" padding,
      break;
    case 1:
      // (from http://www.ietf.org/rfc/rfc3548.txt)
      // (2) the final quantum of encoding input is exactly 8 bits; here, the
      // final unit of encoded output will be two characters followed by two
      // "=" padding characters, or
      len += 2;
      if (do_padding) {
        len += 2;
      }
      break;
    default:  // (input_len % 3 == 2)
      // (from http://www.ietf.org/rfc/rfc3548.txt)
      // (3) the final quantum of encoding input is exactly 16 bits; here, the
      // final unit of encoded output will be three characters followed by one
      // "=" padding character.
      len += 3;
      if (do_padding) {
        len += 1;
      }
      break;
  }

  assert(len >= input_len);  // make sure we didn't overflow
  return len;
}

// Base64Escape does padding, so this calculation includes padding.
size_t CalculateBase64EscapedLen(size_t input_len) {
  return CalculateBase64EscapedLen(input_len, true);
}
823
824
// ----------------------------------------------------------------------
825
// size_t Base64Unescape() - base64 decoder
826
// size_t Base64Escape() - base64 encoder
827
// size_t WebSafeBase64Unescape() - Google's variation of base64 decoder
828
// size_t WebSafeBase64Escape() - Google's variation of base64 encoder
829
//
830
// Check out
831
// http://www.cis.ohio-state.edu/htbin/rfc/rfc2045.html for formal
832
// description, but what we care about is that...
833
//   Take the encoded stuff in groups of 4 characters and turn each
834
//   character into a code 0 to 63 thus:
835
//           A-Z map to 0 to 25
836
//           a-z map to 26 to 51
837
//           0-9 map to 52 to 61
838
//           +(- for WebSafe) maps to 62
839
//           /(_ for WebSafe) maps to 63
840
//   There will be four numbers, all less than 64 which can be represented
841
//   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
842
//   Arrange the 6 digit binary numbers into three bytes as such:
843
//   aaaaaabb bbbbcccc ccdddddd
844
//   Equals signs (one or two) are used at the end of the encoded block to
845
//   indicate that the text was not an integer multiple of three bytes long.
846
// In the sorted variation, we instead use the mapping
847
//           .   maps to 0
848
//           0-9 map to 1-10
849
//           A-Z map to 11-36
850
//           _   maps to 37
851
//           a-z map to 38-63
852
// This mapping has the property that the output will be sorted in the same
853
// order as the input, i.e. a < b iff map(a) < map(b). It is web-safe and
854
// filename-safe.
855
// ----------------------------------------------------------------------
856
857
size_t Base64UnescapeInternal(
858
1
    const char *signed_src, size_t szsrc, char *dest, size_t szdest, const signed char* unbase64) {
859
1
  static const char kPad64 = '=';
860
1
  auto* src = static_cast<const unsigned char*>(static_cast<const void*>(signed_src));
861
862
1
  int decode = 0;
863
1
  size_t destidx = 0;
864
1
  int state = 0;
865
1
  unsigned int ch = 0;
866
1
  unsigned int temp = 0;
867
868
  // The GET_INPUT macro gets the next input character, skipping
869
  // over any whitespace, and stopping when we reach the end of the
870
  // string or when we read any non-data character.  The arguments are
871
  // an arbitrary identifier (used as a label for goto) and the number
872
  // of data bytes that must remain in the input to avoid aborting the
873
  // loop.
874
1
#define GET_INPUT(label, remain)                 \
875
1
  label:                                         \
876
0
    --szsrc;                                     \
877
0
    ch = *src++;                                 \
878
0
    decode = unbase64[ch];                       \
879
0
    if (decode < 0) {                            \
880
0
      if (ascii_isspace(ch) && szsrc >= remain)  \
881
0
        goto label;                              \
882
0
      state = 4 - remain;                        \
883
0
      break;                                     \
884
0
    }
885
886
  // if dest is null, we're just checking to see if it's legal input
887
  // rather than producing output.  (I suspect this could just be done
888
  // with a regexp...).  We duplicate the loop so this test can be
889
  // outside it instead of in every iteration.
890
891
1
  if (dest) {
892
    // This loop consumes 4 input bytes and produces 3 output bytes
893
    // per iteration.  We can't know at the start that there is enough
894
    // data left in the string for a full iteration, so the loop may
895
    // break out in the middle; if so 'state' will be set to the
896
    // number of input bytes read.
897
898
27
    while (szsrc >= 4)  {
899
      // We'll start by optimistically assuming that the next four
900
      // bytes of the string (src[0..3]) are four good data bytes
901
      // (that is, no nulls, whitespace, padding chars, or illegal
902
      // chars).  We need to test src[0..2] for nulls individually
903
      // before constructing temp to preserve the property that we
904
      // never read past a null in the string (no matter how long
905
      // szsrc claims the string is).
906
907
26
      if (!src[0] || !src[1] || !src[2] ||
908
26
          (temp = ((unbase64[src[0]] << 18) |
909
26
                   (unbase64[src[1]] << 12) |
910
26
                   (unbase64[src[2]] << 6) |
911
26
                   (unbase64[src[3]]))) & 0x80000000) {
912
        // Iff any of those four characters was bad (null, illegal,
913
        // whitespace, padding), then temp's high bit will be set
914
        // (because unbase64[] is -1 for all bad characters).
915
        //
916
        // We'll back up and resort to the slower decoder, which knows
917
        // how to handle those cases.
918
919
0
        GET_INPUT(first, 4);
920
0
        temp = decode;
921
0
        GET_INPUT(second, 3);
922
0
        temp = (temp << 6) | decode;
923
0
        GET_INPUT(third, 2);
924
0
        temp = (temp << 6) | decode;
925
0
        GET_INPUT(fourth, 1);
926
0
        temp = (temp << 6) | decode;
927
26
      } else {
928
        // We really did have four good data bytes, so advance four
929
        // characters in the string.
930
931
26
        szsrc -= 4;
932
26
        src += 4;
933
26
        decode = -1;
934
26
        ch = '\0';
935
26
      }
936
937
      // temp has 24 bits of input, so write that out as three bytes.
938
939
26
      if (destidx+3 > szdest) 
return -10
;
940
26
      dest[destidx+2] = temp;
941
26
      temp >>= 8;
942
26
      dest[destidx+1] = temp;
943
26
      temp >>= 8;
944
26
      dest[destidx] = temp;
945
26
      destidx += 3;
946
26
    }
947
1
  } else {
948
0
    while (szsrc >= 4)  {
949
0
      if (!src[0] || !src[1] || !src[2] ||
950
0
          (temp = ((unbase64[src[0]] << 18) |
951
0
                   (unbase64[src[1]] << 12) |
952
0
                   (unbase64[src[2]] << 6) |
953
0
                   (unbase64[src[3]]))) & 0x80000000) {
954
0
        GET_INPUT(first_no_dest, 4);
955
0
        GET_INPUT(second_no_dest, 3);
956
0
        GET_INPUT(third_no_dest, 2);
957
0
        GET_INPUT(fourth_no_dest, 1);
958
0
      } else {
959
0
        szsrc -= 4;
960
0
        src += 4;
961
0
        decode = -1;
962
0
        ch = '\0';
963
0
      }
964
0
      destidx += 3;
965
0
    }
966
0
  }
967
968
1
#undef GET_INPUT
969
970
  // if the loop terminated because we read a bad character, return
971
  // now.
972
1
  if (decode < 0 && ch != '\0' && 
ch != kPad640
&&
!ascii_isspace(ch)0
)
973
0
    return -1;
974
975
1
  if (ch == kPad64) {
976
    // if we stopped by hitting an '=', un-read that character -- we'll
977
    // look at it again when we count to check for the proper number of
978
    // equals signs at the end.
979
0
    ++szsrc;
980
0
    --src;
981
1
  } else {
982
    // This loop consumes 1 input byte per iteration.  It's used to
983
    // clean up the 0-3 input bytes remaining when the first, faster
984
    // loop finishes.  'temp' contains the data from 'state' input
985
    // characters read by the first loop.
986
1
    while (szsrc > 0)  {
987
0
      --szsrc;
988
0
      ch = *src++;
989
0
      decode = unbase64[ch];
990
0
      if (decode < 0) {
991
0
        if (ascii_isspace(ch)) {
992
0
          continue;
993
0
        } else if (ch == '\0') {
994
0
          break;
995
0
        } else if (ch == kPad64) {
996
          // back up one character; we'll read it again when we check
997
          // for the correct number of equals signs at the end.
998
0
          ++szsrc;
999
0
          --src;
1000
0
          break;
1001
0
        } else {
1002
0
          return -1;
1003
0
        }
1004
0
      }
1005
1006
      // Each input character gives us six bits of output.
1007
0
      temp = (temp << 6) | decode;
1008
0
      ++state;
1009
0
      if (state == 4) {
1010
        // If we've accumulated 24 bits of output, write that out as
1011
        // three bytes.
1012
0
        if (dest) {
1013
0
          if (destidx+3 > szdest) return -1;
1014
0
          dest[destidx+2] = temp;
1015
0
          temp >>= 8;
1016
0
          dest[destidx+1] = temp;
1017
0
          temp >>= 8;
1018
0
          dest[destidx] = temp;
1019
0
        }
1020
0
        destidx += 3;
1021
0
        state = 0;
1022
0
        temp = 0;
1023
0
      }
1024
0
    }
1025
1
  }
1026
1027
  // Process the leftover data contained in 'temp' at the end of the input.
1028
1
  size_t expected_equals = 0;
1029
1
  switch (state) {
1030
1
    case 0:
1031
      // Nothing left over; output is a multiple of 3 bytes.
1032
1
      break;
1033
1034
0
    case 1:
1035
      // Bad input; we have 6 bits left over.
1036
0
      return -1;
1037
1038
0
    case 2:
1039
      // Produce one more output byte from the 12 input bits we have left.
1040
0
      if (dest) {
1041
0
        if (destidx+1 > szdest) return -1;
1042
0
        temp >>= 4;
1043
0
        dest[destidx] = temp;
1044
0
      }
1045
0
      ++destidx;
1046
0
      expected_equals = 2;
1047
0
      break;
1048
1049
0
    case 3:
1050
      // Produce two more output bytes from the 18 input bits we have left.
1051
0
      if (dest) {
1052
0
        if (destidx+2 > szdest) return -1;
1053
0
        temp >>= 2;
1054
0
        dest[destidx+1] = temp;
1055
0
        temp >>= 8;
1056
0
        dest[destidx] = temp;
1057
0
      }
1058
0
      destidx += 2;
1059
0
      expected_equals = 1;
1060
0
      break;
1061
1062
0
    default:
1063
      // state should have no other values at this point.
1064
0
      LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1065
1
  }
1066
1067
  // The remainder of the string should be all whitespace, mixed with
1068
  // exactly 0 equals signs, or exactly 'expected_equals' equals
1069
  // signs.  (Always accepting 0 equals signs is a google extension
1070
  // not covered in the RFC.)
1071
1072
1
  size_t equals = 0;
1073
1
  while (szsrc > 0 && 
*src0
) {
1074
0
    if (*src == kPad64)
1075
0
      ++equals;
1076
0
    else if (!ascii_isspace(*src))
1077
0
      return -1;
1078
0
    --szsrc;
1079
0
    ++src;
1080
0
  }
1081
1082
1
  return (equals == 0 || 
equals == expected_equals0
) ? destidx :
-10
;
1083
1
}
1084
1085
// The arrays below were generated by the following code
1086
// #include <sys/time.h>
1087
// #include <stdlib.h>
1088
// #include <string.h>
1089
// main()
1090
// {
1091
//   static const char Base64[] =
1092
//     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1093
//   char *pos;
1094
//   int idx, i, j;
1095
//   printf("    ");
1096
//   for (i = 0; i < 255; i += 8) {
1097
//     for (j = i; j < i + 8; j++) {
1098
//       pos = strchr(Base64, j);
1099
//       if ((pos == NULL) || (j == 0))
1100
//         idx = -1;
1101
//       else
1102
//         idx = pos - Base64;
1103
//       if (idx == -1)
1104
//         printf(" %2d,     ", idx);
1105
//       else
1106
//         printf(" %2d/*%c*/,", idx, j);
1107
//     }
1108
//     printf("\n    ");
1109
//   }
1110
// }
1111
//
1112
// where the value of "Base64[]" was replaced by one of the base-64 conversion
1113
// tables from the functions below.
1114
// Reverse lookup table for the standard base64 alphabet: entry [c] is the
// 6-bit value of byte c, or -1 if c is not in the alphabet.
static const signed char kUnBase64[] = {
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
  52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
  60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
  -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
   7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
  15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
  23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
  -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
  33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
  41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
  49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
// The decoder indexes this table with an arbitrary unsigned byte.
static_assert(sizeof(kUnBase64) == 256, "kUnBase64 must cover every byte value");
1148
// Reverse lookup table for the web-safe base64 alphabet ('-' and '_' in place
// of '+' and '/'): entry [c] is the 6-bit value of byte c, or -1 if c is not
// in the alphabet.
static const signed char kUnWebSafeBase64[] = {
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
  52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
  60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
  -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
   7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
  15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
  23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
  -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
  33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
  41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
  49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
// The decoder indexes this table with an arbitrary unsigned byte.
static_assert(sizeof(kUnWebSafeBase64) == 256, "kUnWebSafeBase64 must cover every byte value");
1182
1183
0
size_t Base64Unescape(const char *src, size_t szsrc, char *dest, size_t szdest) {
1184
0
  return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnBase64);
1185
0
}
1186
1187
0
size_t WebSafeBase64Unescape(const char *src, size_t szsrc, char *dest, size_t szdest) {
1188
0
  return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
1189
0
}
1190
1191
static bool Base64UnescapeInternal(const char* src, size_t slen, string* dest,
1192
1
                                   const signed char* unbase64) {
1193
  // Determine the size of the output string.  Base64 encodes every 3 bytes into
1194
  // 4 characters.  any leftover chars are added directly for good measure.
1195
  // This is documented in the base64 RFC: http://www.ietf.org/rfc/rfc3548.txt
1196
1
  const size_t dest_len = 3 * (slen / 4) + (slen % 4);
1197
1198
1
  dest->clear();
1199
1
  dest->resize(dest_len);
1200
1201
  // We are getting the destination buffer by getting the beginning of the
1202
  // string and converting it into a char *.
1203
1
  const auto len = Base64UnescapeInternal(src, slen, string_as_array(dest), dest->size(), unbase64);
1204
1
  if (len < 0) {
1205
0
    dest->clear();
1206
0
    return false;
1207
0
  }
1208
1209
  // could be shorter if there was padding
1210
1
  DCHECK_LE(len, dest_len);
1211
1
  dest->resize(len);
1212
1213
1
  return true;
1214
1
}
1215
1216
1
bool Base64Unescape(const char *src, size_t slen, string* dest) {
1217
1
  return Base64UnescapeInternal(src, slen, dest, kUnBase64);
1218
1
}
1219
1220
0
bool WebSafeBase64Unescape(const char *src, size_t slen, string* dest) {
1221
0
  return Base64UnescapeInternal(src, slen, dest, kUnWebSafeBase64);
1222
0
}
1223
1224
// Base64-encodes 'szsrc' bytes from 'src' into 'dest' using the 64-character
// alphabet 'base64'.  When 'do_padding' is true the output is padded with '='
// to a multiple of four characters.
//
// Returns the number of characters written, or 0 if the input is empty or
// 'dest' (capacity 'szdest') cannot hold the next piece of output.  In the
// latter case, groups encoded before space ran out have already been written.
size_t Base64EscapeInternal(
    const unsigned char *src, size_t szsrc, char *dest, size_t szdest, const char *base64,
    bool do_padding) {
  static const char kPad64 = '=';

  if (szsrc == 0) return 0;

  char *cur_dest = dest;
  const unsigned char *cur_src = src;

  // 'szdest' is unsigned, so every capacity check must happen before the
  // subtraction; testing "(szdest -= n) < 0" can never fire and would let the
  // writes below run past the end of 'dest'.

  // Three bytes of data encodes to four characters of cyphertext.
  // So we can pump through three-byte chunks atomically.
  while (szsrc > 2) { /* keep going until we have less than 24 bits */
    if (szdest < 4) return 0;
    szdest -= 4;
    cur_dest[0] = base64[cur_src[0] >> 2];
    cur_dest[1] = base64[((cur_src[0] & 0x03) << 4) + (cur_src[1] >> 4)];
    cur_dest[2] = base64[((cur_src[1] & 0x0f) << 2) + (cur_src[2] >> 6)];
    cur_dest[3] = base64[cur_src[2] & 0x3f];

    cur_dest += 4;
    cur_src += 3;
    szsrc -= 3;
  }

  /* now deal with the tail (<=2 bytes) */
  // The loop above only exits with szsrc in {0, 1, 2}; zero needs no work.
  if (szsrc == 1) {
    // One byte left: this encodes to two characters, and (optionally)
    // two pad characters to round out the four-character cypherblock.
    if (szdest < 2) return 0;
    szdest -= 2;
    cur_dest[0] = base64[cur_src[0] >> 2];
    cur_dest[1] = base64[(cur_src[0] & 0x03) << 4];
    cur_dest += 2;
    if (do_padding) {
      if (szdest < 2) return 0;
      cur_dest[0] = kPad64;
      cur_dest[1] = kPad64;
      cur_dest += 2;
    }
  } else if (szsrc == 2) {
    // Two bytes left: this encodes to three characters, and (optionally)
    // one pad character to round out the four-character cypherblock.
    if (szdest < 3) return 0;
    szdest -= 3;
    cur_dest[0] = base64[cur_src[0] >> 2];
    cur_dest[1] = base64[((cur_src[0] & 0x03) << 4) + (cur_src[1] >> 4)];
    cur_dest[2] = base64[(cur_src[1] & 0x0f) << 2];
    cur_dest += 3;
    if (do_padding) {
      if (szdest < 1) return 0;
      cur_dest[0] = kPad64;
      cur_dest += 1;
    }
  }
  return static_cast<size_t>(cur_dest - dest);
}
1289
1290
// Standard base64 alphabet, indexed by 6-bit value.
static const char kBase64Chars[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// Web-safe base64 alphabet: '-' and '_' replace '+' and '/'.
static const char kWebSafeBase64Chars[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";

// 64 symbols plus the terminating NUL.
static_assert(sizeof(kBase64Chars) == 65, "base64 alphabet must have 64 symbols");
static_assert(sizeof(kWebSafeBase64Chars) == 65, "web-safe alphabet must have 64 symbols");
1295
1296
0
size_t Base64Escape(const unsigned char *src, size_t szsrc, char *dest, size_t szdest) {
1297
0
  return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
1298
0
}
1299
size_t WebSafeBase64Escape(
1300
0
    const unsigned char *src, size_t szsrc, char *dest, size_t szdest, bool do_padding) {
1301
0
  return Base64EscapeInternal(src, szsrc, dest, szdest,
1302
0
                              kWebSafeBase64Chars, do_padding);
1303
0
}
1304
1305
void Base64EscapeInternal(const unsigned char* src, size_t szsrc,
1306
                          string* dest, bool do_padding,
1307
1
                          const char* base64_chars) {
1308
1
  const auto calc_escaped_size =
1309
1
    CalculateBase64EscapedLen(szsrc, do_padding);
1310
1
  dest->clear();
1311
1
  dest->resize(calc_escaped_size, '\0');
1312
1
  const auto escaped_len = Base64EscapeInternal(src, szsrc,
1313
1
                                                string_as_array(dest),
1314
1
                                                dest->size(),
1315
1
                                                base64_chars,
1316
1
                                                do_padding);
1317
1
  DCHECK_EQ(calc_escaped_size, escaped_len);
1318
1
}
1319
1320
void Base64Escape(const unsigned char *src, size_t szsrc,
1321
1
                  string* dest, bool do_padding) {
1322
1
  Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
1323
1
}
1324
1325
void WebSafeBase64Escape(const unsigned char *src, size_t szsrc,
1326
0
                         string *dest, bool do_padding) {
1327
0
  Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
1328
0
}
1329
1330
1
void Base64Escape(const string& src, string* dest) {
1331
1
  Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
1332
1
               src.size(), dest, true);
1333
1
}
1334
1335
0
void WebSafeBase64Escape(const string& src, string* dest) {
1336
0
  WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
1337
0
                      src.size(), dest, false);
1338
0
}
1339
1340
0
void WebSafeBase64EscapeWithPadding(const string& src, string* dest) {
1341
0
  WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
1342
0
                      src.size(), dest, true);
1343
0
}
1344
1345
// Returns true iff c is in the Base 32 alphabet ('A'-'Z', '2'-'7') or is the
// pad character '='.  Lower-case letters are not accepted here.
bool ValidBase32Byte(char c) {
  return (c >= 'A' && c <= 'Z') || (c >= '2' && c <= '7') || c == '=';
}
1349
1350
// Mapping from number of Base32 escaped characters (0 through 8) to number of
// unescaped bytes.  8 Base32 escaped characters represent 5 unescaped bytes.
// For N < 8, the number of unescaped bytes is less than 5.  Note that in
// valid input, N can only be 0, 2, 4, 5, 7, or 8 (corresponding to 0, 1, 2,
// 3, 4, or 5 unescaped bytes).
//
// We use 5 for invalid values of N to be safe, since this is used to compute
// the length of the buffer to hold unescaped data.
//
// See http://tools.ietf.org/html/rfc4648#section-6 for details.
static const size_t kBase32NumUnescapedBytes[] = {
  0, 5, 1, 5, 2, 3, 5, 4, 5
};
// One entry for each possible count 0..8.
static_assert(sizeof(kBase32NumUnescapedBytes) / sizeof(kBase32NumUnescapedBytes[0]) == 9,
              "kBase32NumUnescapedBytes must have an entry for 0 through 8");
1363
1364
0
size_t Base32Unescape(const char* src, size_t slen, char* dest, size_t szdest) {
1365
0
  size_t destidx = 0;
1366
0
  char escaped_bytes[8];
1367
0
  unsigned char unescaped_bytes[5];
1368
0
  while (slen > 0) {
1369
    // Collect the next 8 escaped bytes and convert to upper case.  If there
1370
    // are less than 8 bytes left, pad with '=', but keep track of the number
1371
    // of non-padded bytes for later.
1372
0
    size_t non_padded_len = 8;
1373
0
    for (size_t i = 0; i < 8; ++i) {
1374
0
      escaped_bytes[i] = (i < slen) ? ascii_toupper(src[i]) : '=';
1375
0
      if (!ValidBase32Byte(escaped_bytes[i])) {
1376
0
        return -1;
1377
0
      }
1378
      // Stop counting escaped bytes at first '='.
1379
0
      if (escaped_bytes[i] == '=' && non_padded_len == 8) {
1380
0
        non_padded_len = i;
1381
0
      }
1382
0
    }
1383
1384
    // Convert the 8 escaped bytes to 5 unescaped bytes and copy to dest.
1385
0
    EightBase32DigitsToFiveBytes(escaped_bytes, unescaped_bytes);
1386
0
    const auto num_unescaped = kBase32NumUnescapedBytes[non_padded_len];
1387
0
    for (size_t i = 0; i < num_unescaped; ++i) {
1388
0
      if (destidx == szdest) {
1389
        // No more room in dest, so terminate early.
1390
0
        return -1;
1391
0
      }
1392
0
      dest[destidx] = unescaped_bytes[i];
1393
0
      ++destidx;
1394
0
    }
1395
0
    src += 8;
1396
0
    slen -= 8;
1397
0
  }
1398
0
  return destidx;
1399
0
}
1400
1401
0
bool Base32Unescape(const char* src, size_t slen, string* dest) {
1402
  // Determine the size of the output string.
1403
0
  const auto dest_len = 5 * (slen / 8) + kBase32NumUnescapedBytes[slen % 8];
1404
1405
0
  dest->clear();
1406
0
  dest->resize(dest_len);
1407
1408
  // We are getting the destination buffer by getting the beginning of the
1409
  // string and converting it into a char *.
1410
0
  const auto len = Base32Unescape(src, slen, string_as_array(dest), dest->size());
1411
0
  if (len < 0) {
1412
0
    dest->clear();
1413
0
    return false;
1414
0
  }
1415
1416
  // Could be shorter if there was padding.
1417
0
  DCHECK_LE(len, dest_len);
1418
0
  dest->resize(len);
1419
1420
0
  return true;
1421
0
}
1422
1423
// Encodes exactly five bytes from 'in_bytes' as eight characters written to
// 'out', where 'alphabet' maps each 5-bit value (0..31) to its character.
// No terminator is written.
void GeneralFiveBytesToEightBase32Digits(const unsigned char *in_bytes,
                                         char *out, const char *alphabet) {
  // It's easier to just hard code this.
  // The conversion is based on the following picture of the division of a
  // 40-bit block into 8 5-bit words:
  //
  //       5   3  2  5  1  4   4 1  5  2  3   5
  //     |:::::::|:::::::|:::::::|:::::::|:::::::
  //     +----+----+----+----+----+----+----+----
  //
  out[0] = alphabet[in_bytes[0] >> 3];
  out[1] = alphabet[(in_bytes[0] & 0x07) << 2 | in_bytes[1] >> 6];
  out[2] = alphabet[(in_bytes[1] & 0x3E) >> 1];
  out[3] = alphabet[(in_bytes[1] & 0x01) << 4 | in_bytes[2] >> 4];
  out[4] = alphabet[(in_bytes[2] & 0x0F) << 1 | in_bytes[3] >> 7];
  out[5] = alphabet[(in_bytes[3] & 0x7C) >> 2];
  out[6] = alphabet[(in_bytes[3] & 0x03) << 3 | in_bytes[4] >> 5];
  out[7] = alphabet[(in_bytes[4] & 0x1F)];
}
1442
1443
static size_t GeneralBase32Escape(
1444
0
    const unsigned char *src, size_t szsrc, char *dest, size_t szdest, const char *alphabet) {
1445
0
  static const char kPad32 = '=';
1446
1447
0
  if (szsrc == 0) return 0;
1448
1449
0
  char *cur_dest = dest;
1450
0
  const unsigned char *cur_src = src;
1451
1452
  // Five bytes of data encodes to eight characters of cyphertext.
1453
  // So we can pump through three-byte chunks atomically.
1454
0
  while (szsrc > 4) {  // keep going until we have less than 40 bits
1455
0
    if ( szdest < 8) return 0;
1456
0
    szdest -= 8;
1457
1458
0
    GeneralFiveBytesToEightBase32Digits(cur_src, cur_dest, alphabet);
1459
1460
0
    cur_dest += 8;
1461
0
    cur_src += 5;
1462
0
    szsrc -= 5;
1463
0
  }
1464
1465
  // Now deal with the tail (<=4 bytes).
1466
0
  if (szsrc > 0) {
1467
0
    if ( szdest < 8) return 0;
1468
0
    szdest -= 8;
1469
0
    unsigned char last_chunk[5];
1470
0
    memcpy(last_chunk, cur_src, szsrc);
1471
1472
0
    for (size_t i = szsrc; i < 5; ++i) {
1473
0
      last_chunk[i] = '\0';
1474
0
    }
1475
1476
0
    GeneralFiveBytesToEightBase32Digits(last_chunk, cur_dest, alphabet);
1477
0
    auto filled = (szsrc * 8) / 5 + 1;
1478
0
    cur_dest += filled;
1479
1480
    // Add on the padding.
1481
0
    for (size_t i = 0; i < (8 - filled); ++i) {
1482
0
      *(cur_dest++) = kPad32;
1483
0
    }
1484
0
  }
1485
1486
0
  return cur_dest - dest;
1487
0
}
1488
1489
static bool GeneralBase32Escape(const string& src, string* dest,
1490
0
                                const char *alphabet) {
1491
0
  const auto max_escaped_size = CalculateBase32EscapedLen(src.length());
1492
0
  dest->clear();
1493
0
  dest->resize(max_escaped_size + 1, '\0');
1494
0
  const auto escaped_len =
1495
0
      GeneralBase32Escape(reinterpret_cast<const unsigned char *>(src.c_str()),
1496
0
                          src.length(),  &*dest->begin(), dest->size(),
1497
0
                          alphabet);
1498
1499
0
  DCHECK_LE(max_escaped_size, escaped_len);
1500
1501
0
  if (escaped_len < 0) {
1502
0
    dest->clear();
1503
0
    return false;
1504
0
  }
1505
1506
0
  dest->resize(escaped_len);
1507
0
  return true;
1508
0
}
1509
1510
// Standard base32 digits: entry i is the character that encodes the 5-bit
// value i ('A'-'Z' for 0-25, '2'-'7' for 26-31). Not NUL-terminated.
static const char Base32Alphabet[] = {
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  '2', '3', '4', '5', '6', '7'
};
1516
1517
0
size_t Base32Escape(const unsigned char* src, size_t szsrc, char* dest, size_t szdest) {
1518
0
  return GeneralBase32Escape(src, szsrc, dest, szdest, Base32Alphabet);
1519
0
}
1520
1521
0
bool Base32Escape(const string& src, string* dest) {
1522
0
  return GeneralBase32Escape(src, dest, Base32Alphabet);
1523
0
}
1524
1525
0
void FiveBytesToEightBase32Digits(const unsigned char *in_bytes, char *out) {
1526
0
  GeneralFiveBytesToEightBase32Digits(in_bytes, out, Base32Alphabet);
1527
0
}
1528
1529
// "Base32hex" digits: entry i is the character that encodes the 5-bit value i
// ('0'-'9' for 0-9, 'A'-'V' for 10-31). Not NUL-terminated.
static const char Base32HexAlphabet[] = {
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
  'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
};
1535
1536
0
size_t Base32HexEscape(const unsigned char* src, size_t szsrc, char* dest, size_t szdest) {
1537
0
  return GeneralBase32Escape(src, szsrc, dest, szdest, Base32HexAlphabet);
1538
0
}
1539
1540
0
bool Base32HexEscape(const string& src, string* dest) {
1541
0
  return GeneralBase32Escape(src, dest, Base32HexAlphabet);
1542
0
}
1543
1544
0
size_t CalculateBase32EscapedLen(size_t input_len) {
1545
0
  DCHECK_LE(input_len, numeric_limits<size_t>::max() / 8);
1546
0
  size_t intermediate_result = 8 * input_len + 4;
1547
0
  size_t len = intermediate_result / 5;
1548
0
  len = (len + 7) & ~7;
1549
0
  return len;
1550
0
}
1551
1552
// ----------------------------------------------------------------------
1553
// EightBase32DigitsToTenHexDigits()
1554
//   Converts an 8-digit base32 string to a 10-digit hex string.
1555
//
1556
//   *in must point to 8 base32 digits.
1557
//   *out must point to 10 bytes.
1558
//
1559
//   Base32 uses A-Z,2-7 to represent the numbers 0-31.
1560
//   See RFC3548 at http://www.ietf.org/rfc/rfc3548.txt
1561
//   for details on base32.
1562
// ----------------------------------------------------------------------
1563
1564
1565
0
void EightBase32DigitsToTenHexDigits(const char *in, char *out) {
1566
0
  unsigned char bytes[5];
1567
0
  EightBase32DigitsToFiveBytes(in, bytes);
1568
0
  b2a_hex(bytes, out, 5);
1569
0
}
1570
1571
0
// Decodes the eight base32 digits at signed_in (standard alphabet, upper case
// only) into the five bytes at bytes_out.
//
// No validation is performed: '=' decodes as 0 and every other character that
// is not a digit of the alphabet is looked up as the sentinel value 99, which
// yields meaningless output bits.
void EightBase32DigitsToFiveBytes(const char *signed_in, unsigned char *bytes_out) {
  // Maps a byte value to its 5-bit digit value (99 = not a base32 digit).
  static const char kDigitValue[] = {
    // 0x00 - 0x2F
    99, 99, 99, 99, 99, 99, 99, 99,  99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,  99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,  99, 99, 99, 99, 99, 99, 99, 99,
    // 0x30 - 0x3F: '2'..'7' and '='
    99, 99, 26, 27, 28, 29, 30, 31,  99, 99, 99, 99, 99,  0, 99, 99,
    // 0x40 - 0x5F: 'A'..'Z'
    99,  0,  1,  2,  3,  4,  5,  6,   7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22,  23, 24, 25, 99, 99, 99, 99, 99,
    // 0x60 - 0xFF
    99, 99, 99, 99, 99, 99, 99, 99,  99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,  99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,  99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,  99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,  99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,  99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,  99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,  99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,  99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,  99, 99, 99, 99, 99, 99, 99, 99
  };

  // Index the table with unsigned bytes so values >= 0x80 stay in range.
  const unsigned char *digits = reinterpret_cast<const unsigned char *>(signed_in);
  const auto value_at = [digits](int pos) -> int { return kDigitValue[digits[pos]]; };

  // Repack eight 5-bit values into five 8-bit bytes; each assignment keeps
  // only the low eight bits of the combined value.
  bytes_out[0] = static_cast<unsigned char>(value_at(0) << 3 | value_at(1) >> 2);
  bytes_out[1] = static_cast<unsigned char>(
      value_at(1) << 6 | value_at(2) << 1 | value_at(3) >> 4);
  bytes_out[2] = static_cast<unsigned char>(value_at(3) << 4 | value_at(4) >> 1);
  bytes_out[3] = static_cast<unsigned char>(
      value_at(4) << 7 | value_at(5) << 2 | value_at(6) >> 3);
  bytes_out[4] = static_cast<unsigned char>(value_at(6) << 5 | value_at(7));
}
1627
1628
// ----------------------------------------------------------------------
1629
// TenHexDigitsToEightBase32Digits()
1630
//   Converts a 10-digit hex string to an 8-digit base32 string.
1631
//
1632
//   *in must point to 10 hex digits.
1633
//   *out must point to 8 bytes.
1634
//
1635
//   See RFC3548 at http://www.ietf.org/rfc/rfc3548.txt
1636
//   for details on base32.
1637
// ----------------------------------------------------------------------
1638
0
void TenHexDigitsToEightBase32Digits(const char *in, char *out) {
1639
0
  unsigned char bytes[5];
1640
1641
  // Convert hex to raw bytes.
1642
0
  a2b_hex(in, bytes, 5);
1643
0
  FiveBytesToEightBase32Digits(bytes, out);
1644
0
}
1645
1646
// ----------------------------------------------------------------------
1647
// EscapeFileName / UnescapeFileName
1648
// ----------------------------------------------------------------------
1649
// Characters that EscapeFileName() copies through unchanged.
static const Charmap escape_file_name_exceptions(
    "abcdefghijklmnopqrstuvwxyz"  // lower-case letters
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"  // upper-case letters
    "0123456789"                  // digits
    "-_.");                       // punctuation
1653
1654
0
void EscapeFileName(const GStringPiece& src, string* dst) {
1655
  // Reserve at least src.size() chars
1656
0
  dst->reserve(dst->size() + src.size());
1657
1658
0
  for (char c : src) {
1659
    // We do not use "isalpha" because we want the behavior to be
1660
    // independent of the current locale settings.
1661
0
    if (escape_file_name_exceptions.contains(c)) {
1662
0
      dst->push_back(c);
1663
1664
0
    } else if (c == '/') {
1665
0
      dst->push_back('~');
1666
1667
0
    } else {
1668
0
      char tmp[2];
1669
0
      b2a_hex(reinterpret_cast<const unsigned char*>(&c), tmp, 1);
1670
0
      dst->push_back('%');
1671
0
      dst->append(tmp, 2);
1672
0
    }
1673
0
  }
1674
0
}
1675
1676
0
void UnescapeFileName(const GStringPiece& src_piece, string* dst) {
1677
0
  const char* src = src_piece.data();
1678
0
  const auto len = src_piece.size();
1679
0
  for (size_t i = 0; i < len; ++i) {
1680
0
    const char c = src[i];
1681
0
    if (c == '~') {
1682
0
      dst->push_back('/');
1683
1684
0
    } else if ((c == '%') && (i + 2 < len)) {
1685
0
      unsigned char tmp[1];
1686
0
      a2b_hex(src + i + 1, &tmp[0], 1);
1687
0
      dst->push_back(tmp[0]);
1688
0
      i += 2;
1689
1690
0
    } else {
1691
0
      dst->push_back(c);
1692
0
    }
1693
0
  }
1694
0
}
1695
1696
// Maps a byte value to the numeric value of the hex digit it represents.
// Bytes that are not hex digits ('0'-'9', 'A'-'F', 'a'-'f') map to 0.
// Declared const: this is a read-only lookup table.
static const char hex_value[256] = {
  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0,  1,  2,  3,  4,  5,  6, 7, 8, 9, 0, 0, 0, 0, 0, 0,  // '0'..'9'
  0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 'A'..'F'
  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 'a'..'f'
  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

// Lower-case hex digit for each 4-bit value (index 0-15). Read-only.
static const char hex_char[] = "0123456789abcdef";
1716
1717
// This is a templated function so that T can be either a char*
1718
// or a string.  This works because we use the [] operator to access
1719
// individual characters at a time.
1720
template <typename T>
1721
12.2k
static void a2b_hex_t(const char* a, T b, size_t num) {
1722
180k
  for (size_t i = 0; i < num; 
i++168k
) {
1723
168k
    b[i] = (hex_value[a[i * 2] & 0xFF] << 4)
1724
168k
         + (hex_value[a[i * 2 + 1] & 0xFF]);
1725
168k
  }
1726
12.2k
}
Unexecuted instantiation: escaping.cc:void strings::a2b_hex_t<unsigned char*>(char const*, unsigned char*, unsigned long)
Unexecuted instantiation: escaping.cc:void strings::a2b_hex_t<char*>(char const*, char*, unsigned long)
escaping.cc:void strings::a2b_hex_t<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&>(char const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, unsigned long)
Line
Count
Source
1721
12.2k
static void a2b_hex_t(const char* a, T b, size_t num) {
1722
180k
  for (size_t i = 0; i < num; 
i++168k
) {
1723
168k
    b[i] = (hex_value[a[i * 2] & 0xFF] << 4)
1724
168k
         + (hex_value[a[i * 2 + 1] & 0xFF]);
1725
168k
  }
1726
12.2k
}
1727
1728
0
// Packs a textual bit string into bytes, eight characters per output byte.
// A '0' character is a clear bit; any other character is a set bit. With
// byte_order_msb the first character of each group is the most significant
// bit, otherwise the least significant. A short final group leaves the
// remaining bits clear. Reading stops at the first NUL character.
string a2b_bin(const string& a, bool byte_order_msb) {
  string packed;
  const char* cursor = a.c_str();
  const size_t byte_count = (a.size() + 7) / 8;

  for (size_t n = 0; n < byte_count; ++n) {
    unsigned char value = 0;
    for (size_t bit = 0; bit < 8 && *cursor != '\0'; ++bit, ++cursor) {
      if (*cursor == '0') {
        continue;
      }
      const size_t shift = byte_order_msb ? 7 - bit : bit;
      value |= (1 << shift);
    }
    packed.append(1, value);
  }
  return packed;
}
1746
1747
// This is a templated function so that T can be either a char*
1748
// or a string.  This works because we use the [] operator to access
1749
// individual characters at a time.
1750
template <typename T>
1751
9.79M
static void b2a_hex_t(const unsigned char* b, T a, size_t num) {
1752
166M
  for (size_t i = 0; i < num; 
i++156M
) {
1753
156M
    a[i * 2 + 0] = hex_char[b[i] >> 4];
1754
156M
    a[i * 2 + 1] = hex_char[b[i] & 0xf];
1755
156M
  }
1756
9.79M
}
Unexecuted instantiation: escaping.cc:void strings::b2a_hex_t<char*>(unsigned char const*, char*, unsigned long)
escaping.cc:void strings::b2a_hex_t<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&>(unsigned char const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, unsigned long)
Line
Count
Source
1751
9.79M
static void b2a_hex_t(const unsigned char* b, T a, size_t num) {
1752
166M
  for (size_t i = 0; i < num; 
i++156M
) {
1753
156M
    a[i * 2 + 0] = hex_char[b[i] >> 4];
1754
156M
    a[i * 2 + 1] = hex_char[b[i] & 0xf];
1755
156M
  }
1756
9.79M
}
1757
1758
0
// Expands each byte of b into eight '0'/'1' characters. With byte_order_msb
// the most significant bit of each byte is emitted first, otherwise the least
// significant bit is.
string b2a_bin(const string& b, bool byte_order_msb) {
  string bits;
  for (size_t i = 0; i < b.size(); ++i) {
    const char byte = b[i];
    for (size_t k = 0; k < 8; ++k) {
      const size_t shift = byte_order_msb ? 7 - k : k;
      const bool is_set = (byte & (1 << shift)) != 0;
      bits.push_back(is_set ? '1' : '0');
    }
  }
  return bits;
}
1768
1769
0
void b2a_hex(const unsigned char* b, char* a, size_t num) {
1770
0
  b2a_hex_t<char*>(b, a, num);
1771
0
}
1772
1773
0
void a2b_hex(const char* a, unsigned char* b, size_t num) {
1774
0
  a2b_hex_t<unsigned char*>(a, b, num);
1775
0
}
1776
1777
0
void a2b_hex(const char* a, char* b, size_t num) {
1778
0
  a2b_hex_t<char*>(a, b, num);
1779
0
}
1780
1781
9.79M
string b2a_hex(const char* b, size_t len) {
1782
9.79M
  string result;
1783
9.79M
  result.resize(len << 1);
1784
9.79M
  b2a_hex_t<string&>(reinterpret_cast<const unsigned char*>(b), result, len);
1785
9.79M
  return result;
1786
9.79M
}
1787
1788
20.5k
string b2a_hex(const GStringPiece& b) {
1789
20.5k
  return b2a_hex(b.data(), b.size());
1790
20.5k
}
1791
1792
9.09k
string a2b_hex(const string& a) {
1793
9.09k
  string result;
1794
9.09k
  a2b_hex(a.c_str(), &result, a.size()/2);
1795
1796
9.09k
  return result;
1797
9.09k
}
1798
1799
0
void b2a_hex(const unsigned char* from, string* to, size_t num) {
1800
0
  to->resize(num << 1);
1801
0
  b2a_hex_t<string&>(from, *to, num);
1802
0
}
1803
1804
12.2k
void a2b_hex(const char* from, string* to, size_t num) {
1805
12.2k
  to->resize(num);
1806
12.2k
  a2b_hex_t<string&>(from, *to, num);
1807
12.2k
}
1808
1809
// Characters that ShellEscape() treats as safe; a non-empty string made up
// only of these is returned without quoting.
const char* kDontNeedShellEscapeChars =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-_.=/:,@";
1811
1812
0
// Returns src quoted as a single shell word:
//   - a non-empty string made only of kDontNeedShellEscapeChars is returned
//     unchanged;
//   - otherwise, a string without single quotes is wrapped in single quotes;
//   - otherwise it is wrapped in double quotes, with a backslash inserted
//     before each of \ $ " and `.
string ShellEscape(GStringPiece src) {
  // The empty string always needs quotes to survive as an argument.
  const bool needs_quoting =
      src.empty() ||
      src.find_first_not_of(kDontNeedShellEscapeChars) != GStringPiece::npos;
  if (!needs_quoting) {
    return src.ToString();
  }

  if (src.find('\'') == GStringPiece::npos) {
    return StrCat("'", src, "'");
  }

  string quoted(1, '"');
  for (const char ch : src) {
    const bool special = (ch == '\\' || ch == '$' || ch == '"' || ch == '`');
    if (special) {
      quoted.push_back('\\');
    }
    quoted.push_back(ch);
  }
  quoted.push_back('"');
  return quoted;
}
1837
1838
// Two lower-case hex digits for every byte value: the digits for byte v are
// at indices 2 * v and 2 * v + 1. The 513th element is the literal's NUL.
static const char kHexTable[513]=
  "000102030405060708090a0b0c0d0e0f"
  "101112131415161718191a1b1c1d1e1f"
  "202122232425262728292a2b2c2d2e2f"
  "303132333435363738393a3b3c3d3e3f"
  "404142434445464748494a4b4c4d4e4f"
  "505152535455565758595a5b5c5d5e5f"
  "606162636465666768696a6b6c6d6e6f"
  "707172737475767778797a7b7c7d7e7f"
  "808182838485868788898a8b8c8d8e8f"
  "909192939495969798999a9b9c9d9e9f"
  "a0a1a2a3a4a5a6a7a8a9aaabacadaeaf"
  "b0b1b2b3b4b5b6b7b8b9babbbcbdbebf"
  "c0c1c2c3c4c5c6c7c8c9cacbcccdcecf"
  "d0d1d2d3d4d5d6d7d8d9dadbdcdddedf"
  "e0e1e2e3e4e5e6e7e8e9eaebecedeeef"
  "f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff";

//------------------------------------------------------------------------
// ByteStringToAscii
//  Reads at most bytes_to_read bytes from binary_string and writes them to
//  ascii_string as lower-case hex, two digits per byte. If binary_string is
//  shorter than bytes_to_read, the whole string is converted. Any previous
//  contents of ascii_string are replaced.
//------------------------------------------------------------------------
void ByteStringToAscii(string const &binary_string, size_t bytes_to_read, string* ascii_string) {
  if (binary_string.size() < bytes_to_read) {
    bytes_to_read = binary_string.size();
  }

  // bytes_to_read is unsigned, so no lower-bound check is needed.
  ascii_string->resize(bytes_to_read * 2);

  string::iterator out = ascii_string->begin();
  for (size_t i = 0; i < bytes_to_read; ++i) {
    // Convert through unsigned char: where plain char is signed, bytes >= 0x80
    // would otherwise become a negative index and read outside kHexTable.
    const size_t entry = static_cast<unsigned char>(binary_string[i]) * 2;
    *out++ = kHexTable[entry];
    *out++ = kHexTable[entry + 1];
  }
}
1878
1879
//------------------------------------------------------------------------
1880
// ByteStringFromAscii
1881
//  Converts the hex from ascii_string into binary data and
1882
//  writes the binary data into binary_string.
1883
//  Empty input successfully converts to empty output.
1884
//  Returns false and may modify output if it is
1885
//  unable to parse the hex string.
1886
//------------------------------------------------------------------------
1887
0
bool ByteStringFromAscii(string const& hex_string, string* binary_string) {
1888
0
  binary_string->clear();
1889
1890
0
  if ((hex_string.size()%2) != 0) {
1891
0
    return false;
1892
0
  }
1893
1894
0
  int value = 0;
1895
0
  for (size_t i = 0; i < hex_string.size(); i++) {
1896
0
    char c = hex_string[i];
1897
1898
0
    if (!ascii_isxdigit(c)) {
1899
0
      return false;
1900
0
    }
1901
1902
0
    if (ascii_isdigit(c)) {
1903
0
      value += c - '0';
1904
0
    } else if (ascii_islower(c)) {
1905
0
      value += 10 + c - 'a';
1906
0
    } else {
1907
0
      value += 10 + c - 'A';
1908
0
    }
1909
1910
0
    if (i & 1) {
1911
0
      binary_string->push_back(value);
1912
0
      value = 0;
1913
0
    } else {
1914
0
      value <<= 4;
1915
0
    }
1916
0
  }
1917
1918
0
  return true;
1919
0
}
1920
1921
// ----------------------------------------------------------------------
1922
// CleanStringLineEndings()
1923
//   Clean up a multi-line string to conform to Unix line endings.
1924
//   Reads from src and appends to dst, so usually dst should be empty.
1925
//
1926
//   If there is no line ending at the end of a non-empty string, it can
1927
//   be added automatically.
1928
//
1929
//   Four different types of input are correctly handled:
1930
//
1931
//     - Unix/Linux files: line ending is LF, pass through unchanged
1932
//
1933
//     - DOS/Windows files: line ending is CRLF: convert to LF
1934
//
1935
//     - Legacy Mac files: line ending is CR: convert to LF
1936
//
1937
//     - Garbled files: random line endings, convert gracefully
1938
//                      lonely CR, lonely LF, CRLF: convert to LF
1939
//
1940
//   @param src The multi-line string to convert
1941
//   @param dst The converted string is appended to this string
1942
//   @param auto_end_last_line Automatically terminate the last line
1943
//
1944
//   Limitations:
1945
//
1946
//     This does not do the right thing for CRCRLF files created by
1947
//     broken programs that do another Unix->DOS conversion on files
1948
//     that are already in CRLF format.  For this, a two-pass approach
1949
//     brute-force would be needed that
1950
//
1951
//       (1) determines the presence of LF (first one is ok)
1952
//       (2) if yes, removes any CR, else convert every CR to LF
1953
1954
void CleanStringLineEndings(const string& src, string* dst,
1955
0
                            bool auto_end_last_line) {
1956
0
  if (dst->empty()) {
1957
0
    dst->append(src);
1958
0
    CleanStringLineEndings(dst, auto_end_last_line);
1959
0
  } else {
1960
0
    string tmp = src;
1961
0
    CleanStringLineEndings(&tmp, auto_end_last_line);
1962
0
    dst->append(tmp);
1963
0
  }
1964
0
}
1965
1966
0
void CleanStringLineEndings(string* str, bool auto_end_last_line) {
1967
0
  size_t output_pos = 0;
1968
0
  bool r_seen = false;
1969
0
  auto len = str->size();
1970
1971
0
  char* p = string_as_array(str);
1972
1973
0
  for (size_t input_pos = 0; input_pos < len;) {
1974
0
    if (!r_seen && input_pos + 8 < len) {
1975
0
      uint64 v = UNALIGNED_LOAD64(p + input_pos);
1976
      // Loop over groups of 8 bytes at a time until we come across
1977
      // a word that has a byte whose value is less than or equal to
1978
      // '\r' (i.e. could contain a \n (0x0a) or a \r (0x0d) ).
1979
      //
1980
      // We use a has_less macro that quickly tests a whole 64-bit
1981
      // word to see if any of the bytes has a value < N.
1982
      //
1983
      // For more details, see:
1984
      //   http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
1985
0
#define has_less(x, n) (((x)-~0ULL/255*(n))&~(x)&~0ULL/255*128)
1986
0
      if (!has_less(v, '\r' + 1)) {
1987
0
#undef has_less
1988
        // No byte in this word has a value that could be a \r or a \n
1989
0
        if (output_pos != input_pos)
1990
0
          UNALIGNED_STORE64(p + output_pos, v);
1991
0
        input_pos += 8;
1992
0
        output_pos += 8;
1993
0
        continue;
1994
0
      }
1995
0
    }
1996
0
    string::const_reference in = p[input_pos];
1997
0
    if (in == '\r') {
1998
0
      if (r_seen)
1999
0
        p[output_pos++] = '\n';
2000
0
      r_seen = true;
2001
0
    } else if (in == '\n') {
2002
0
      if (input_pos != output_pos)
2003
0
        p[output_pos++] = '\n';
2004
0
      else
2005
0
        output_pos++;
2006
0
      r_seen = false;
2007
0
    } else {
2008
0
      if (r_seen)
2009
0
        p[output_pos++] = '\n';
2010
0
      r_seen = false;
2011
0
      if (input_pos != output_pos)
2012
0
        p[output_pos++] = in;
2013
0
      else
2014
0
        output_pos++;
2015
0
    }
2016
0
    input_pos++;
2017
0
  }
2018
0
  if (r_seen || (auto_end_last_line
2019
0
                 && output_pos > 0
2020
0
                 && p[output_pos - 1] != '\n')) {
2021
0
    str->resize(output_pos + 1);
2022
0
    str->operator[](output_pos) = '\n';
2023
0
  } else if (output_pos < len) {
2024
0
    str->resize(output_pos);
2025
0
  }
2026
0
}
2027
2028
2029
}  // namespace strings