YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/yql/cql/ql/parser/scanner_util.cc
Line
Count
Source (jump to first uncovered line)
1
//--------------------------------------------------------------------------------------------------
2
// NOTE: All entities in this module are copies of PostgreSQL's code. We made some minor changes
3
// to avoid lint errors such as using '{' for if blocks and change the comment style from '/**/'
4
// to '//'.
5
//--------------------------------------------------------------------------------------------------
6
7
//--------------------------------------------------------------------------------------------------
8
// The following only applies to changes made to this file as part of YugaByte development.
9
//
10
// Copyright (c) YugaByte, Inc.
11
//
12
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
13
// in compliance with the License.  You may obtain a copy of the License at
14
//
15
// http://www.apache.org/licenses/LICENSE-2.0
16
//
17
// Unless required by applicable law or agreed to in writing, software distributed under the License
18
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
19
// or implied.  See the License for the specific language governing permissions and limitations
20
// under the License.
21
//
22
// Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
23
// Portions Copyright (c) 1994, Regents of the University of California
24
//
25
// Scanning utility functions.
26
//--------------------------------------------------------------------------------------------------
27
#include "yb/yql/cql/ql/parser/scanner_util.h"
28
29
#include <algorithm>
30
31
#include "yb/gutil/macros.h"
32
33
namespace yb {
34
namespace ql {
35
36
using std::min;
37
38
//--------------------------------------------------------------------------------------------------
39
40
0
unsigned int hexval(unsigned char c) {
41
0
  if (c >= '0' && c <= '9')
42
0
    return c - '0';
43
0
  if (c >= 'a' && c <= 'f')
44
0
    return c - 'a' + 0xA;
45
0
  if (c >= 'A' && c <= 'F')
46
0
    return c - 'A' + 0xA;
47
48
0
  LOG(ERROR) << "invalid hexadecimal digit";
49
0
  return 0; /* not reached */
50
0
}
51
52
//--------------------------------------------------------------------------------------------------
53
54
0
void downcase_truncate_identifier(char *result, const char *ident, int len, bool warn) {
55
0
  int i;
56
57
  // SQL99 specifies Unicode-aware case normalization, which we don't yet
58
  // have the infrastructure for.  Instead we use tolower() to provide a
59
  // locale-aware translation.  However, there are some locales where this
60
  // is not right either (eg, Turkish may do strange things with 'i' and
61
  // 'I').  Our current compromise is to use tolower() for characters with
62
  // the high bit set, as long as they aren't part of a multi-byte
63
  // character, and use an ASCII-only downcasing for 7-bit characters.
64
0
  for (i = 0; i < len; i++) {
65
0
    unsigned char ch = static_cast<unsigned char>(ident[i]);
66
67
0
    if (ch >= 'A' && ch <= 'Z') {
68
0
      ch += 'a' - 'A';
69
0
    }
70
0
    result[i] = static_cast<char>(ch);
71
0
  }
72
0
  result[i] = '\0';
73
74
0
  if (i >= NAMEDATALEN) {
75
0
    truncate_identifier(result, i, warn);
76
0
  }
77
0
}
78
79
0
void truncate_identifier(char *ident, size_t len, bool warn) {
80
0
  if (len >= NAMEDATALEN) {
81
0
    len = pg_encoding_mbcliplen(ident, len, NAMEDATALEN - 1);
82
0
    if (warn) {
83
      // We avoid using %.*s here because it can misbehave if the data
84
      // is not valid in what libc thinks is the prevailing encoding.
85
0
      char buf[NAMEDATALEN];
86
87
0
      memcpy(buf, ident, len);
88
0
      buf[len] = '\0';
89
0
      LOG(WARNING) << "SQL Warning: " << ErrorText(ErrorCode::NAME_TOO_LONG)
90
0
                   << "Identifier " << ident << " will be truncated to " << buf;
91
0
    }
92
0
    ident[len] = '\0';
93
0
  }
94
0
}
95
96
//--------------------------------------------------------------------------------------------------
97
98
0
// Returns true when 'ch' is one of the whitespace characters recognized by the scanner:
// space, tab, newline, carriage return or form feed.
// This must match scan.l's list of {space} characters.
bool scanner_isspace(char ch) {
  switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\f':
      return true;
    default:
      return false;
  }
}
102
103
0
bool check_uescapechar(unsigned char escape) {
104
0
  if (isxdigit(escape)
105
0
    || escape == '+'
106
0
    || escape == '\''
107
0
    || escape == '"'
108
0
    || scanner_isspace(escape)) {
109
0
    return false;
110
0
  } else {
111
0
    return true;
112
0
  }
113
0
}
114
115
0
// Hook for validating a Unicode code point found at scanner position 'loc'.
// The body is empty: no validation is performed and both arguments are ignored.
void check_unicode_value(pg_wchar c, char *loc) {
}
117
118
0
// Encodes code point 'c' as UTF-8 into 'utf8string' and returns 'utf8string'.
// One byte is written for values up to 0x7F, two up to 0x7FF, three up to 0xFFFF and four
// otherwise. No terminator is written, and the caller must supply a buffer of at least 4 bytes.
unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string) {
  // Builds a continuation byte (10xxxxxx) from the six bits of 'c' starting at bit 'shift'.
  const auto continuation = [c](unsigned int shift) {
    return static_cast<unsigned char>(0x80 | ((c >> shift) & 0x3F));
  };

  if (c > 0xFFFF) {
    utf8string[0] = static_cast<unsigned char>(0xF0 | ((c >> 18) & 0x07));
    utf8string[1] = continuation(12);
    utf8string[2] = continuation(6);
    utf8string[3] = continuation(0);
  } else if (c > 0x7FF) {
    utf8string[0] = static_cast<unsigned char>(0xE0 | ((c >> 12) & 0x0F));
    utf8string[1] = continuation(6);
    utf8string[2] = continuation(0);
  } else if (c > 0x7F) {
    utf8string[0] = static_cast<unsigned char>(0xC0 | ((c >> 6) & 0x1F));
    utf8string[1] = continuation(0);
  } else {
    utf8string[0] = static_cast<unsigned char>(c);
  }

  return utf8string;
}
137
138
//--------------------------------------------------------------------------------------------------
139
140
0
// Returns true when 'c' lies in 0xD800..0xDBFF, the range of the first (high) half of a UTF-16
// surrogate pair.
bool is_utf16_surrogate_first(pg_wchar c) {
  return !(c < 0xD800 || c > 0xDBFF);
}
143
144
0
// Returns true when 'c' lies in 0xDC00..0xDFFF, the range of the second (low) half of a UTF-16
// surrogate pair.
bool is_utf16_surrogate_second(pg_wchar c) {
  return !(c < 0xDC00 || c > 0xDFFF);
}
147
148
0
// Combines the two halves of a UTF-16 surrogate pair into one code point: the low ten bits of
// 'first' become the upper part, the low ten bits of 'second' the lower part, and 0x10000 is
// added. The arguments are not checked for being in the surrogate ranges.
pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) {
  const pg_wchar upper_part = (first & 0x3FF) << 10;
  const pg_wchar lower_part = second & 0x3FF;
  return upper_part + 0x10000 + lower_part;
}
151
152
//--------------------------------------------------------------------------------------------------
153
154
0
// Returns the byte length of the UTF-8 sequence announced by the lead byte '*s'.
// Only the first byte is examined: 0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4.
// Any other lead byte (for example a stray continuation byte) is reported as length 1.
size_t pg_utf_mblen(const unsigned char *s) {
  const unsigned char lead = *s;

  if ((lead & 0x80) == 0) {
    return 1;
  }
  if ((lead & 0xe0) == 0xc0) {
    return 2;
  }
  if ((lead & 0xf0) == 0xe0) {
    return 3;
  }
  if ((lead & 0xf8) == 0xf0) {
    return 4;
  }
#ifdef NOT_USED
  if ((lead & 0xfc) == 0xf8) {
    return 5;
  }
  if ((lead & 0xfe) == 0xfc) {
    return 6;
  }
#endif
  return 1;
}
172
173
0
size_t pg_mbstrlen_with_len(const char *mbstr, size_t limit) {
174
0
  size_t len = 0;
175
176
0
  while (limit > 0 && *mbstr) {
177
0
    auto l = pg_utf_mblen((const unsigned char *)mbstr);
178
179
0
    limit -= l;
180
0
    mbstr += l;
181
0
    len++;
182
0
  }
183
0
  return len;
184
0
}
185
186
0
size_t pg_verify_mbstr_len(const char *mbstr, size_t len, bool noError) {
187
0
  size_t mb_len = 0;
188
0
  while (len > 0) {
189
    /* fast path for ASCII-subset characters */
190
0
    if (!is_utf_highbit_set(static_cast<unsigned char>(*mbstr))) {
191
0
      if (*mbstr != '\0') {
192
0
        mb_len++;
193
0
        mbstr++;
194
0
        len--;
195
0
        continue;
196
0
      }
197
0
      if (noError) {
198
0
        return -1;
199
0
      }
200
0
      report_invalid_encoding(mbstr, len);
201
0
    }
202
203
0
    auto l = pg_utf8_verifier((const unsigned char *) mbstr, len);
204
205
0
    if (l < 0) {
206
0
      if (noError)
207
0
        return -1;
208
0
      report_invalid_encoding(mbstr, len);
209
0
    }
210
211
0
    mbstr += l;
212
0
    len -= l;
213
0
    mb_len++;
214
0
  }
215
0
  return mb_len;
216
0
}
217
218
//--------------------------------------------------------------------------------------------------
219
220
0
void report_invalid_encoding(const char *mbstr, size_t len) {
221
0
  auto    l = pg_utf_mblen((const unsigned char *)mbstr);
222
0
  char    buf[8 * 5 + 1];
223
0
  char   *p = buf;
224
225
0
  auto jlimit = min(l, len);
226
0
  jlimit = min<size_t>(jlimit, 8);  /* prevent buffer overrun */
227
228
  // The following NOLINTs are used as I tried to leave PostgreQL's code as is. Eventually, when we
229
  // use our own error reporting for QL interface, the NOLINTs should be gone then.
230
0
  for (size_t j = 0; j < jlimit; j++) {
231
0
    p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);  // NOLINT(*)
232
0
    if (j < jlimit - 1)
233
0
      p += sprintf(p, " ");  // NOLINT(*)
234
0
  }
235
236
0
  LOG(ERROR) << "SQL Error: " << ErrorText(ErrorCode::CHARACTER_NOT_IN_REPERTOIRE)
237
0
             << ". Invalid byte sequence for UTF8 \"" << buf << "\"";
238
0
}
239
240
//--------------------------------------------------------------------------------------------------
241
242
0
ssize_t pg_utf8_verifier(const unsigned char *s, size_t len) {
243
0
  auto l = pg_utf_mblen(s);
244
245
0
  if (len < l)
246
0
    return -1;
247
248
0
  if (!pg_utf8_islegal(s, l))
249
0
    return -1;
250
251
0
  return l;
252
0
}
253
254
//--------------------------------------------------------------------------------------------------
255
256
0
// Returns true when the 'length' bytes at 'source' form a legal UTF-8 sequence of that length.
// 'length' is taken as given (it is not re-derived from the lead byte); only lengths 1..4 are
// accepted. The checks run from the last byte towards the first:
//   - bytes 4 and 3, when present, must be continuation bytes (0x80..0xBF);
//   - byte 2, when present, must lie in a range that depends on the lead byte;
//   - the lead byte must not be in 0x80..0xC1 and must not exceed 0xF4.
bool pg_utf8_islegal(const unsigned char *source, size_t length) {
  // Lengths 0, 5, 6 and anything larger are rejected.
  if (length < 1 || length > 4) {
    return false;
  }

  // Fourth and third bytes: plain continuation bytes.
  for (size_t pos = length; pos > 2; --pos) {
    const unsigned char trail = source[pos - 1];
    if (trail < 0x80 || trail > 0xBF) {
      return false;
    }
  }

  const unsigned char lead = source[0];

  // Second byte: the allowed range is narrowed for specific lead bytes.
  if (length >= 2) {
    unsigned char lowest = 0x80;
    unsigned char highest = 0xBF;
    if (lead == 0xE0) {
      lowest = 0xA0;
    } else if (lead == 0xED) {
      highest = 0x9F;
    } else if (lead == 0xF0) {
      lowest = 0x90;
    } else if (lead == 0xF4) {
      highest = 0x8F;
    }

    const unsigned char second = source[1];
    if (second < lowest || second > highest) {
      return false;
    }
  }

  // Lead byte.
  if (lead >= 0x80 && lead < 0xC2) {
    return false;
  }
  if (lead > 0xF4) {
    return false;
  }
  return true;
}
312
313
//--------------------------------------------------------------------------------------------------
314
315
0
size_t pg_encoding_mbcliplen(const char *mbstr, size_t len, size_t limit) {
316
0
  size_t     clen = 0;
317
318
0
  while (clen < len && *mbstr) {
319
0
    auto l = pg_utf_mblen((const unsigned char *) mbstr);
320
0
    if ((clen + l) > limit)
321
0
      break;
322
0
    clen += l;
323
0
    if (clen == limit)
324
0
      break;
325
0
    mbstr += l;
326
0
  }
327
0
  return clen;
328
0
}
329
330
}  // namespace ql
331
}  // namespace yb