/Users/deen/code/yugabyte-db/src/yb/yql/cql/ql/parser/scanner_util.cc
Line | Count | Source (jump to first uncovered line) |
1 | | //-------------------------------------------------------------------------------------------------- |
2 | | // NOTE: All entities in this module are copies of PostgreSQL's code. We made some minor changes |
3 | | // to avoid lint errors, such as using '{' for if blocks and changing the comment style from |
4 | | // '/**/' to '//'. |
5 | | //-------------------------------------------------------------------------------------------------- |
6 | | |
7 | | //-------------------------------------------------------------------------------------------------- |
8 | | // The following only applies to changes made to this file as part of YugaByte development. |
9 | | // |
10 | | // Copyright (c) YugaByte, Inc. |
11 | | // |
12 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
13 | | // in compliance with the License. You may obtain a copy of the License at |
14 | | // |
15 | | // http://www.apache.org/licenses/LICENSE-2.0 |
16 | | // |
17 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
18 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
19 | | // or implied. See the License for the specific language governing permissions and limitations |
20 | | // under the License. |
21 | | // |
22 | | // Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group |
23 | | // Portions Copyright (c) 1994, Regents of the University of California |
24 | | // |
25 | | // Scanning utility functions. |
26 | | //-------------------------------------------------------------------------------------------------- |
27 | | #include "yb/yql/cql/ql/parser/scanner_util.h" |
28 | | |
29 | | #include <algorithm> |
30 | | |
31 | | #include "yb/gutil/macros.h" |
32 | | |
33 | | namespace yb { |
34 | | namespace ql { |
35 | | |
36 | | using std::min; |
37 | | |
38 | | //-------------------------------------------------------------------------------------------------- |
39 | | |
40 | 0 | unsigned int hexval(unsigned char c) { |
41 | 0 | if (c >= '0' && c <= '9') |
42 | 0 | return c - '0'; |
43 | 0 | if (c >= 'a' && c <= 'f') |
44 | 0 | return c - 'a' + 0xA; |
45 | 0 | if (c >= 'A' && c <= 'F') |
46 | 0 | return c - 'A' + 0xA; |
47 | | |
48 | 0 | LOG(ERROR) << "invalid hexadecimal digit"; |
49 | 0 | return 0; /* not reached */ |
50 | 0 | } |
51 | | |
52 | | //-------------------------------------------------------------------------------------------------- |
53 | | |
54 | 0 | void downcase_truncate_identifier(char *result, const char *ident, int len, bool warn) { |
55 | 0 | int i; |
56 | | |
57 | | // SQL99 specifies Unicode-aware case normalization, which we don't yet |
58 | | // have the infrastructure for. Instead we use tolower() to provide a |
59 | | // locale-aware translation. However, there are some locales where this |
60 | | // is not right either (eg, Turkish may do strange things with 'i' and |
61 | | // 'I'). Our current compromise is to use tolower() for characters with |
62 | | // the high bit set, as long as they aren't part of a multi-byte |
63 | | // character, and use an ASCII-only downcasing for 7-bit characters. |
64 | 0 | for (i = 0; i < len; i++) { |
65 | 0 | unsigned char ch = static_cast<unsigned char>(ident[i]); |
66 | |
|
67 | 0 | if (ch >= 'A' && ch <= 'Z') { |
68 | 0 | ch += 'a' - 'A'; |
69 | 0 | } |
70 | 0 | result[i] = static_cast<char>(ch); |
71 | 0 | } |
72 | 0 | result[i] = '\0'; |
73 | |
|
74 | 0 | if (i >= NAMEDATALEN) { |
75 | 0 | truncate_identifier(result, i, warn); |
76 | 0 | } |
77 | 0 | } |
78 | | |
79 | 0 | void truncate_identifier(char *ident, size_t len, bool warn) { |
80 | 0 | if (len >= NAMEDATALEN) { |
81 | 0 | len = pg_encoding_mbcliplen(ident, len, NAMEDATALEN - 1); |
82 | 0 | if (warn) { |
83 | | // We avoid using %.*s here because it can misbehave if the data |
84 | | // is not valid in what libc thinks is the prevailing encoding. |
85 | 0 | char buf[NAMEDATALEN]; |
86 | |
|
87 | 0 | memcpy(buf, ident, len); |
88 | 0 | buf[len] = '\0'; |
89 | 0 | LOG(WARNING) << "SQL Warning: " << ErrorText(ErrorCode::NAME_TOO_LONG) |
90 | 0 | << "Identifier " << ident << " will be truncated to " << buf; |
91 | 0 | } |
92 | 0 | ident[len] = '\0'; |
93 | 0 | } |
94 | 0 | } |
95 | | |
96 | | //-------------------------------------------------------------------------------------------------- |
97 | | |
// Returns true if 'ch' is one of the characters the scanner treats as whitespace: space, tab,
// newline, carriage return, or form feed. This must match scan.l's list of {space} characters.
bool scanner_isspace(char ch) {
  switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\f':
      return true;
    default:
      return false;
  }
}
102 | | |
103 | 0 | bool check_uescapechar(unsigned char escape) { |
104 | 0 | if (isxdigit(escape) |
105 | 0 | || escape == '+' |
106 | 0 | || escape == '\'' |
107 | 0 | || escape == '"' |
108 | 0 | || scanner_isspace(escape)) { |
109 | 0 | return false; |
110 | 0 | } else { |
111 | 0 | return true; |
112 | 0 | } |
113 | 0 | } |
114 | | |
115 | 0 | void check_unicode_value(pg_wchar c, char *loc) { |
116 | 0 | } |
117 | | |
118 | 0 | unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string) { |
119 | 0 | if (c <= 0x7F) { |
120 | 0 | utf8string[0] = c; |
121 | 0 | } else if (c <= 0x7FF) { |
122 | 0 | utf8string[0] = 0xC0 | ((c >> 6) & 0x1F); |
123 | 0 | utf8string[1] = 0x80 | (c & 0x3F); |
124 | 0 | } else if (c <= 0xFFFF) { |
125 | 0 | utf8string[0] = 0xE0 | ((c >> 12) & 0x0F); |
126 | 0 | utf8string[1] = 0x80 | ((c >> 6) & 0x3F); |
127 | 0 | utf8string[2] = 0x80 | (c & 0x3F); |
128 | 0 | } else { |
129 | 0 | utf8string[0] = 0xF0 | ((c >> 18) & 0x07); |
130 | 0 | utf8string[1] = 0x80 | ((c >> 12) & 0x3F); |
131 | 0 | utf8string[2] = 0x80 | ((c >> 6) & 0x3F); |
132 | 0 | utf8string[3] = 0x80 | (c & 0x3F); |
133 | 0 | } |
134 | |
|
135 | 0 | return utf8string; |
136 | 0 | } |
137 | | |
138 | | //-------------------------------------------------------------------------------------------------- |
139 | | |
140 | 0 | bool is_utf16_surrogate_first(pg_wchar c) { |
141 | 0 | return (c >= 0xD800 && c <= 0xDBFF); |
142 | 0 | } |
143 | | |
144 | 0 | bool is_utf16_surrogate_second(pg_wchar c) { |
145 | 0 | return (c >= 0xDC00 && c <= 0xDFFF); |
146 | 0 | } |
147 | | |
148 | 0 | pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) { |
149 | 0 | return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF); |
150 | 0 | } |
151 | | |
152 | | //-------------------------------------------------------------------------------------------------- |
153 | | |
// Returns the byte length of the UTF-8 sequence that the lead byte at '*s' announces: 1 for
// ASCII, 2-4 for the corresponding multi-byte lead bytes. Only the first byte is examined; the
// continuation bytes are not validated. Any other byte value (a stray continuation byte or a 5/6
// byte lead) is treated as a single byte, unless NOT_USED is defined, in which case 5 and 6 byte
// leads are recognized.
size_t pg_utf_mblen(const unsigned char *s) {
  const unsigned char lead = *s;

  if ((lead & 0x80) == 0) {
    return 1;
  }
  if ((lead & 0xe0) == 0xc0) {
    return 2;
  }
  if ((lead & 0xf0) == 0xe0) {
    return 3;
  }
  if ((lead & 0xf8) == 0xf0) {
    return 4;
  }
#ifdef NOT_USED
  if ((lead & 0xfc) == 0xf8) {
    return 5;
  }
  if ((lead & 0xfe) == 0xfc) {
    return 6;
  }
#endif
  return 1;
}
172 | | |
173 | 0 | size_t pg_mbstrlen_with_len(const char *mbstr, size_t limit) { |
174 | 0 | size_t len = 0; |
175 | |
|
176 | 0 | while (limit > 0 && *mbstr) { |
177 | 0 | auto l = pg_utf_mblen((const unsigned char *)mbstr); |
178 | |
|
179 | 0 | limit -= l; |
180 | 0 | mbstr += l; |
181 | 0 | len++; |
182 | 0 | } |
183 | 0 | return len; |
184 | 0 | } |
185 | | |
186 | 0 | size_t pg_verify_mbstr_len(const char *mbstr, size_t len, bool noError) { |
187 | 0 | size_t mb_len = 0; |
188 | 0 | while (len > 0) { |
189 | | /* fast path for ASCII-subset characters */ |
190 | 0 | if (!is_utf_highbit_set(static_cast<unsigned char>(*mbstr))) { |
191 | 0 | if (*mbstr != '\0') { |
192 | 0 | mb_len++; |
193 | 0 | mbstr++; |
194 | 0 | len--; |
195 | 0 | continue; |
196 | 0 | } |
197 | 0 | if (noError) { |
198 | 0 | return -1; |
199 | 0 | } |
200 | 0 | report_invalid_encoding(mbstr, len); |
201 | 0 | } |
202 | | |
203 | 0 | auto l = pg_utf8_verifier((const unsigned char *) mbstr, len); |
204 | |
|
205 | 0 | if (l < 0) { |
206 | 0 | if (noError) |
207 | 0 | return -1; |
208 | 0 | report_invalid_encoding(mbstr, len); |
209 | 0 | } |
210 | | |
211 | 0 | mbstr += l; |
212 | 0 | len -= l; |
213 | 0 | mb_len++; |
214 | 0 | } |
215 | 0 | return mb_len; |
216 | 0 | } |
217 | | |
218 | | //-------------------------------------------------------------------------------------------------- |
219 | | |
220 | 0 | void report_invalid_encoding(const char *mbstr, size_t len) { |
221 | 0 | auto l = pg_utf_mblen((const unsigned char *)mbstr); |
222 | 0 | char buf[8 * 5 + 1]; |
223 | 0 | char *p = buf; |
224 | |
|
225 | 0 | auto jlimit = min(l, len); |
226 | 0 | jlimit = min<size_t>(jlimit, 8); /* prevent buffer overrun */ |
227 | | |
228 | | // The following NOLINTs are used as I tried to leave PostgreQL's code as is. Eventually, when we |
229 | | // use our own error reporting for QL interface, the NOLINTs should be gone then. |
230 | 0 | for (size_t j = 0; j < jlimit; j++) { |
231 | 0 | p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]); // NOLINT(*) |
232 | 0 | if (j < jlimit - 1) |
233 | 0 | p += sprintf(p, " "); // NOLINT(*) |
234 | 0 | } |
235 | |
|
236 | 0 | LOG(ERROR) << "SQL Error: " << ErrorText(ErrorCode::CHARACTER_NOT_IN_REPERTOIRE) |
237 | 0 | << ". Invalid byte sequence for UTF8 \"" << buf << "\""; |
238 | 0 | } |
239 | | |
240 | | //-------------------------------------------------------------------------------------------------- |
241 | | |
242 | 0 | ssize_t pg_utf8_verifier(const unsigned char *s, size_t len) { |
243 | 0 | auto l = pg_utf_mblen(s); |
244 | |
|
245 | 0 | if (len < l) |
246 | 0 | return -1; |
247 | | |
248 | 0 | if (!pg_utf8_islegal(s, l)) |
249 | 0 | return -1; |
250 | | |
251 | 0 | return l; |
252 | 0 | } |
253 | | |
254 | | //-------------------------------------------------------------------------------------------------- |
255 | | |
// Checks whether the 'length' bytes at 'source' form one legal UTF-8 character.
//
// 'length' is expected to be the width announced by the lead byte; only lengths 1 through 4 can
// be legal (0, 5, 6 and anything larger are rejected). The bytes are examined from the last one
// towards the first:
//   - the 3rd and 4th bytes, when present, must be plain continuation bytes (0x80..0xBF);
//   - the 2nd byte, when present, must be a continuation byte whose range is narrowed for the
//     lead bytes 0xE0, 0xED, 0xF0 and 0xF4;
//   - the lead byte must not be in 0x80..0xC1 and must not exceed 0xF4.
bool pg_utf8_islegal(const unsigned char *source, size_t length) {
  if (length < 1 || length > 4) {
    return false;
  }

  // Trailing bytes beyond the second one: generic continuation range only.
  for (size_t idx = length; idx > 2; --idx) {
    const unsigned char trail = source[idx - 1];
    if (trail < 0x80 || trail > 0xBF) {
      return false;
    }
  }

  // Second byte: the permitted range depends on the lead byte.
  if (length >= 2) {
    unsigned char lowest = 0x80;
    unsigned char highest = 0xBF;
    switch (source[0]) {
      case 0xE0:
        lowest = 0xA0;
        break;
      case 0xED:
        highest = 0x9F;
        break;
      case 0xF0:
        lowest = 0x90;
        break;
      case 0xF4:
        highest = 0x8F;
        break;
      default:
        break;
    }
    const unsigned char second = source[1];
    if (second < lowest || second > highest) {
      return false;
    }
  }

  // Lead byte.
  const unsigned char lead = source[0];
  if (lead >= 0x80 && lead < 0xC2) {
    return false;
  }
  return lead <= 0xF4;
}
312 | | |
313 | | //-------------------------------------------------------------------------------------------------- |
314 | | |
315 | 0 | size_t pg_encoding_mbcliplen(const char *mbstr, size_t len, size_t limit) { |
316 | 0 | size_t clen = 0; |
317 | |
|
318 | 0 | while (clen < len && *mbstr) { |
319 | 0 | auto l = pg_utf_mblen((const unsigned char *) mbstr); |
320 | 0 | if ((clen + l) > limit) |
321 | 0 | break; |
322 | 0 | clen += l; |
323 | 0 | if (clen == limit) |
324 | 0 | break; |
325 | 0 | mbstr += l; |
326 | 0 | } |
327 | 0 | return clen; |
328 | 0 | } |
329 | | |
330 | | } // namespace ql |
331 | | } // namespace yb |