/Users/deen/code/yugabyte-db/src/yb/yql/cql/ql/parser/scanner.h
Line | Count | Source (jump to first uncovered line) |
1 | | //-------------------------------------------------------------------------------------------------- |
2 | | // The following only applies to changes made to this file as part of YugaByte development. |
3 | | // |
4 | | // Copyright (c) YugaByte, Inc. |
5 | | // |
6 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
7 | | // in compliance with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
12 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
13 | | // or implied. See the License for the specific language governing permissions and limitations |
14 | | // under the License. |
15 | | // |
16 | | // Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group |
17 | | // Portions Copyright (c) 1994, Regents of the University of California |
18 | | // |
19 | | // API for the scanner. |
20 | | // |
21 | | // The core scanner is also used by PL/pgsql, so we provide a public API |
22 | | // for it. However, the rest of the backend is only expected to use the |
23 | | // higher-level API provided by parser.h. |
24 | | //-------------------------------------------------------------------------------------------------- |
25 | | #ifndef YB_YQL_CQL_QL_PARSER_SCANNER_H_ |
26 | | #define YB_YQL_CQL_QL_PARSER_SCANNER_H_ |
27 | | |
28 | | #include <cstddef> |
29 | | #include <cstdint> |
30 | | |
31 | | // Include base lexer class. FLEX might or might not "include <FlexLexer.h>" in its code and |
32 | | // generated code. The macro "yyFlexLexerOnce" is used here to guard duplicate includes. |
33 | | #ifndef yyFlexLexerOnce |
34 | | #include <FlexLexer.h> |
35 | | #endif |
36 | | |
37 | | #include "yb/util/memory/mc_types.h" |
38 | | |
39 | | #include "yb/yql/cql/ql/parser/parse_context.h" |
40 | | |
41 | | // Include auto-generated file from YACC. |
42 | | #include "yb/yql/cql/ql/parser/parser_gram.y.final.hh" |
43 | | |
44 | | namespace yb { |
45 | | namespace ql { |
46 | | |
47 | | //-------------------------------------------------------------------------------------------------- |
48 | | // Various declarations that are used for keyword, identifier, and text. |
49 | | //-------------------------------------------------------------------------------------------------- |
50 | | |
51 | | // Unicode support. pg_wchar is a 32-bit unsigned value (addunicode() below takes one). |
52 | | typedef uint32_t pg_wchar; |
53 | | |
54 | | // Maximum length for identifiers (e.g. table names, column names, function names). Names are |
55 | | // actually limited to one less byte than this, because the length must include a trailing zero byte. |
56 | | // |
57 | | // Changing this requires an initdb. |
58 | | constexpr int NAMEDATALEN = 64; |
59 | | |
60 | | // UTF high bit (0x80). is_utf_highbit_set() yields 1 when that bit is set in 'ch', and 0 otherwise. |
61 | | constexpr int UTF_HIGHBIT = 0x80; |
62 | 0 | constexpr int is_utf_highbit_set(unsigned char ch) { return (ch & UTF_HIGHBIT) != 0; } |
63 | | |
64 | | // Keywords. Each ScanKeyword holds a keyword's name, its grammar token code, and its category. |
65 | | class ScanKeyword { |
66 | | public: |
67 | | //------------------------------------------------------------------------------------------------ |
68 | | // Keyword categories. The values in this enum are used to classify keywords into different |
69 | | // groups. The group that a keyword belongs to must match its definition in the file |
70 | | // "parser_gram.y". |
71 | | enum class KeywordCategory : int16_t { |
72 | | UNRESERVED_KEYWORD = 0, |
73 | | COL_NAME_KEYWORD, |
74 | | TYPE_FUNC_NAME_KEYWORD, |
75 | | RESERVED_KEYWORD, |
76 | | INVALID_KEYWORD, |
77 | | }; |
78 | | |
79 | | //------------------------------------------------------------------------------------------------ |
80 | | // Public functions. |
81 | | ScanKeyword(const char* name, GramProcessor::token_type value, KeywordCategory category) |
82 | 4.25M | : name_(name), value_(value), category_(category) { |
83 | 4.25M | } |
84 | | |
85 | 2.22M | bool is_valid() const { |
86 | 2.22M | return category_ != KeywordCategory::INVALID_KEYWORD; |
87 | 2.22M | } |
88 | | |
89 | 1.25M | GramProcessor::token_type token() const { |
90 | 1.25M | return static_cast<GramProcessor::token_type>(value_); |
91 | 1.25M | } |
92 | | |
93 | 1.25M | const char* name() const { |
94 | 1.25M | return name_; |
95 | 1.25M | } |
96 | | |
97 | | private: |
98 | | //------------------------------------------------------------------------------------------------ |
99 | | const char *name_; // Name in lower case. |
100 | | int16_t value_; // Grammar's token code, stored as int16_t; token() casts it back. |
101 | | KeywordCategory category_; // See codes above for different keyword categories. |
102 | | }; |
103 | | |
104 | | // Scan state. |
105 | | // A token might require multiple scans, and each of these calls might be passed a different |
106 | | // ScanState. This callstack variable was needed in PostgreSQL C code, but we might not need it in |
107 | | // our C++ code. |
108 | | class ScanState { |
109 | | public: |
110 | | //------------------------------------------------------------------------------------------------ |
111 | | // Public types. |
112 | | typedef std::unique_ptr<ScanState> UniPtr; |
113 | | typedef std::unique_ptr<const ScanState> UniPtrConst; |
114 | | |
115 | | //------------------------------------------------------------------------------------------------ |
116 | | // Public functions. |
117 | | ScanState(); |
118 | | virtual ~ScanState(); |
119 | | }; |
120 | | |
121 | | // LexProcessor class. Derives from FLEX's yyFlexLexer and adds the scanning state declared below. |
122 | | class LexProcessor : public yyFlexLexer { |
123 | | public: |
124 | | //------------------------------------------------------------------------------------------------ |
125 | | // Public types. |
126 | | typedef std::unique_ptr<LexProcessor> UniPtr; |
127 | | typedef std::unique_ptr<const LexProcessor> UniPtrConst; |
128 | | |
129 | | //------------------------------------------------------------------------------------------------ |
130 | | // Public functions. |
131 | | // Constructor and destructor. |
132 | | LexProcessor(); |
133 | | virtual ~LexProcessor(); |
134 | | |
135 | | // Reset all scanning state variables such that processing a SQL statement should not be affected |
136 | | // by the erroneous state of the preceding statements. |
137 | | void ScanInit(ParseContext *parse_context); |
138 | | |
139 | | // Memory pool for allocating and deallocating operating memory spaces during the parsing process. |
140 | 0 | MemoryContext *PTempMem() const { |
141 | 0 | return parse_context_->PTempMem(); |
142 | 0 | } |
143 | | |
144 | | // Memory pool for constructing the parse tree of a statement. |
145 | 1.35M | MemoryContext *PTreeMem() const { |
146 | 1.35M | return parse_context_->PTreeMem(); |
147 | 1.35M | } |
148 | | |
149 | | // Entry point for lexical analysis. Scans and returns one token at a time. This is a wrapper |
150 | | // around yylex(), and it might call yylex() more than once to process a token. |
151 | | GramProcessor::symbol_type Scan(); |
152 | | |
153 | | // Counts number of newline characters in the current token and sets token location accordingly. |
154 | | void CountNewlineInToken(const std::string& token); |
155 | | |
156 | | // Reports error and returns SCAN_ERROR to instruct the parser to stop the parsing process. |
157 | | GramProcessor::symbol_type ScanError(const char *token); |
158 | | GramProcessor::symbol_type ScanError(const char *message, ErrorCode errcode); |
159 | | |
160 | | // Read literal value during a scan and convert it to MCString. |
161 | | MCSharedPtr<MCString> ScanLiteral(); |
162 | | |
163 | | // Access function for current token location. |
164 | 0 | const location &token_loc() const { |
165 | 0 | return token_loc_; |
166 | 0 | } |
167 | | |
168 | 1.50M | GramProcessor::symbol_type make_symbol(int16_t token, location l) { |
169 | 1.50M | return GramProcessor::symbol_type(static_cast<GramProcessor::token_type>(token), std::move(l)); |
170 | 1.50M | } |
171 | | |
172 | 1.24M | GramProcessor::symbol_type make_symbol(const ScanKeyword &keyword, location l) { |
173 | 1.24M | return GramProcessor::symbol_type(keyword.token(), keyword.name(), std::move(l)); |
174 | 1.24M | } |
175 | | |
176 | | private: |
177 | | //------------------------------------------------------------------------------------------------ |
178 | | // Private types. |
179 | | enum class BackslashQuoteType { |
180 | | OFF, |
181 | | ON, |
182 | | SAFE_ENCODING |
183 | | }; |
184 | | |
185 | | //------------------------------------------------------------------------------------------------ |
186 | | // Private functions. |
187 | | // Returns a valid keyword value if it exists. Otherwise, returns an invalid value. |
188 | | static const ScanKeyword& ScanKeywordLookup(const char *text); |
189 | | |
190 | | // The following line lets the compiler know that we know of the existing yylex() from FLEX base |
191 | | // class, but we intend to define and use our own yylex(). |
192 | | using yyFlexLexer::yylex; |
193 | | |
194 | | // Run lexical analysis. |
195 | | GramProcessor::symbol_type yylex(const ScanState& scan_state); |
196 | | |
197 | | // Returns the number of bytes that were read from the input stream or string. Lexer will call |
198 | | // this function to collect tokens from the input. |
199 | | int LexerInput(char* buf, int max_size) override; |
200 | | |
201 | | // Scans the input statement for the next token. |
202 | | void ScanNextToken(const ScanState& scan_state, GramProcessor::symbol_type *next_token); |
203 | | |
204 | | // Converts text into MCString and truncates it to allowable length, NAMEDATALEN, if needed. |
205 | | MCSharedPtr<MCString> MakeIdentifier(const char *text, int len, bool warn); |
206 | | |
207 | | // Truncates identifier to allowable length, NAMEDATALEN, if necessary. |
208 | | void TruncateIdentifier(const MCSharedPtr<MCString>& ident, bool warn); |
209 | | |
210 | | // Converts a char* to MCString allocated from the parse tree memory pool (PTreeMem()). |
211 | 165k | MCSharedPtr<MCString> MakeString(const char *str) { |
212 | 165k | return MCMakeShared<MCString>(PTreeMem(), str); |
213 | 165k | } |
214 | | |
215 | | // Advance the current scanning location (cursor_) by the given number of bytes. |
216 | 0 | void AdvanceCursor(int bytes) { |
217 | 0 | cursor_ += bytes; |
218 | 0 | } |
219 | | |
220 | | //------------------------------------------------------------------------------------------------ |
221 | | // NOTE: All entities below this line in this module are copies of PostgreSQL's code. We made |
222 | | // some minor changes to avoid lint errors such as using '{' for if blocks, changing the comment |
223 | | // style from '/**/' to '//', and postfixing data members with "_". |
224 | | //------------------------------------------------------------------------------------------------ |
225 | | // Operations on literal buffers. |
226 | | void EnlargeLiteralBuf(size_t bytes); |
227 | | void startlit(); |
228 | | void addlit(char *ytext, size_t yleng); |
229 | | void addlitchar(unsigned char ychar); |
230 | | char *litbuf_udeescape(unsigned char escape); |
231 | | |
232 | | // Unicode support. |
233 | | unsigned char unescape_single_char(unsigned char c); |
234 | | void addunicode(pg_wchar c); |
235 | | void check_string_escape_warning(unsigned char ychar); |
236 | | void check_escape_warning(); |
237 | | |
238 | | //------------------------------------------------------------------------------------------------ |
239 | | // The context in which the scanning process is running. |
240 | | ParseContext *parse_context_; |
241 | | |
242 | | // The rest of this class consists of scanning state variables, which are declared or used by |
243 | | // PostgreSQL structures, functions, and operations. |
244 | | location token_loc_; // Current token location. |
245 | | GramProcessor::symbol_type lookahead_; // Lookahead token. |
246 | | location cursor_; // The current scanning location. |
247 | | |
248 | | // Literalbuf is used to accumulate literal values when multiple rules are needed to parse a |
249 | | // single literal. Call startlit() to reset buffer to empty, addlit() to add text. |
250 | | // NOTE: The string in literalbuf is NOT necessarily null-terminated, but there always IS room to |
251 | | // add a trailing null at offset literallen. We store a null only when we need it. |
252 | | char *literalbuf_; // Temporary buffer for literal. |
253 | | size_t literallen_; // Temporary buffer length. |
254 | | size_t literalalloc_; // Temporary buffer size. |
255 | | int xcdepth_; // Depth of nesting in slash-star comments. |
256 | | char *dolqstart_; // Current $foo$ quote start string. |
257 | | int32_t utf16_first_part_; // First of UTF16 surrogate unicode escape pair. |
258 | | bool warn_on_first_escape_; // Literal-lexing warning for escape. |
259 | | bool saw_non_ascii_; // Literal-lexing warning for non ascii. |
260 | | |
261 | | // Scanner settings to use. These are initialized from the corresponding GUC variables by |
262 | | // scanner_init(). Callers can modify them after scanner_init() if they don't want the scanner's |
263 | | // behavior to follow the prevailing GUC settings. |
264 | | BackslashQuoteType backslash_quote_; // State when scanning backslash. |
265 | | bool escape_string_warning_; // State when scanning escape. |
266 | | bool standard_conforming_strings_; // State when scanning standard string. |
267 | | }; |
268 | | |
269 | | } // namespace ql |
270 | | } // namespace yb |
271 | | |
272 | | #endif // YB_YQL_CQL_QL_PARSER_SCANNER_H_ |