YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/yql/cql/ql/parser/scanner.h
Line
Count
Source (jump to first uncovered line)
1
//--------------------------------------------------------------------------------------------------
2
// The following only applies to changes made to this file as part of YugaByte development.
3
//
4
// Copyright (c) YugaByte, Inc.
5
//
6
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
7
// in compliance with the License.  You may obtain a copy of the License at
8
//
9
// http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing, software distributed under the License
12
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13
// or implied.  See the License for the specific language governing permissions and limitations
14
// under the License.
15
//
16
// Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
17
// Portions Copyright (c) 1994, Regents of the University of California
18
//
19
// API for the scanner.
20
//
21
// The core scanner is also used by PL/pgsql, so we provide a public API
22
// for it.  However, the rest of the backend is only expected to use the
23
// higher-level API provided by parser.h.
24
//--------------------------------------------------------------------------------------------------
25
#ifndef YB_YQL_CQL_QL_PARSER_SCANNER_H_
26
#define YB_YQL_CQL_QL_PARSER_SCANNER_H_
27
28
#include <cstddef>
29
#include <cstdint>
30
31
// Include base lexer class. FLEX might or might not "include <FlexLexer.h>" in its code and
32
// generated code. The macro "yyFlexLexerOnce" is used here to guard duplicate includes.
33
#ifndef yyFlexLexerOnce
34
#include <FlexLexer.h>
35
#endif
36
37
#include "yb/util/memory/mc_types.h"
38
39
#include "yb/yql/cql/ql/parser/parse_context.h"
40
41
// Include auto-generated file from YACC.
42
#include "yb/yql/cql/ql/parser/parser_gram.y.final.hh"
43
44
namespace yb {
45
namespace ql {
46
47
//--------------------------------------------------------------------------------------------------
48
// Various declarations that are used for keyword, identifier, and text.
49
//--------------------------------------------------------------------------------------------------
50
51
// Unicode support.
52
using pg_wchar = uint32_t;  // 32-bit unsigned wide-character type.
53
54
// Maximum length for identifiers (e.g. table names, column names, function names).  Names actually
55
// are limited to one less byte than this, because the length must include a trailing zero byte.
56
//
57
// Changing this requires an initdb.
58
constexpr int NAMEDATALEN{64};  // Byte count, including the trailing zero byte.
59
60
// UTF bit.
61
constexpr int UTF_HIGHBIT{0x80};  // Mask selecting the top bit of an 8-bit byte.
62
0
// Returns 1 when the top bit (UTF_HIGHBIT) of the byte is set, and 0 otherwise.
constexpr int is_utf_highbit_set(unsigned char ch) { return (UTF_HIGHBIT & ch) ? 1 : 0; }
63
64
// Keywords. Each ScanKeyword ties a keyword's name to its grammar token code and its category.
65
class ScanKeyword {
66
 public:
67
  //------------------------------------------------------------------------------------------------
68
  // Keyword categories. The value in this enum is used to classify the keywords into different
69
  // groups. The group that a keyword belongs to must match its definition in the file
70
  // "parser_gram.y".
71
  enum class KeywordCategory : int16_t {
72
    UNRESERVED_KEYWORD = 0,
73
    COL_NAME_KEYWORD,
74
    TYPE_FUNC_NAME_KEYWORD,
75
    RESERVED_KEYWORD,
76
    INVALID_KEYWORD,  // is_valid() returns false for entries in this category.
77
  };
78
79
  //------------------------------------------------------------------------------------------------
80
  // Public functions. The constructor stores the 'name' pointer as given; the text is not copied.
81
  ScanKeyword(const char* name, GramProcessor::token_type value, KeywordCategory category)
82
4.25M
      : name_(name), value_(value), category_(category) {
83
4.25M
  }
84
85
2.22M
  bool is_valid() const {  // True unless the category is INVALID_KEYWORD.
86
2.22M
    return category_ != KeywordCategory::INVALID_KEYWORD;
87
2.22M
  }
88
89
1.25M
  GramProcessor::token_type token() const {  // Token code, cast back from 16-bit storage.
90
1.25M
    return static_cast<GramProcessor::token_type>(value_);
91
1.25M
  }
92
93
1.25M
  const char* name() const {  // The pointer that was passed to the constructor.
94
1.25M
    return name_;
95
1.25M
  }
96
97
 private:
98
  //------------------------------------------------------------------------------------------------
99
  const char *name_;           // Name in lower case.
100
  int16_t value_;              // Grammar's token code, held as a 16-bit integer.
101
  KeywordCategory category_;           // See codes above for different keyword categories.
102
};
103
104
// Scan state.
105
// A token might require multiple scans, and each of these calls might be passed a different
106
// ScanState. This callstack variable was needed in PostgreSQL C-code, but we might not need it in
107
// our C++ code.
108
class ScanState {
109
 public:
110
  //------------------------------------------------------------------------------------------------
111
  // Public types. Owning-pointer aliases for mutable and const ScanState objects.
112
  typedef std::unique_ptr<ScanState> UniPtr;
113
  typedef std::unique_ptr<const ScanState> UniPtrConst;
114
115
  //------------------------------------------------------------------------------------------------
116
  // Public functions. Only declared in this header; the destructor is virtual.
117
  ScanState();
118
  virtual ~ScanState();
119
};
120
121
// LexProcessor class. Extends FLEX's yyFlexLexer; Scan() is the entry point for lexical analysis.
121
class LexProcessor : public yyFlexLexer {
122
 public:
123
  //------------------------------------------------------------------------------------------------
124
  // Public types. Owning-pointer aliases for mutable and const LexProcessor objects.
125
  typedef std::unique_ptr<LexProcessor> UniPtr;
126
  typedef std::unique_ptr<const LexProcessor> UniPtrConst;
127
128
  //------------------------------------------------------------------------------------------------
129
  // Public functions.
130
  // Constructor and destructor.
131
  LexProcessor();
132
  virtual ~LexProcessor();
133
134
  // Reset all scanning state variables such that processing a SQL statement should not be affected
135
  // by the erroneous state of the preceding statements.
136
  void ScanInit(ParseContext *parse_context);
137
138
  // Memory pool for allocating and deallocating operating memory during the parsing process.
139
0
  MemoryContext *PTempMem() const {
140
0
    return parse_context_->PTempMem();
141
0
  }
142
143
  // Memory pool for constructing the parse tree of a statement.
144
1.35M
  MemoryContext *PTreeMem() const {
145
1.35M
    return parse_context_->PTreeMem();
146
1.35M
  }
147
148
  // Entry point for lexical analysis. Scans and returns one token at a time. This is a wrapper
149
  // around yylex(), and it might call yylex more than once to process a token.
150
  GramProcessor::symbol_type Scan();
151
152
  // Counts number of newline characters in the current token and sets token location accordingly.
153
  void CountNewlineInToken(const std::string& token);
154
155
  // Reports error and returns SCAN_ERROR to instruct the parser to stop the parsing process.
156
  GramProcessor::symbol_type ScanError(const char *token);
157
  GramProcessor::symbol_type ScanError(const char *message, ErrorCode errcode);
158
159
  // Reads literal value during a scan and converts it to MCString.
160
  MCSharedPtr<MCString> ScanLiteral();
161
162
  // Access function for current token location.
163
0
  const location &token_loc() const {
164
0
    return token_loc_;
165
0
  }
166
167
1.50M
  // Builds a parser symbol from a raw token code and a source location.
  GramProcessor::symbol_type make_symbol(int16_t token, location l) {
168
1.50M
    return GramProcessor::symbol_type(static_cast<GramProcessor::token_type>(token), std::move(l));
169
1.50M
  }
170
171
1.24M
  // Builds a parser symbol from a keyword's token code and name, plus a source location.
  GramProcessor::symbol_type make_symbol(const ScanKeyword &keyword, location l) {
172
1.24M
    return GramProcessor::symbol_type(keyword.token(), keyword.name(), std::move(l));
173
1.24M
  }
174
175
 private:
176
  //------------------------------------------------------------------------------------------------
177
  // Private types.
178
  enum class BackslashQuoteType {
179
    OFF,
180
    ON,
181
    SAFE_ENCODING
182
  };
183
184
  //------------------------------------------------------------------------------------------------
185
  // Private functions.
186
  // Returns a valid keyword entry for 'text' if one exists. Otherwise, returns an invalid entry.
187
  static const ScanKeyword& ScanKeywordLookup(const char *text);
188
189
  // The following line lets the compiler know that we know of the existing yylex() from FLEX base
190
  // class, but we intend to define and use our own yylex().
191
  using yyFlexLexer::yylex;
192
193
  // Run lexical analysis.
194
  GramProcessor::symbol_type yylex(const ScanState& scan_state);
195
196
  // Returns the number of bytes that were read from the input stream or string. Lexer will call
197
  // this function to collect tokens from the input.
198
  int LexerInput(char* buf, int max_size) override;
199
200
  // Scans the input statement for the next token.
201
  void ScanNextToken(const ScanState& scan_state, GramProcessor::symbol_type *next_token);
202
203
  // Converts text into MCString and truncates it to allowable length, NAMEDATALEN, if needed.
204
  MCSharedPtr<MCString> MakeIdentifier(const char *text, int len, bool warn);
205
206
  // Truncates identifier to allowable length, NAMEDATALEN, if necessary.
207
  void TruncateIdentifier(const MCSharedPtr<MCString>& ident, bool warn);
208
209
  // Converts a char* to MCString allocated from the parse-tree memory pool (PTreeMem()).
210
165k
  MCSharedPtr<MCString> MakeString(const char *str) {
211
165k
    return MCMakeShared<MCString>(PTreeMem(), str);
212
165k
  }
213
214
  // Advances the current scanning location (cursor_) by the given number of bytes.
215
0
  void AdvanceCursor(int bytes) {
216
0
    cursor_ += bytes;
217
0
  }
218
219
  //------------------------------------------------------------------------------------------------
220
  // NOTE: All entities below this line in this module are copies of PostgreSQL's code. We made
221
  // some minor changes to avoid lint errors such as using '{' for if blocks, changing the comment
222
  // style from '/**/' to '//', and post-fixing data members with "_".
223
  //------------------------------------------------------------------------------------------------
224
  // Operations on literal buffers.
225
  void EnlargeLiteralBuf(size_t bytes);
226
  void startlit();
227
  void addlit(char *ytext, size_t yleng);
228
  void addlitchar(unsigned char ychar);
229
  char *litbuf_udeescape(unsigned char escape);
230
231
  // Unicode support.
232
  unsigned char unescape_single_char(unsigned char c);
233
  void addunicode(pg_wchar c);
234
  void check_string_escape_warning(unsigned char ychar);
235
  void check_escape_warning();
236
237
  //------------------------------------------------------------------------------------------------
238
  // The context in which the scanning process is running.
239
  ParseContext *parse_context_;
240
241
  // The rest of this class are scanning state variables, which are declared or used by PostgreSQL
242
  // structures, functions, and operations.
243
  location token_loc_;  // Current token location.
244
  GramProcessor::symbol_type lookahead_;  // Lookahead token.
245
  location cursor_;  // The current scanning location.
246
247
  // Literalbuf is used to accumulate literal values when multiple rules are needed to parse a
248
  // single literal.  Call startlit() to reset buffer to empty, addlit() to add text.
249
  // NOTE: The string in literalbuf is NOT necessarily null-terminated, but there always IS room to
250
  // add a trailing null at offset literallen.  We store a null only when we need it.
251
  char *literalbuf_;  // Temporary buffer for literal.
252
  size_t literallen_;  // Temporary buffer length.
253
  size_t literalalloc_;  // Temporary buffer size.
254
  int xcdepth_;  // Depth of nesting in slash-star comments.
255
  char *dolqstart_;  // Current $foo$ quote start string.
256
  int32_t utf16_first_part_;  // First of UTF16 surrogate unicode escape pair.
257
  bool warn_on_first_escape_;  // Literal-lexing warning for escape.
258
  bool saw_non_ascii_;  // Literal-lexing warning for non-ASCII.
259
260
  // Scanner settings to use.  These are initialized from the corresponding GUC variables by
261
  // scanner_init().  Callers can modify them after scanner_init() if they don't want the scanner's
262
  // behavior to follow the prevailing GUC settings.
263
  BackslashQuoteType backslash_quote_;  // State when scanning backslash.
264
  bool escape_string_warning_;  // State when scanning escape.
265
  bool standard_conforming_strings_;  // State when scanning standard string.
266
};
268
269
}  // namespace ql
270
}  // namespace yb
271
272
#endif  // YB_YQL_CQL_QL_PARSER_SCANNER_H_