/Users/deen/code/yugabyte-db/src/yb/gutil/strings/split.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2008 and onwards Google Inc. All rights reserved. |
2 | | // |
3 | | // The following only applies to changes made to this file as part of YugaByte development. |
4 | | // |
5 | | // Portions Copyright (c) YugaByte, Inc. |
6 | | // |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
8 | | // in compliance with the License. You may obtain a copy of the License at |
9 | | // |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // |
12 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
13 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
14 | | // or implied. See the License for the specific language governing permissions and limitations |
15 | | // under the License. |
16 | | // |
17 | | // Maintainer: Greg Miller <jgm@google.com> |
18 | | |
19 | | #include "yb/gutil/strings/split.h" |
20 | | |
21 | | #include <assert.h> |
22 | | #include <stdlib.h> |
23 | | #include <string.h> |
24 | | #include <iterator> |
25 | | using std::back_insert_iterator; |
26 | | using std::iterator_traits; |
27 | | #include <limits> |
28 | | using std::numeric_limits; |
29 | | |
30 | | #include "yb/gutil/integral_types.h" |
31 | | #include <glog/logging.h> |
32 | | #include "yb/gutil/logging-inl.h" |
33 | | #include "yb/gutil/macros.h" |
34 | | #include "yb/gutil/strtoint.h" |
35 | | #include "yb/gutil/strings/ascii_ctype.h" |
36 | | #include "yb/gutil/strings/util.h" |
37 | | #include "yb/gutil/hash/hash.h" |
38 | | |
39 | | // Implementations for some of the Split2 API. Much of the Split2 API is |
40 | | // templated so it exists in header files, either strings/split.h or |
41 | | // strings/split_internal.h. |
42 | | namespace strings { |
43 | | namespace delimiter { |
44 | | |
45 | | namespace { |
46 | | |
47 | | // This GenericFind() template function encapsulates the finding algorithm |
48 | | // shared between the Literal and AnyOf delimiters. The FindPolicy template |
49 | | // parameter allows each delimiter to customize the actual find function to use |
50 | | // and the length of the found delimiter. For example, the Literal delimiter |
51 | | // will ultimately use GStringPiece::find(), and the AnyOf delimiter will use |
52 | | // GStringPiece::find_first_of(). |
53 | | template <typename FindPolicy> |
54 | | GStringPiece GenericFind( |
55 | | GStringPiece text, |
56 | | GStringPiece delimiter, |
57 | 825k | FindPolicy find_policy) { |
58 | 825k | if (delimiter.empty() && text.length() > 0) { |
59 | | // Special case for empty string delimiters: always return a zero-length |
60 | | // GStringPiece referring to the item at position 1. |
61 | 0 | return GStringPiece(text.begin() + 1, 0); |
62 | 0 | } |
63 | 825k | auto found_pos = GStringPiece::npos; |
64 | 825k | GStringPiece found(text.end(), 0); // By default, not found |
65 | 825k | found_pos = find_policy.Find(text, delimiter); |
66 | 825k | if (found_pos != GStringPiece::npos) { |
67 | 468k | found.set(text.data() + found_pos, find_policy.Length(delimiter)); |
68 | 468k | } |
69 | 825k | return found; |
70 | 825k | } split.cc:_ZN7strings9delimiter12_GLOBAL__N_111GenericFindINS1_13LiteralPolicyEEE12GStringPieceS4_S4_T_ Line | Count | Source | 57 | 825k | FindPolicy find_policy) { | 58 | 825k | if (delimiter.empty() && text.length() > 0) { | 59 | | // Special case for empty string delimiters: always return a zero-length | 60 | | // GStringPiece referring to the item at position 1. | 61 | 0 | return GStringPiece(text.begin() + 1, 0); | 62 | 0 | } | 63 | 825k | auto found_pos = GStringPiece::npos; | 64 | 825k | GStringPiece found(text.end(), 0); // By default, not found | 65 | 825k | found_pos = find_policy.Find(text, delimiter); | 66 | 825k | if (found_pos != GStringPiece::npos) { | 67 | 468k | found.set(text.data() + found_pos, find_policy.Length(delimiter)); | 68 | 468k | } | 69 | 825k | return found; | 70 | 825k | } |
Unexecuted instantiation: split.cc:_ZN7strings9delimiter12_GLOBAL__N_111GenericFindINS1_11AnyOfPolicyEEE12GStringPieceS4_S4_T_ |
71 | | |
72 | | // Finds using GStringPiece::find(), therefore the length of the found delimiter |
73 | | // is delimiter.length(). |
74 | | struct LiteralPolicy { |
75 | 825k | size_t Find(GStringPiece text, GStringPiece delimiter) { |
76 | 825k | return text.find(delimiter); |
77 | 825k | } |
78 | 468k | size_t Length(GStringPiece delimiter) { |
79 | 468k | return delimiter.length(); |
80 | 468k | } |
81 | | }; |
82 | | |
83 | | // Finds using GStringPiece::find_first_of(), therefore the length of the found |
84 | | // delimiter is 1. |
85 | | struct AnyOfPolicy { |
86 | 0 | size_t Find(GStringPiece text, GStringPiece delimiter) { |
87 | 0 | return text.find_first_of(delimiter); |
88 | 0 | } |
89 | 0 | size_t Length(GStringPiece delimiter) { |
90 | 0 | return 1; |
91 | 0 | } |
92 | | }; |
93 | | |
94 | | } // namespace |
95 | | |
96 | | // |
97 | | // Literal |
98 | | // |
99 | | |
100 | 372k | Literal::Literal(GStringPiece sp) : delimiter_(sp.ToString()) { |
101 | 372k | } |
102 | | |
103 | 826k | GStringPiece Literal::Find(GStringPiece text) const { |
104 | 826k | return GenericFind(text, delimiter_, LiteralPolicy()); |
105 | 826k | } |
106 | | |
107 | | // |
108 | | // AnyOf |
109 | | // |
110 | | |
111 | 0 | AnyOf::AnyOf(GStringPiece sp) : delimiters_(sp.ToString()) { |
112 | 0 | } |
113 | | |
114 | 0 | GStringPiece AnyOf::Find(GStringPiece text) const { |
115 | 0 | return GenericFind(text, delimiters_, AnyOfPolicy()); |
116 | 0 | } |
117 | | |
118 | | } // namespace delimiter |
119 | | } // namespace strings |
120 | | |
121 | | // |
122 | | // ==================== LEGACY SPLIT FUNCTIONS ==================== |
123 | | // |
124 | | |
125 | | using ::strings::SkipEmpty; |
126 | | using ::strings::delimiter::AnyOf; |
127 | | using ::strings::delimiter::Limit; |
128 | | |
129 | | namespace { |
130 | | |
131 | | // Appends the results of a split to the specified container. This function has |
132 | | // the following overloads: |
133 | | // - vector<string> - for better performance |
134 | | // - map<string, string> - to change append semantics |
135 | | // - hash_map<string, string> - to change append semantics |
136 | | template <typename Container, typename Splitter> |
137 | 0 | void AppendToImpl(Container* container, Splitter splitter) { |
138 | 0 | Container c = splitter; // Calls implicit conversion operator. |
139 | 0 | std::copy(c.begin(), c.end(), std::inserter(*container, container->end())); |
140 | 0 | } Unexecuted instantiation: split.cc:_ZN12_GLOBAL__N_112AppendToImplINSt3__13setINS1_12basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEENS1_4lessIS8_EENS6_IS8_EEEEN7strings8internal8SplitterINSD_9delimiter5AnyOfENSD_9SkipEmptyEEEEEvPT_T0_ Unexecuted instantiation: split.cc:_ZN12_GLOBAL__N_112AppendToImplINSt3__16vectorI12GStringPieceNS1_9allocatorIS3_EEEEN7strings8internal8SplitterINS7_9delimiter5AnyOfENS7_9SkipEmptyEEEEEvPT_T0_ Unexecuted instantiation: split.cc:_ZN12_GLOBAL__N_112AppendToImplINSt3__16vectorI12GStringPieceNS1_9allocatorIS3_EEEEN7strings8internal8SplitterINS7_9delimiter5AnyOfENS8_8NoFilterEEEEEvPT_T0_ |
141 | | |
142 | | // Overload of AppendToImpl() that is optimized for appending to vector<string>. |
143 | | // This version eliminates a couple string copies by using a vector<GStringPiece> |
144 | | // as the intermediate container. |
145 | | template <typename Splitter> |
146 | 0 | void AppendToImpl(vector<string>* container, Splitter splitter) { |
147 | 0 | vector<GStringPiece> vsp = splitter; // Calls implicit conversion operator. |
148 | 0 | size_t container_size = container->size(); |
149 | 0 | container->resize(container_size + vsp.size()); |
150 | 0 | for (const auto& sp : vsp) { |
151 | 0 | sp.CopyToString(&(*container)[container_size++]); |
152 | 0 | } |
153 | 0 | } Unexecuted instantiation: split.cc:_ZN12_GLOBAL__N_112AppendToImplIN7strings8internal8SplitterINS1_9delimiter5AnyOfENS2_8NoFilterEEEEEvPNSt3__16vectorINS8_12basic_stringIcNS8_11char_traitsIcEENS8_9allocatorIcEEEENSD_ISF_EEEET_ Unexecuted instantiation: split.cc:_ZN12_GLOBAL__N_112AppendToImplIN7strings8internal8SplitterINS1_9delimiter9LimitImplINS4_5AnyOfEEENS2_8NoFilterEEEEEvPNSt3__16vectorINSA_12basic_stringIcNSA_11char_traitsIcEENSA_9allocatorIcEEEENSF_ISH_EEEET_ |
154 | | |
155 | | // Here we define two AppendToImpl() overloads for map<> and hash_map<>. Both of |
156 | | // these overloads call through to this AppendToMap() function. This is needed |
157 | | // because inserting a duplicate key into a map does NOT overwrite the previous |
158 | | // value, which was not the behavior of the split1 Split*() functions. Consider |
159 | | // this example: |
160 | | // |
161 | | // map<string, string> m; |
162 | | // m.insert(std::make_pair("a", "1")); |
163 | | // m.insert(std::make_pair("a", "2")); // <-- doesn't actually insert. |
164 | | // ASSERT_EQ(m["a"], "1"); // <-- "a" has value "1" not "2". |
165 | | // |
166 | | // Due to this behavior of map::insert, we can't rely on a normal std::inserter |
167 | | // for maps. Instead, maps and hash_maps need to be special cased to implement |
168 | | // the desired append semantic: inserting an existing key overwrites the |
169 | | // previous value. |
170 | | // |
171 | | // This same issue is true with sets as well. However, since sets don't have a |
172 | | // separate key and value, failing to overwrite an existing value in a set is |
173 | | // fine because the value already exists in the set. |
174 | | // |
175 | | template <typename Map, typename Splitter> |
176 | 0 | void AppendToMap(Map* m, Splitter splitter) { |
177 | 0 | Map tmp = splitter; // Calls implicit conversion operator. |
178 | 0 | for (typename Map::const_iterator it = tmp.begin(); it != tmp.end(); ++it) { |
179 | 0 | (*m)[it->first] = it->second; |
180 | 0 | } |
181 | 0 | } |
182 | | |
183 | | template <typename Splitter> |
184 | 0 | void AppendToImpl(map<string, string>* map_container, Splitter splitter) { |
185 | 0 | AppendToMap(map_container, splitter); |
186 | 0 | } |
187 | | |
188 | | // Appends the results of a call to strings::Split() to the specified container. |
189 | | // This function is used with the new strings::Split() API to implement the |
190 | | // append semantics of the legacy Split*() functions. |
191 | | // |
192 | | // The "Splitter" template parameter is intended to be a |
193 | | // ::strings::internal::Splitter<>, which is the return value of a call to |
194 | | // strings::Split(). Sample usage: |
195 | | // |
196 | | // vector<string> v; |
197 | | // ... add stuff to "v" ... |
198 | | // AppendTo(&v, strings::Split("a,b,c", ",")); |
199 | | // |
200 | | template <typename Container, typename Splitter> |
201 | 0 | void AppendTo(Container* container, Splitter splitter) { |
202 | 0 | if (container->empty()) { |
203 | | // "Appending" to an empty container is by far the common case. For this we |
204 | | // assign directly to the output container, which is more efficient than |
205 | | // explicitly appending. |
206 | 0 | *container = splitter; // Calls implicit conversion operator. |
207 | 0 | } else { |
208 | 0 | AppendToImpl(container, splitter); |
209 | 0 | } |
210 | 0 | } Unexecuted instantiation: split.cc:_ZN12_GLOBAL__N_18AppendToINSt3__16vectorINS1_12basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEENS6_IS8_EEEEN7strings8internal8SplitterINSB_9delimiter5AnyOfENSC_8NoFilterEEEEEvPT_T0_ Unexecuted instantiation: split.cc:_ZN12_GLOBAL__N_18AppendToINSt3__16vectorINS1_12basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEENS6_IS8_EEEEN7strings8internal8SplitterINSB_9delimiter9LimitImplINSE_5AnyOfEEENSC_8NoFilterEEEEEvPT_T0_ Unexecuted instantiation: split.cc:_ZN12_GLOBAL__N_18AppendToINSt3__13setINS1_12basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEENS1_4lessIS8_EENS6_IS8_EEEEN7strings8internal8SplitterINSD_9delimiter5AnyOfENSD_9SkipEmptyEEEEEvPT_T0_ Unexecuted instantiation: split.cc:_ZN12_GLOBAL__N_18AppendToINSt3__13mapINS1_12basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEES8_NS1_4lessIS8_EENS6_INS1_4pairIKS8_S8_EEEEEEN7strings8internal8SplitterINSG_9delimiter5AnyOfENSG_9SkipEmptyEEEEEvPT_T0_ Unexecuted instantiation: split.cc:_ZN12_GLOBAL__N_18AppendToINSt3__16vectorI12GStringPieceNS1_9allocatorIS3_EEEEN7strings8internal8SplitterINS7_9delimiter5AnyOfENS7_9SkipEmptyEEEEEvPT_T0_ Unexecuted instantiation: split.cc:_ZN12_GLOBAL__N_18AppendToINSt3__16vectorI12GStringPieceNS1_9allocatorIS3_EEEEN7strings8internal8SplitterINS7_9delimiter5AnyOfENS8_8NoFilterEEEEEvPT_T0_ |
211 | | |
212 | | } // anonymous namespace |
213 | | |
214 | | // Constants for ClipString() |
215 | | static const int kMaxOverCut = 12; |
216 | | // The ellipsis to add to strings that are too long |
217 | | static const char kCutStr[] = "..."; |
218 | | static const size_t kCutStrSize = sizeof(kCutStr) - 1; |
219 | | |
220 | | // ---------------------------------------------------------------------- |
221 | | // Return the place to clip the string at, or std::numeric_limits<size_t>::max() |
222 | | // if the string doesn't need to be clipped. |
223 | | // ---------------------------------------------------------------------- |
224 | 0 | static size_t ClipStringHelper(const char* str, size_t max_len, bool use_ellipsis) { |
225 | 0 | if (strlen(str) <= max_len) |
226 | 0 | return std::numeric_limits<size_t>::max(); |
227 | | |
228 | 0 | auto max_substr_len = max_len; |
229 | |
|
230 | 0 | if (use_ellipsis && max_len > kCutStrSize) { |
231 | 0 | max_substr_len -= kCutStrSize; |
232 | 0 | } |
233 | |
|
234 | 0 | const char* cut_by = |
235 | 0 | (max_substr_len < kMaxOverCut ? str : str + max_len - kMaxOverCut); |
236 | 0 | const char* cut_at = str + max_substr_len; |
237 | 0 | while (!ascii_isspace(*cut_at) && cut_at > cut_by) |
238 | 0 | cut_at--; |
239 | |
|
240 | 0 | if (cut_at == cut_by) { |
241 | | // No space was found |
242 | 0 | return max_substr_len; |
243 | 0 | } else { |
244 | 0 | return cut_at-str; |
245 | 0 | } |
246 | 0 | } |
247 | | |
248 | | // ---------------------------------------------------------------------- |
249 | | // ClipString |
250 | | // Clip a string to a max length. We try to clip on a word boundary |
251 | | // if this is possible. If the string is clipped, we append an |
252 | | // ellipsis. |
253 | | // ---------------------------------------------------------------------- |
254 | | |
255 | 0 | void ClipString(char* str, size_t max_len) { |
256 | 0 | auto cut_at = ClipStringHelper(str, max_len, true); |
257 | 0 | if (cut_at != std::numeric_limits<size_t>::max()) { |
258 | 0 | if (max_len > kCutStrSize) { |
259 | 0 | strcpy(str+cut_at, kCutStr); // NOLINT |
260 | 0 | } else { |
261 | 0 | strcpy(str+cut_at, ""); // NOLINT |
262 | 0 | } |
263 | 0 | } |
264 | 0 | } |
265 | | |
266 | | // ---------------------------------------------------------------------- |
267 | | // ClipString |
268 | | // Version of ClipString() that uses string instead of char*. |
269 | | // ---------------------------------------------------------------------- |
270 | 0 | void ClipString(string* full_str, size_t max_len) { |
271 | 0 | auto cut_at = ClipStringHelper(full_str->c_str(), max_len, true); |
272 | 0 | if (cut_at != std::numeric_limits<size_t>::max()) { |
273 | 0 | full_str->erase(cut_at); |
274 | 0 | if (max_len > kCutStrSize) { |
275 | 0 | full_str->append(kCutStr); |
276 | 0 | } |
277 | 0 | } |
278 | 0 | } |
279 | | |
280 | | // ---------------------------------------------------------------------- |
281 | | // SplitStringToIteratorAllowEmpty() |
282 | | // Split a string using a character delimiter. Append the components |
283 | | // to 'result'. If there are consecutive delimiters, this function |
284 | | // will return corresponding empty strings. The string is split into |
285 | | // at most the specified number of pieces greedily. This means that the |
286 | | // last piece may possibly be split further. To split into as many pieces |
287 | | // as possible, specify 0 as the number of pieces. |
288 | | // |
289 | | // If "full" is the empty string, yields an empty string as the only value. |
290 | | // |
291 | | // "pieces" is a size_t and so cannot be negative; if it is 1, the whole string is returned. |
292 | | // ---------------------------------------------------------------------- |
293 | | template <typename StringType, typename ITR> |
294 | | static inline |
295 | | void SplitStringToIteratorAllowEmpty(const StringType& full, |
296 | | const char* delim, |
297 | | size_t pieces, |
298 | | ITR& result) { // NOLINT |
299 | | string::size_type begin_index, end_index; |
300 | | begin_index = 0; |
301 | | |
302 | | for (size_t i = 0; (i < pieces-1) || (pieces == 0); i++) { |
303 | | end_index = full.find_first_of(delim, begin_index); |
304 | | if (end_index == string::npos) { |
305 | | *result++ = full.substr(begin_index); |
306 | | return; |
307 | | } |
308 | | *result++ = full.substr(begin_index, (end_index - begin_index)); |
309 | | begin_index = end_index + 1; |
310 | | } |
311 | | *result++ = full.substr(begin_index); |
312 | | } |
313 | | |
314 | | void SplitStringIntoNPiecesAllowEmpty(const string& full, |
315 | | const char* delim, |
316 | | size_t pieces, |
317 | 0 | vector<string>* result) { |
318 | 0 | if (pieces == 0) { |
319 | | // No limit when pieces is 0. |
320 | 0 | AppendTo(result, strings::Split(full, AnyOf(delim))); |
321 | 0 | } else { |
322 | | // The input argument "pieces" specifies the max size that *result should |
323 | | // be. However, the argument to the Limit() delimiter is the max number of |
324 | | // delimiters, which should be one less than "pieces". Example: "a,b,c" has |
325 | | // 3 pieces and two comma delimiters. |
326 | 0 | auto limit = std::max<size_t>(pieces - 1, 0); |
327 | 0 | AppendTo(result, strings::Split(full, Limit(AnyOf(delim), limit))); |
328 | 0 | } |
329 | 0 | } |
330 | | |
331 | | // ---------------------------------------------------------------------- |
332 | | // SplitStringAllowEmpty |
333 | | // Split a string using a character delimiter. Append the components |
334 | | // to 'result'. If there are consecutive delimiters, this function |
335 | | // will return corresponding empty strings. |
336 | | // ---------------------------------------------------------------------- |
337 | | void SplitStringAllowEmpty(const string& full, const char* delim, |
338 | 0 | vector<string>* result) { |
339 | 0 | AppendTo(result, strings::Split(full, AnyOf(delim))); |
340 | 0 | } |
341 | | |
342 | | // If we know how much to allocate for a vector of strings, we can |
343 | | // allocate the vector<string> only once and directly to the right size. |
344 | | // This saves between 33% and 66% of the memory space needed for the result, |
345 | | // and runs faster in the microbenchmarks. |
346 | | // |
347 | | // The reserve is only implemented for the single character delim. |
348 | | // |
349 | | // The implementation for counting is cut-and-pasted from |
350 | | // SplitStringToIteratorUsing. I could have written my own counting iterator, |
351 | | // and used the existing template function, but this is probably clearer |
352 | | // and more likely to get optimized to reasonable code. |
353 | 170 | static size_t CalculateReserveForVector(const string& full, const char* delim) { |
354 | 170 | size_t count = 0; |
355 | 170 | if (delim[0] != '\0' && delim[1] == '\0') { |
356 | | // Optimize the common case where delim is a single character. |
357 | 170 | char c = delim[0]; |
358 | 170 | const char* p = full.data(); |
359 | 170 | const char* end = p + full.size(); |
360 | 340 | while (p != end) { |
361 | 170 | if (*p == c) { // This could be optimized with hasless(v,1) trick. |
362 | 0 | ++p; |
363 | 170 | } else { |
364 | 7.41k | while (++p != end && *p != c) { |
365 | | // Skip to the next occurrence of the delimiter. |
366 | 7.24k | } |
367 | 170 | ++count; |
368 | 170 | } |
369 | 170 | } |
370 | 170 | } |
371 | 170 | return count; |
372 | 170 | } |
373 | | |
374 | | // ---------------------------------------------------------------------- |
375 | | // SplitStringUsing() |
376 | | // SplitStringToHashsetUsing() |
377 | | // SplitStringToSetUsing() |
378 | | // SplitStringToMapUsing() |
379 | | // SplitStringToHashmapUsing() |
380 | | // Split a string using a character delimiter. Append the components |
381 | | // to 'result'. |
382 | | // |
383 | | // Note: For multi-character delimiters, this routine will split on *ANY* of |
384 | | // the characters in the string, not the entire string as a single delimiter. |
385 | | // ---------------------------------------------------------------------- |
386 | | template <typename StringType, typename ITR> |
387 | | static inline |
388 | | void SplitStringToIteratorUsing(const StringType& full, |
389 | | const char* delim, |
390 | 170 | ITR& result) { // NOLINT |
391 | | // Optimize the common case where delim is a single character. |
392 | 170 | if (delim[0] != '\0' && delim[1] == '\0') { |
393 | 170 | char c = delim[0]; |
394 | 170 | const char* p = full.data(); |
395 | 170 | const char* end = p + full.size(); |
396 | 340 | while (p != end) { |
397 | 170 | if (*p == c) { |
398 | 0 | ++p; |
399 | 170 | } else { |
400 | 170 | const char* start = p; |
401 | 7.41k | while (++p != end && *p != c) { |
402 | | // Skip to the next occurrence of the delimiter. |
403 | 7.24k | } |
404 | 170 | *result++ = StringType(start, p - start); |
405 | 170 | } |
406 | 170 | } |
407 | 170 | return; |
408 | 170 | } |
409 | | |
410 | 0 | string::size_type begin_index, end_index; |
411 | 0 | begin_index = full.find_first_not_of(delim); |
412 | 0 | while (begin_index != string::npos) { |
413 | 0 | end_index = full.find_first_of(delim, begin_index); |
414 | 0 | if (end_index == string::npos) { |
415 | 0 | *result++ = full.substr(begin_index); |
416 | 0 | return; |
417 | 0 | } |
418 | 0 | *result++ = full.substr(begin_index, (end_index - begin_index)); |
419 | 0 | begin_index = full.find_first_not_of(delim, end_index); |
420 | 0 | } |
421 | 0 | } |
422 | | |
423 | | void SplitStringUsing(const string& full, |
424 | | const char* delim, |
425 | 170 | vector<string>* result) { |
426 | 170 | result->reserve(result->size() + CalculateReserveForVector(full, delim)); |
427 | 170 | std::back_insert_iterator< vector<string> > it(*result); |
428 | 170 | SplitStringToIteratorUsing(full, delim, it); |
429 | 170 | } |
430 | | |
431 | | void SplitStringToSetUsing(const string& full, const char* delim, |
432 | 0 | set<string>* result) { |
433 | 0 | AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); |
434 | 0 | } |
435 | | |
436 | | void SplitStringToMapUsing(const string& full, const char* delim, |
437 | 0 | map<string, string>* result) { |
438 | 0 | AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); |
439 | 0 | } |
440 | | |
441 | | // ---------------------------------------------------------------------- |
442 | | // SplitGStringPieceToVector() |
443 | | // Split a GStringPiece into sub-GStringPieces based on delim |
444 | | // and appends the pieces to 'vec'. |
445 | | // If omit empty strings is true, empty strings are omitted |
446 | | // from the resulting vector. |
447 | | // ---------------------------------------------------------------------- |
448 | | void SplitGStringPieceToVector(const GStringPiece& full, |
449 | | const char* delim, |
450 | | vector<GStringPiece>* vec, |
451 | 0 | bool omit_empty_strings) { |
452 | 0 | if (omit_empty_strings) { |
453 | 0 | AppendTo(vec, strings::Split(full, AnyOf(delim), SkipEmpty())); |
454 | 0 | } else { |
455 | 0 | AppendTo(vec, strings::Split(full, AnyOf(delim))); |
456 | 0 | } |
457 | 0 | } |
458 | | |
459 | | // ---------------------------------------------------------------------- |
460 | | // SplitUsing() |
461 | | // Split a string using a string of delimiters, returning vector |
462 | | // of strings. The original string is modified to insert nulls. |
463 | | // ---------------------------------------------------------------------- |
464 | | |
465 | 0 | vector<char*>* SplitUsing(char* full, const char* delim) { |
466 | 0 | auto vec = new vector<char*>; |
467 | 0 | SplitToVector(full, delim, vec, true); // Omit empty strings |
468 | 0 | return vec; |
469 | 0 | } |
470 | | |
471 | | void SplitToVector(char* full, const char* delim, vector<char*>* vec, |
472 | 0 | bool omit_empty_strings) { |
473 | 0 | char* next = full; |
474 | 0 | while ((next = gstrsep(&full, delim)) != nullptr) { |
475 | 0 | if (omit_empty_strings && next[0] == '\0') continue; |
476 | 0 | vec->push_back(next); |
477 | 0 | } |
478 | | // Add last element (or full string if no delimiter found): |
479 | 0 | if (full != nullptr) { |
480 | 0 | vec->push_back(full); |
481 | 0 | } |
482 | 0 | } |
483 | | |
484 | | void SplitToVector(char* full, const char* delim, vector<const char*>* vec, |
485 | 0 | bool omit_empty_strings) { |
486 | 0 | char* next = full; |
487 | 0 | while ((next = gstrsep(&full, delim)) != nullptr) { |
488 | 0 | if (omit_empty_strings && next[0] == '\0') continue; |
489 | 0 | vec->push_back(next); |
490 | 0 | } |
491 | | // Add last element (or full string if no delimiter found): |
492 | 0 | if (full != nullptr) { |
493 | 0 | vec->push_back(full); |
494 | 0 | } |
495 | 0 | } |
496 | | |
497 | | // ---------------------------------------------------------------------- |
498 | | // SplitOneStringToken() |
499 | | // Mainly a stringified wrapper around strpbrk() |
500 | | // ---------------------------------------------------------------------- |
501 | 0 | string SplitOneStringToken(const char** source, const char* delim) { |
502 | 0 | assert(source); |
503 | 0 | assert(delim); |
504 | 0 | if (!*source) { |
505 | 0 | return string(); |
506 | 0 | } |
507 | 0 | const char * begin = *source; |
508 | | // Optimize the common case where delim is a single character. |
509 | 0 | if (delim[0] != '\0' && delim[1] == '\0') { |
510 | 0 | *source = strchr(*source, delim[0]); |
511 | 0 | } else { |
512 | 0 | *source = strpbrk(*source, delim); |
513 | 0 | } |
514 | 0 | if (*source) { |
515 | 0 | return string(begin, (*source)++); |
516 | 0 | } else { |
517 | 0 | return string(begin); |
518 | 0 | } |
519 | 0 | } |
520 | | |
521 | | // ---------------------------------------------------------------------- |
522 | | // SplitStringWithEscaping() |
523 | | // SplitStringWithEscapingAllowEmpty() |
524 | | // SplitStringWithEscapingToSet() |
525 | | // SplitStringWithEscapingToHashset() |
526 | | // Split the string using the specified delimiters, taking escaping into |
527 | | // account. '\' is not allowed as a delimiter. |
528 | | // ---------------------------------------------------------------------- |
529 | | template <typename ITR> |
530 | | static inline |
531 | | void SplitStringWithEscapingToIterator(const string& src, |
532 | | const strings::CharSet& delimiters, |
533 | | const bool allow_empty, |
534 | 0 | ITR* result) { |
535 | 0 | CHECK(!delimiters.Test('\\')) << "\\ is not allowed as a delimiter."; |
536 | 0 | CHECK(result); |
537 | 0 | string part; |
538 | |
|
539 | 0 | for (uint32 i = 0; i < src.size(); ++i) { |
540 | 0 | char current_char = src[i]; |
541 | 0 | if (delimiters.Test(current_char)) { |
542 | | // Push substrings when we encounter delimiters. |
543 | 0 | if (allow_empty || !part.empty()) { |
544 | 0 | *(*result)++ = part; |
545 | 0 | part.clear(); |
546 | 0 | } |
547 | 0 | } else if (current_char == '\\' && ++i < src.size()) { |
548 | | // If we see a backslash, the next delimiter or backslash is literal. |
549 | 0 | current_char = src[i]; |
550 | 0 | if (current_char != '\\' && !delimiters.Test(current_char)) { |
551 | | // Don't honour unknown escape sequences: emit \f for \f. |
552 | 0 | part.push_back('\\'); |
553 | 0 | } |
554 | 0 | part.push_back(current_char); |
555 | 0 | } else { |
556 | | // Otherwise, we have a normal character or trailing backslash. |
557 | 0 | part.push_back(current_char); |
558 | 0 | } |
559 | 0 | } |
560 | | |
561 | | // Push the trailing part. |
562 | 0 | if (allow_empty || !part.empty()) { |
563 | 0 | *(*result)++ = part; |
564 | 0 | } |
565 | 0 | } Unexecuted instantiation: split.cc:_ZL33SplitStringWithEscapingToIteratorINSt3__120back_insert_iteratorINS0_6vectorINS0_12basic_stringIcNS0_11char_traitsIcEENS0_9allocatorIcEEEENS6_IS8_EEEEEEEvRKS8_RKN7strings7CharSetEbPT_ Unexecuted instantiation: split.cc:_ZL33SplitStringWithEscapingToIteratorINSt3__115insert_iteratorINS0_3setINS0_12basic_stringIcNS0_11char_traitsIcEENS0_9allocatorIcEEEENS0_4lessIS8_EENS6_IS8_EEEEEEEvRKS8_RKN7strings7CharSetEbPT_ |
566 | | |
567 | | void SplitStringWithEscaping(const string &full, |
568 | | const strings::CharSet& delimiters, |
569 | 0 | vector<string> *result) { |
570 | 0 | std::back_insert_iterator< vector<string> > it(*result); |
571 | 0 | SplitStringWithEscapingToIterator(full, delimiters, false, &it); |
572 | 0 | } |
573 | | |
574 | | void SplitStringWithEscapingAllowEmpty(const string &full, |
575 | | const strings::CharSet& delimiters, |
576 | 0 | vector<string> *result) { |
577 | 0 | std::back_insert_iterator< vector<string> > it(*result); |
578 | 0 | SplitStringWithEscapingToIterator(full, delimiters, true, &it); |
579 | 0 | } |
580 | | |
581 | | void SplitStringWithEscapingToSet(const string &full, |
582 | | const strings::CharSet& delimiters, |
583 | 0 | set<string> *result) { |
584 | 0 | std::insert_iterator< set<string> > it(*result, result->end()); |
585 | 0 | SplitStringWithEscapingToIterator(full, delimiters, false, &it); |
586 | 0 | } |
587 | | |
588 | | // ---------------------------------------------------------------------- |
589 | | // SplitOneIntToken() |
590 | | // SplitOneInt32Token() |
591 | | // SplitOneUint32Token() |
592 | | // SplitOneInt64Token() |
593 | | // SplitOneUint64Token() |
594 | | // SplitOneDoubleToken() |
595 | | // SplitOneFloatToken() |
596 | | // SplitOneDecimalIntToken() |
597 | | // SplitOneDecimalInt32Token() |
598 | | // SplitOneDecimalUint32Token() |
599 | | // SplitOneDecimalInt64Token() |
600 | | // SplitOneDecimalUint64Token() |
601 | | // SplitOneHexUint32Token() |
602 | | // SplitOneHexUint64Token() |
603 | | // Mainly a stringified wrapper around strtol/strtoul/strtod |
604 | | // ---------------------------------------------------------------------- |
605 | | // Curried functions for the macro below |
606 | 0 | static inline int32_t strto32_0(const char* source, char** end) { |
607 | 0 | return strto32(source, end, 0); } |
608 | 0 | static inline uint32_t strtou32_0(const char* source, char** end) { |
609 | 0 | return strtou32(source, end, 0); } |
610 | 0 | static inline int64 strto64_0(const char* source, char** end) { |
611 | 0 | return strto64(source, end, 0); } |
612 | 0 | static inline uint64 strtou64_0(const char* source, char** end) { |
613 | 0 | return strtou64(source, end, 0); } |
614 | 0 | static inline int32_t strto32_10(const char* source, char** end) { |
615 | 0 | return strto32(source, end, 10); } |
616 | 0 | static inline uint32_t strtou32_10(const char* source, char** end) { |
617 | 0 | return strtou32(source, end, 10); } |
618 | 0 | static inline int64 strto64_10(const char* source, char** end) { |
619 | 0 | return strto64(source, end, 10); } |
620 | 0 | static inline uint64 strtou64_10(const char* source, char** end) { |
621 | 0 | return strtou64(source, end, 10); } |
622 | 0 | static inline uint32 strtou32_16(const char* source, char** end) { |
623 | 0 | return strtou32(source, end, 16); } |
624 | 0 | static inline uint64 strtou64_16(const char* source, char** end) { |
625 | 0 | return strtou64(source, end, 16); } |
626 | | |
// Shared implementation behind every SplitOne<Name>Token() function.
//
// Parses one number from the start of `*source` using `parse`, a callable
// with the strtod-like shape `parse(const char* str, char** end)`.
//
//   source - in/out cursor.  If `*source` is null the call fails immediately.
//            On success it is advanced to the character after the delimiter
//            that ended the token, or set to nullptr if the number ran to the
//            end of the string.  On failure it is left untouched.
//   delim  - NUL-terminated set of characters accepted right after the number.
//   value  - receives the parsed number.  Note that it is written as soon as
//            `parse` returns, so it may be modified even when the function
//            goes on to return false.
//
// Returns true iff a number was present at the start of the string and was
// followed either by the end of the string or by one of the `delim`
// characters.
template <typename T, typename ParseFn>
static inline bool SplitOneNumberTokenImpl(const char** source,
                                           const char* delim,
                                           T* value,
                                           ParseFn parse) {
  assert(source);
  assert(delim);
  assert(value);
  if (!*source) {
    return false;
  }
  // Parse the number.
  char* end;
  *value = static_cast<T>(parse(*source, &end));
  if (end == *source) {
    return false;  // number not present at start of string
  }
  if (end[0] && !strchr(delim, end[0])) {
    return false;  // garbage characters after the number
  }
  // Advance past the token (and its delimiter, if there was one).
  *source = (*end != '\0') ? end + 1 : nullptr;
  return true;
}

// Defines `bool SplitOne<name>Token(const char** source, const char* delim,
// type* value)` as a thin forwarder to SplitOneNumberTokenImpl, using
// `function` as the number parser.
#define DEFINE_SPLIT_ONE_NUMBER_TOKEN(name, type, function)               \
  bool SplitOne##name##Token(const char** source, const char* delim,      \
                             type* value) {                               \
    return SplitOneNumberTokenImpl<type>(source, delim, value, function); \
  }
651 | | |
652 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int, int, strto32_0) |
653 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int32, int32, strto32_0) |
654 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint32, uint32, strtou32_0) |
655 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int64, int64, strto64_0) |
656 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint64, uint64, strtou64_0) |
657 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Double, double, strtod) |
658 | | #ifdef _MSC_VER // has no strtof() |
659 | | // Note: does an implicit cast to float. |
660 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtod) |
661 | | #else |
662 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtof) |
663 | | #endif |
664 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt, int, strto32_10) |
665 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt32, int32, strto32_10) |
666 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint32, uint32, strtou32_10) |
667 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt64, int64, strto64_10) |
668 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint64, uint64, strtou64_10) |
669 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint32, uint32, strtou32_16) |
670 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint64, uint64, strtou64_16) |
671 | | |
672 | | |
673 | | // ---------------------------------------------------------------------- |
674 | | // SplitRange() |
675 | | // Splits a string of the form "<from>-<to>". Either or both can be |
676 | | // missing. A raw number (<to>) is interpreted as "<to>-". Modifies |
677 | | // parameters insofar as they're specified by the string. RETURNS |
678 | | // true iff the input is a well-formed range. If it RETURNS false, |
679 | | // from and to remain unchanged. The range in rangestr should be |
680 | | // terminated either by "\0" or by whitespace. |
681 | | // ---------------------------------------------------------------------- |
682 | | |
683 | 0 | #define EOS(ch) ( (ch) == '\0' || ascii_isspace(ch) ) |
684 | 0 | bool SplitRange(const char* rangestr, int* from, int* to) { |
685 | | // We need to do the const-cast because strol takes a char**, not const char** |
686 | 0 | char* val = const_cast<char*>(rangestr); |
687 | 0 | if (val == nullptr || EOS(*val)) return true; // we'll say nothingness is ok |
688 | | |
689 | 0 | if ( val[0] == '-' && EOS(val[1]) ) // CASE 1: - |
690 | 0 | return true; // nothing changes |
691 | | |
692 | 0 | if ( val[0] == '-' ) { // CASE 2: -<i2> |
693 | 0 | const int int2 = strto32(val+1, &val, 10); |
694 | 0 | if ( !EOS(*val) ) return false; // not a valid integer |
695 | 0 | *to = int2; // only "to" changes |
696 | 0 | return true; |
697 | |
|
698 | 0 | } else { |
699 | 0 | const int int1 = strto32(val, &val, 10); |
700 | 0 | if ( EOS(*val) || (*val == '-' && EOS(*(val+1))) ) { |
701 | 0 | *from = int1; // CASE 3: <i1>, same as <i1>- |
702 | 0 | return true; // only "from" changes |
703 | 0 | } else if (*val != '-') { // not a valid range |
704 | 0 | return false; |
705 | 0 | } |
706 | 0 | const int int2 = strto32(val+1, &val, 10); |
707 | 0 | if ( !EOS(*val) ) return false; // not a valid integer |
708 | 0 | *from = int1; // CASE 4: <i1>-<i2> |
709 | 0 | *to = int2; |
710 | 0 | return true; |
711 | 0 | } |
712 | 0 | } |
713 | | |
714 | | void SplitCSVLineWithDelimiter(char* line, char delimiter, |
715 | 0 | vector<char*>* cols) { |
716 | 0 | char* end_of_line = line + strlen(line); |
717 | 0 | char* end; |
718 | 0 | char* start; |
719 | |
|
720 | 0 | for (; line < end_of_line; line++) { |
721 | | // Skip leading whitespace, unless said whitespace is the delimiter. |
722 | 0 | while (ascii_isspace(*line) && *line != delimiter) |
723 | 0 | ++line; |
724 | |
|
725 | 0 | if (*line == '"' && delimiter == ',') { // Quoted value... |
726 | 0 | start = ++line; |
727 | 0 | end = start; |
728 | 0 | for (; *line; line++) { |
729 | 0 | if (*line == '"') { |
730 | 0 | line++; |
731 | 0 | if (*line != '"') // [""] is an escaped ["] |
732 | 0 | break; // but just ["] is end of value |
733 | 0 | } |
734 | 0 | *end++ = *line; |
735 | 0 | } |
736 | | // All characters after the closing quote and before the comma |
737 | | // are ignored. |
738 | 0 | line = strchr(line, delimiter); |
739 | 0 | if (!line) line = end_of_line; |
740 | 0 | } else { |
741 | 0 | start = line; |
742 | 0 | line = strchr(line, delimiter); |
743 | 0 | if (!line) line = end_of_line; |
744 | | // Skip all trailing whitespace, unless said whitespace is the delimiter. |
745 | 0 | for (end = line; end > start; --end) { |
746 | 0 | if (!ascii_isspace(end[-1]) || end[-1] == delimiter) |
747 | 0 | break; |
748 | 0 | } |
749 | 0 | } |
750 | 0 | const bool need_another_column = |
751 | 0 | (*line == delimiter) && (line == end_of_line - 1); |
752 | 0 | *end = '\0'; |
753 | 0 | cols->push_back(start); |
754 | | // If line was something like [paul,] (comma is the last character |
755 | | // and is not proceeded by whitespace or quote) then we are about |
756 | | // to eliminate the last column (which is empty). This would be |
757 | | // incorrect. |
758 | 0 | if (need_another_column) |
759 | 0 | cols->push_back(end); |
760 | |
|
761 | 0 | assert(*line == '\0' || *line == delimiter); |
762 | 0 | } |
763 | 0 | } |
764 | | |
765 | 0 | void SplitCSVLine(char* line, vector<char*>* cols) { |
766 | 0 | SplitCSVLineWithDelimiter(line, ',', cols); |
767 | 0 | } |
768 | | |
769 | | void SplitCSVLineWithDelimiterForStrings(const string &line, |
770 | | char delimiter, |
771 | 0 | vector<string> *cols) { |
772 | | // Unfortunately, the interface requires char* instead of const char* |
773 | | // which requires copying the string. |
774 | 0 | char *cline = strndup_with_new(line.c_str(), line.size()); |
775 | 0 | vector<char *> v; |
776 | 0 | SplitCSVLineWithDelimiter(cline, delimiter, &v); |
777 | 0 | for (vector<char*>::const_iterator ci = v.begin(); ci != v.end(); ++ci) { |
778 | 0 | cols->push_back(*ci); |
779 | 0 | } |
780 | 0 | delete[] cline; |
781 | 0 | } |
782 | | |
783 | | // ---------------------------------------------------------------------- |
784 | | namespace { |
785 | | |
// Helper class used by SplitStructuredLineInternal.
//
// Built from a string of opening/closing character pairs (e.g. "()[]"), it
// answers two questions in constant time: "which character closes this
// opening character?" and "is this character a closing character?".
class ClosingSymbolLookup {
 public:
  // `symbol_pairs` is read two characters at a time: an opening character
  // followed by its closing character.  If the string ends right after an
  // opening character, that character also serves as its own closing one.
  explicit ClosingSymbolLookup(const char* symbol_pairs) {
    const char* cursor = symbol_pairs;
    while (*cursor != '\0') {
      const unsigned char open_ch = static_cast<unsigned char>(cursor[0]);
      const bool has_partner = (cursor[1] != '\0');
      const unsigned char close_ch =
          has_partner ? static_cast<unsigned char>(cursor[1]) : open_ch;
      closing_[open_ch] = static_cast<char>(close_ch);
      valid_closing_[close_ch] = true;
      if (!has_partner) {
        break;
      }
      cursor += 2;
    }
  }

  // Returns the closing character corresponding to an opening one,
  // or 0 if the argument is not an opening character.
  char GetClosingChar(char opening) const {
    const unsigned char index = static_cast<unsigned char>(opening);
    return closing_[index];
  }

  // Returns true if the argument is a closing character.
  bool IsClosing(char c) const {
    const unsigned char index = static_cast<unsigned char>(c);
    return valid_closing_[index];
  }

 private:
  // Maps an opening character to its closing one.  A 0 entry means the
  // character is not in the opening set.
  char closing_[256] = {};
  // Marks the characters that are valid closing characters.
  bool valid_closing_[256] = {};

  // Not copyable or assignable.
  ClosingSymbolLookup(const ClosingSymbolLookup&) = delete;
  void operator=(const ClosingSymbolLookup&) = delete;
};
825 | | |
826 | | char* SplitStructuredLineInternal(char* line, |
827 | | char delimiter, |
828 | | const char* symbol_pairs, |
829 | | vector<char*>* cols, |
830 | 0 | bool with_escapes) { |
831 | 0 | ClosingSymbolLookup lookup(symbol_pairs); |
832 | | |
833 | | // Stack of symbols expected to close the current opened expressions. |
834 | 0 | vector<char> expected_to_close; |
835 | 0 | bool in_escape = false; |
836 | |
|
837 | 0 | CHECK(cols); |
838 | 0 | cols->push_back(line); |
839 | 0 | char* current; |
840 | 0 | for (current = line; *current; ++current) { |
841 | 0 | char c = *current; |
842 | 0 | if (in_escape) { |
843 | 0 | in_escape = false; |
844 | 0 | } else if (with_escapes && c == '\\') { |
845 | | // We are escaping the next character. Note the escape still appears |
846 | | // in the output. |
847 | 0 | in_escape = true; |
848 | 0 | } else if (expected_to_close.empty() && c == delimiter) { |
849 | | // We don't have any open expression, this is a valid separator. |
850 | 0 | *current = 0; |
851 | 0 | cols->push_back(current + 1); |
852 | 0 | } else if (!expected_to_close.empty() && c == expected_to_close.back()) { |
853 | | // Can we close the currently open expression? |
854 | 0 | expected_to_close.pop_back(); |
855 | 0 | } else if (lookup.GetClosingChar(c)) { |
856 | | // If this is an opening symbol, we open a new expression and push |
857 | | // the expected closing symbol on the stack. |
858 | 0 | expected_to_close.push_back(lookup.GetClosingChar(c)); |
859 | 0 | } else if (lookup.IsClosing(c)) { |
860 | | // Error: mismatched closing symbol. |
861 | 0 | return current; |
862 | 0 | } |
863 | 0 | } |
864 | 0 | if (!expected_to_close.empty()) { |
865 | 0 | return current; // Missing closing symbol(s) |
866 | 0 | } |
867 | 0 | return nullptr; // Success |
868 | 0 | } |
869 | | |
870 | | bool SplitStructuredLineInternal(GStringPiece line, |
871 | | char delimiter, |
872 | | const char* symbol_pairs, |
873 | | vector<GStringPiece>* cols, |
874 | 0 | bool with_escapes) { |
875 | 0 | ClosingSymbolLookup lookup(symbol_pairs); |
876 | | |
877 | | // Stack of symbols expected to close the current opened expressions. |
878 | 0 | vector<char> expected_to_close; |
879 | 0 | bool in_escape = false; |
880 | |
|
881 | 0 | CHECK_NOTNULL(cols); |
882 | 0 | cols->push_back(line); |
883 | 0 | for (size_t i = 0; i < line.size(); ++i) { |
884 | 0 | char c = line[i]; |
885 | 0 | if (in_escape) { |
886 | 0 | in_escape = false; |
887 | 0 | } else if (with_escapes && c == '\\') { |
888 | | // We are escaping the next character. Note the escape still appears |
889 | | // in the output. |
890 | 0 | in_escape = true; |
891 | 0 | } else if (expected_to_close.empty() && c == delimiter) { |
892 | | // We don't have any open expression, this is a valid separator. |
893 | 0 | cols->back().remove_suffix(line.size() - i); |
894 | 0 | cols->push_back(GStringPiece(line, i + 1)); |
895 | 0 | } else if (!expected_to_close.empty() && c == expected_to_close.back()) { |
896 | | // Can we close the currently open expression? |
897 | 0 | expected_to_close.pop_back(); |
898 | 0 | } else if (lookup.GetClosingChar(c)) { |
899 | | // If this is an opening symbol, we open a new expression and push |
900 | | // the expected closing symbol on the stack. |
901 | 0 | expected_to_close.push_back(lookup.GetClosingChar(c)); |
902 | 0 | } else if (lookup.IsClosing(c)) { |
903 | | // Error: mismatched closing symbol. |
904 | 0 | return false; |
905 | 0 | } |
906 | 0 | } |
907 | 0 | if (!expected_to_close.empty()) { |
908 | 0 | return false; // Missing closing symbol(s) |
909 | 0 | } |
910 | 0 | return true; // Success |
911 | 0 | } |
912 | | |
913 | | } // anonymous namespace |
914 | | |
915 | | char* SplitStructuredLine(char* line, |
916 | | char delimiter, |
917 | | const char *symbol_pairs, |
918 | 0 | vector<char*>* cols) { |
919 | 0 | return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, |
920 | 0 | false); |
921 | 0 | } |
922 | | |
923 | | bool SplitStructuredLine(GStringPiece line, |
924 | | char delimiter, |
925 | | const char* symbol_pairs, |
926 | 0 | vector<GStringPiece>* cols) { |
927 | 0 | return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, |
928 | 0 | false); |
929 | 0 | } |
930 | | |
931 | | char* SplitStructuredLineWithEscapes(char* line, |
932 | | char delimiter, |
933 | | const char *symbol_pairs, |
934 | 0 | vector<char*>* cols) { |
935 | 0 | return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, |
936 | 0 | true); |
937 | 0 | } |
938 | | |
939 | | bool SplitStructuredLineWithEscapes(GStringPiece line, |
940 | | char delimiter, |
941 | | const char* symbol_pairs, |
942 | 0 | vector<GStringPiece>* cols) { |
943 | 0 | return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, |
944 | 0 | true); |
945 | 0 | } |
946 | | |
947 | | |
948 | | // ---------------------------------------------------------------------- |
949 | | // SplitStringIntoKeyValues() |
950 | | // ---------------------------------------------------------------------- |
951 | | bool SplitStringIntoKeyValues(const string& line, |
952 | | const string& key_value_delimiters, |
953 | | const string& value_value_delimiters, |
954 | 0 | string *key, vector<string> *values) { |
955 | 0 | key->clear(); |
956 | 0 | values->clear(); |
957 | | |
958 | | // find the key string |
959 | 0 | size_t end_key_pos = line.find_first_of(key_value_delimiters); |
960 | 0 | if (end_key_pos == string::npos) { |
961 | 0 | VLOG(1) << "cannot parse key from line: " << line; |
962 | 0 | return false; // no key |
963 | 0 | } |
964 | 0 | key->assign(line, 0, end_key_pos); |
965 | | |
966 | | // find the values string |
967 | 0 | string remains(line, end_key_pos, line.size() - end_key_pos); |
968 | 0 | size_t begin_values_pos = remains.find_first_not_of(key_value_delimiters); |
969 | 0 | if (begin_values_pos == string::npos) { |
970 | 0 | VLOG(1) << "cannot parse value from line: " << line; |
971 | 0 | return false; // no value |
972 | 0 | } |
973 | 0 | string values_string(remains, |
974 | 0 | begin_values_pos, |
975 | 0 | remains.size() - begin_values_pos); |
976 | | |
977 | | // construct the values vector |
978 | 0 | if (value_value_delimiters.empty()) { // one value |
979 | 0 | values->push_back(values_string); |
980 | 0 | } else { // multiple values |
981 | 0 | SplitStringUsing(values_string, value_value_delimiters.c_str(), values); |
982 | 0 | if (values->size() < 1) { |
983 | 0 | VLOG(1) << "cannot parse value from line: " << line; |
984 | 0 | return false; // no value |
985 | 0 | } |
986 | 0 | } |
987 | 0 | return true; |
988 | 0 | } |
989 | | |
990 | | bool SplitStringIntoKeyValuePairs(const string& line, |
991 | | const string& key_value_delimiters, |
992 | | const string& key_value_pair_delimiters, |
993 | 0 | vector<pair<string, string> >* kv_pairs) { |
994 | 0 | kv_pairs->clear(); |
995 | |
|
996 | 0 | vector<string> pairs; |
997 | 0 | SplitStringUsing(line, key_value_pair_delimiters.c_str(), &pairs); |
998 | |
|
999 | 0 | bool success = true; |
1000 | 0 | for (const auto& pair : pairs) { |
1001 | 0 | string key; |
1002 | 0 | vector<string> value; |
1003 | 0 | if (!SplitStringIntoKeyValues(pair, |
1004 | 0 | key_value_delimiters, |
1005 | 0 | "", &key, &value)) { |
1006 | | // Don't return here, to allow for keys without associated |
1007 | | // values; just record that our split failed. |
1008 | 0 | success = false; |
1009 | 0 | } |
1010 | | // we expect atmost one value because we passed in an empty vsep to |
1011 | | // SplitStringIntoKeyValues |
1012 | 0 | DCHECK_LE(value.size(), 1); |
1013 | 0 | kv_pairs->push_back(make_pair(key, value.empty()? "" : value[0])); |
1014 | 0 | } |
1015 | 0 | return success; |
1016 | 0 | } |
1017 | | |
1018 | | // ---------------------------------------------------------------------- |
1019 | | // SplitLeadingDec32Values() |
1020 | | // SplitLeadingDec64Values() |
1021 | | // A simple parser for space-separated decimal int32/int64 values. |
1022 | | // Appends parsed integers to the end of the result vector, stopping |
1023 | | // at the first unparsable spot. Skips past leading and repeated |
1024 | | // whitespace (does not consume trailing whitespace), and returns |
1025 | | // a pointer beyond the last character parsed. |
1026 | | // -------------------------------------------------------------------- |
1027 | 0 | const char* SplitLeadingDec32Values(const char *str, vector<int32> *result) { |
1028 | 0 | for (;;) { |
1029 | 0 | char *end = nullptr; |
1030 | 0 | int64_t value = strtol(str, &end, 10); |
1031 | 0 | if (end == str) |
1032 | 0 | break; |
1033 | | // Limit long values to int32 min/max. Needed for lp64. |
1034 | 0 | if (value > numeric_limits<int32>::max()) { |
1035 | 0 | value = numeric_limits<int32>::max(); |
1036 | 0 | } else if (value < numeric_limits<int32>::min()) { |
1037 | 0 | value = numeric_limits<int32>::min(); |
1038 | 0 | } |
1039 | 0 | result->push_back(narrow_cast<int32>(value)); |
1040 | 0 | str = end; |
1041 | 0 | if (!ascii_isspace(*end)) |
1042 | 0 | break; |
1043 | 0 | } |
1044 | 0 | return str; |
1045 | 0 | } |
1046 | | |
1047 | 0 | const char* SplitLeadingDec64Values(const char *str, vector<int64> *result) { |
1048 | 0 | for (;;) { |
1049 | 0 | char *end = nullptr; |
1050 | 0 | const int64 value = strtoll(str, &end, 10); |
1051 | 0 | if (end == str) |
1052 | 0 | break; |
1053 | 0 | result->push_back(value); |
1054 | 0 | str = end; |
1055 | 0 | if (!ascii_isspace(*end)) |
1056 | 0 | break; |
1057 | 0 | } |
1058 | 0 | return str; |
1059 | 0 | } |
1060 | | |
1061 | | void SplitStringToLines(const char* full, |
1062 | | size_t max_len, |
1063 | | size_t num_lines, |
1064 | 0 | vector<string>* result) { |
1065 | 0 | if (max_len <= 0) { |
1066 | 0 | return; |
1067 | 0 | } |
1068 | 0 | size_t pos = 0; |
1069 | 0 | for (size_t i = 0; (i < num_lines || num_lines <= 0); i++) { |
1070 | 0 | auto cut_at = ClipStringHelper(full+pos, max_len, (i == num_lines - 1)); |
1071 | 0 | if (cut_at == std::numeric_limits<size_t>::max()) { |
1072 | 0 | result->push_back(string(full+pos)); |
1073 | 0 | return; |
1074 | 0 | } |
1075 | 0 | result->push_back(string(full+pos, cut_at)); |
1076 | 0 | if (i == num_lines - 1 && max_len > kCutStrSize) { |
1077 | 0 | result->at(i).append(kCutStr); |
1078 | 0 | } |
1079 | 0 | pos += cut_at; |
1080 | 0 | } |
1081 | 0 | } |