/Users/deen/code/yugabyte-db/src/yb/gutil/utf/rune.c

Source (jump to first uncovered line)
/*
 * The authors of this software are Rob Pike and Ken Thompson.
 *              Copyright (c) 2002 by Lucent Technologies.
 * Permission to use, copy, modify, and distribute this software for any
 * purpose without fee is hereby granted, provided that this entire notice
 * is included in all copies of any software which is or includes a copy
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
 * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
//
// The following only applies to changes made to this file as part of YugaByte development.
//
// Portions Copyright (c) YugaByte, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied.  See the License for the specific language governing permissions and limitations
// under the License.
//
#include <stdarg.h>
#include <string.h>
#include "yb/gutil/utf/utf.h"
#include "yb/gutil/utf/utfdef.h"

/*
 * Bit-layout constants shared by the UTF-8 encoder and decoder below.
 *
 *  BitN   payload bits carried by the lead byte of an N-byte sequence
 *  Bitx   payload bits carried by each continuation byte
 *  TN     lead-byte tag of an N-byte sequence (payload bits all zero)
 *  Tx     continuation-byte tag
 *  RuneN  largest value that fits in the payload bits of an N-byte sequence
 */
enum
{
  Bit1 = 7,
  Bitx = 6,
  Bit2 = 5,
  Bit3 = 4,
  Bit4 = 3,
  Bit5 = 2, /* only used to derive T5, the first lead byte that is rejected */

  T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
  Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
  T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
  T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
  T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
  T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */

  Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 (0x7F) */
  Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 (0x7FF) */
  Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 (0xFFFF) */
  Rune4 = (1<<(Bit4+3*Bitx))-1,
                                        /* 0001 1111 1111 1111 1111 1111 (0x1FFFFF) */

  Maskx = (1<<Bitx)-1,  /* 0011 1111: payload bits of a continuation byte */
  Testx = Maskx ^ 0xFF, /* 1100 0000: must be clear in (byte ^ Tx) for a valid continuation byte */

  Bad = Runeerror, /* value stored in *rune when decoding fails */
};

/*
 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
 * This is a slower but "safe" version of the old chartorune
 * that works on strings that are not necessarily null-terminated.
 *
 * If you know for sure that your string is null-terminated,
 * chartorune will be a bit faster.
 *
 * It is guaranteed never to read more than "length" bytes
 * starting at the incoming pointer.  This is to avoid
 * possible access violations.  If the string appears to be
 * well-formed but incomplete (i.e., to get the whole Rune
 * we'd need to read past str+length) then we'll set the Rune
 * to Bad and return 0.
 *
 * Note that if we have decoding problems for other
 * reasons, we return 1 instead of 0.
 */
int
charntorune(Rune *rune, const char *str, int length)
{
  /*
   * Decode one UTF-8 sequence from at most "length" bytes at str.
   *
   * On success the decoded value is stored in *rune and the number of
   * bytes consumed (1-4) is returned.  A malformed sequence stores Bad
   * and returns 1.  If another byte is needed but "length" does not
   * cover it, Bad is stored and 0 is returned.  Every byte is read only
   * after "length" has been checked to include it.
   *
   * Decoded values are rejected when they would have fit in a shorter
   * sequence (overlong forms).  Four-byte values are not checked
   * against any upper limit other than the Rune4 mask.
   */
  const uchar *in = (const uchar*)str;
  int lead, k1, k2, k3;
  int value;

  /* Nothing may be read at all. */
  if (length < 1) {
    *rune = Bad;
    return 0;
  }

  /*
   * one byte (7-bit value)
   *  00000-0007F => T1
   */
  lead = in[0];
  if (lead < Tx) {
    *rune = lead;
    return 1;
  }

  /* Everything else needs a second byte. */
  if (length < 2) {
    *rune = Bad;
    return 0;
  }

  /*
   * XOR with Tx turns a valid continuation byte into its 6 payload
   * bits; any Testx bit left set means the byte was not 10xxxxxx.
   * A lead byte below T2 is itself a continuation byte and is bad.
   */
  k1 = in[1] ^ Tx;
  if ((k1 & Testx) != 0 || lead < T2) {
    *rune = Bad;
    return 1;
  }

  /*
   * two bytes (11-bit value)
   *  0080-07FF => T2 Tx
   */
  if (lead < T3) {
    value = ((lead << Bitx) | k1) & Rune2;
    if (value <= Rune1) {
      *rune = Bad;
      return 1;
    }
    *rune = value;
    return 2;
  }

  /* A third byte is required from here on. */
  if (length < 3) {
    *rune = Bad;
    return 0;
  }
  k2 = in[2] ^ Tx;
  if ((k2 & Testx) != 0) {
    *rune = Bad;
    return 1;
  }

  /*
   * three bytes (16-bit value)
   *  0800-FFFF => T3 Tx Tx
   */
  if (lead < T4) {
    value = ((((lead << Bitx) | k1) << Bitx) | k2) & Rune3;
    if (value <= Rune2) {
      *rune = Bad;
      return 1;
    }
    *rune = value;
    return 3;
  }

  /* A fourth byte is required from here on. */
  if (length < 4) {
    *rune = Bad;
    return 0;
  }
  k3 = in[3] ^ Tx;
  if ((k3 & Testx) != 0) {
    *rune = Bad;
    return 1;
  }

  /*
   * four bytes (21-bit value)
   *  10000-1FFFFF => T4 Tx Tx Tx
   */
  if (lead < T5) {
    value = ((((((lead << Bitx) | k1) << Bitx) | k2) << Bitx) | k3) & Rune4;
    if (value <= Rune3) {
      *rune = Bad;
      return 1;
    }
    *rune = value;
    return 4;
  }

  /* Lead bytes announcing 5 or more bytes are not supported. */
  *rune = Bad;
  return 1;
}


/*
 * This is the older "unsafe" version, which works fine on
 * null-terminated strings.
 */
int
chartorune(Rune *rune, const char *str)
{
  /*
   * Decode one UTF-8 sequence from str with no length bound.
   *
   * Returns the number of bytes consumed (1-4) with the value in
   * *rune, or stores Bad and returns 1 when the sequence is malformed.
   * Each further byte is read only after the previous ones passed
   * their checks; a zero byte fails the continuation check, so decoding
   * stops at a NUL terminator instead of reading beyond it.
   */
  const uchar *in = (const uchar*)str;
  int lead, k1, k2, k3;
  int value;

  /*
   * one byte
   *  00000-0007F => T1
   */
  lead = in[0];
  if (lead < Tx) {
    *rune = lead;
    return 1;
  }

  /*
   * XOR with Tx leaves only payload bits for a valid continuation
   * byte.  A lead byte below T2 is itself a continuation byte.
   */
  k1 = in[1] ^ Tx;
  if ((k1 & Testx) != 0 || lead < T2) {
    *rune = Bad;
    return 1;
  }

  /*
   * two bytes
   *  0080-07FF => T2 Tx
   */
  if (lead < T3) {
    value = ((lead << Bitx) | k1) & Rune2;
    if (value <= Rune1) {
      *rune = Bad;
      return 1;
    }
    *rune = value;
    return 2;
  }

  k2 = in[2] ^ Tx;
  if ((k2 & Testx) != 0) {
    *rune = Bad;
    return 1;
  }

  /*
   * three bytes
   *  0800-FFFF => T3 Tx Tx
   */
  if (lead < T4) {
    value = ((((lead << Bitx) | k1) << Bitx) | k2) & Rune3;
    if (value <= Rune2) {
      *rune = Bad;
      return 1;
    }
    *rune = value;
    return 3;
  }

  k3 = in[3] ^ Tx;
  if ((k3 & Testx) != 0) {
    *rune = Bad;
    return 1;
  }

  /*
   * four bytes (21-bit value)
   *  10000-1FFFFF => T4 Tx Tx Tx
   */
  if (lead < T5) {
    value = ((((((lead << Bitx) | k1) << Bitx) | k2) << Bitx) | k3) & Rune4;
    if (value <= Rune3) {
      *rune = Bad;
      return 1;
    }
    *rune = value;
    return 4;
  }

  /* Lead bytes announcing 5 or more bytes are not supported. */
  *rune = Bad;
  return 1;
}

/*
 * Decode one rune with charntorune and report whether it is valid.
 *
 * *consumed receives charntorune's return value and *rune the decoded
 * value.  The result is 1 when the rune differs from Runeerror, or when
 * exactly 3 bytes were consumed (charntorune only returns 3 after a
 * successful three-byte decode, so the input really encoded Runeerror).
 * Otherwise the result is 0.
 */
int
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
  int nbytes = charntorune(rune, str, length);

  *consumed = nbytes;
  if (*rune != Runeerror)
    return 1;
  return nbytes == 3;
}

/*
 * Encode *rune as UTF-8 into str and return the number of bytes
 * written (1-4).  No terminator is appended; up to 4 bytes of str are
 * written.  A rune above Runemax is encoded as Runeerror instead.
 */
int
runetochar(char *str, const Rune *rune)
{
  /* Runes are signed; an unsigned copy makes negatives fail the range checks. */
  unsigned long cp = *rune;

  /*
   * one byte
   *  00000-0007F => 00-7F
   */
  if (cp <= Rune1) {
    str[0] = cp;
    return 1;
  }

  /*
   * two bytes
   *  0080-07FF => T2 Tx
   */
  if (cp <= Rune2) {
    str[0] = T2 | (cp >> Bitx);
    str[1] = Tx | (cp & Maskx);
    return 2;
  }

  /*
   * Out-of-range values become the error rune.  The substitution is
   * done only now because such values can never have matched the one-
   * or two-byte cases, and the error rune needs three bytes.
   */
  if (cp > Runemax)
    cp = Runeerror;

  /*
   * four bytes (21-bit value)
   *  10000-1FFFFF => T4 Tx Tx Tx
   */
  if (cp > Rune3) {
    str[0] = T4 | (cp >> 3*Bitx);
    str[1] = Tx | ((cp >> 2*Bitx) & Maskx);
    str[2] = Tx | ((cp >> Bitx) & Maskx);
    str[3] = Tx | (cp & Maskx);
    return 4;
  }

  /*
   * three bytes
   *  0800-FFFF => T3 Tx Tx
   */
  str[0] = T3 | (cp >> 2*Bitx);
  str[1] = Tx | ((cp >> Bitx) & Maskx);
  str[2] = Tx | (cp & Maskx);
  return 3;
}

/*
 * Return the number of bytes runetochar produces for rune.  The rune
 * is encoded into a throwaway buffer and only the length is kept.
 */
int
runelen(Rune rune)
{
  char scratch[10];
  int nbytes;

  nbytes = runetochar(scratch, &rune);
  return nbytes;
}

int
runenlen(const Rune *r, int nrune)
{
  int nb, c;

  nb = 0;
  while(nrune--) {
    c = *r++;
    if (c <= Rune1)
      nb++;
    else if (c <= Rune2)
      nb += 2;
    else if (c <= Rune3)
      nb += 3;
    else /* assert(c <= Rune4) */
      nb += 4;
  }
  return nb;
}

int
fullrune(const char *str, int n)
{
  if (n > 0) {
    int c = *(uchar*)str;
    if (c < Tx)
      return 1;
    if (n > 1) {
      if (c < T3)
        return 1;
      if (n > 2) {
        if (c < T4 || n > 3)
          return 1;
      }
    }
  }
  return 0;
}

YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

Line	Count	Source (jump to first uncovered line)
1		/*
2		* The authors of this software are Rob Pike and Ken Thompson.
3		* Copyright (c) 2002 by Lucent Technologies.
4		* Permission to use, copy, modify, and distribute this software for any
5		* purpose without fee is hereby granted, provided that this entire notice
6		* is included in all copies of any software which is or includes a copy
7		* or modification of this software and in all copies of the supporting
8		* documentation for such software.
9		* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10		* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11		* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12		* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13		*/
14		//
15		// The following only applies to changes made to this file as part of YugaByte development.
16		//
17		// Portions Copyright (c) YugaByte, Inc.
18		//
19		// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
20		// in compliance with the License. You may obtain a copy of the License at
21		//
22		// http://www.apache.org/licenses/LICENSE-2.0
23		//
24		// Unless required by applicable law or agreed to in writing, software distributed under the License
25		// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
26		// or implied. See the License for the specific language governing permissions and limitations
27		// under the License.
28		//
29		#include <stdarg.h>
30		#include <string.h>
31		#include "yb/gutil/utf/utf.h"
32		#include "yb/gutil/utf/utfdef.h"
33
34		enum
35		{
36		Bit1 = 7,
37		Bitx = 6,
38		Bit2 = 5,
39		Bit3 = 4,
40		Bit4 = 3,
41		Bit5 = 2,
42
43		T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
44		Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
45		T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
46		T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
47		T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
48		T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
49
50		Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
51		Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
52		Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
53		Rune4 = (1<<(Bit4+3*Bitx))-1,
54		/* 0001 1111 1111 1111 1111 1111 */
55
56		Maskx = (1<<Bitx)-1, /* 0011 1111 */
57		Testx = Maskx ^ 0xFF, /* 1100 0000 */
58
59		Bad = Runeerror,
60		};
61
62		/*
63		* Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
64		* This is a slower but "safe" version of the old chartorune
65		* that works on strings that are not necessarily null-terminated.
66		*
67		* If you know for sure that your string is null-terminated,
68		* chartorune will be a bit faster.
69		*
70		* It is guaranteed not to attempt to access "length"
71		* past the incoming pointer. This is to avoid
72		* possible access violations. If the string appears to be
73		* well-formed but incomplete (i.e., to get the whole Rune
74		* we'd need to read past str+length) then we'll set the Rune
75		* to Bad and return 0.
76		*
77		* Note that if we have decoding problems for other
78		* reasons, we return 1 instead of 0.
79		*/
80		int
81		charntorune(Rune *rune, const char *str, int length)
82	271k	{
83	271k	int c, c1, c2, c3;
84	271k	int l;
85
86		/* When we're not allowed to read anything */
87	271k	if(length <= 0) {
88	0	goto badlen;
89	0	}
90
91		/*
92		* one character sequence (7-bit value)
93		* 00000-0007F => T1
94		*/
95	271k	c = *(uchar*)str;
96	271k	if(c < Tx) {
97	271k	*rune = c;
98	271k	return 1;
99	271k	}
100
101		// If we can't read more than one character we must stop
102	22	if(length <= 1) {
103	0	goto badlen;
104	0	}
105
106		/*
107		* two character sequence (11-bit value)
108		* 0080-07FF => T2 Tx
109		*/
110	22	c1 = *(uchar*)(str+1) ^ Tx;
111	22	if(c1 & Testx)
112	0	goto bad;
113	22	if(c < T3) {
114	0	if(c < T2)
115	0	goto bad;
116	0	l = ((c << Bitx) \| c1) & Rune2;
117	0	if(l <= Rune1)
118	0	goto bad;
119	0	*rune = l;
120	0	return 2;
121	0	}
122
123		// If we can't read more than two characters we must stop
124	22	if(length <= 2) {
125	0	goto badlen;
126	0	}
127
128		/*
129		* three character sequence (16-bit value)
130		* 0800-FFFF => T3 Tx Tx
131		*/
132	22	c2 = *(uchar*)(str+2) ^ Tx;
133	22	if(c2 & Testx)
134	0	goto bad;
135	22	if(c < T4) {
136	19	l = ((((c << Bitx) \| c1) << Bitx) \| c2) & Rune3;
137	19	if(l <= Rune2)
138	0	goto bad;
139	19	*rune = l;
140	19	return 3;
141	19	}
142
143	3	if (length <= 3)
144	0	goto badlen;
145
146		/*
147		* four character sequence (21-bit value)
148		* 10000-1FFFFF => T4 Tx Tx Tx
149		*/
150	3	c3 = *(uchar*)(str+3) ^ Tx;
151	3	if (c3 & Testx)
152	0	goto bad;
153	3	if (c < T5) {
154	3	l = ((((((c << Bitx) \| c1) << Bitx) \| c2) << Bitx) \| c3) & Rune4;
155	3	if (l <= Rune3)
156	0	goto bad;
157	3	*rune = l;
158	3	return 4;
159	3	}
160
161		// Support for 5-byte or longer UTF-8 would go here, but
162		// since we don't have that, we'll just fall through to bad.
163
164		/*
165		* bad decoding
166		*/
167	0	bad:
168	0	*rune = Bad;
169	0	return 1;
170	0	badlen:
171	0	*rune = Bad;
172	0	return 0;
173
174	3	}
175
176
177		/*
178		* This is the older "unsafe" version, which works fine on
179		* null-terminated strings.
180		*/
181		int
182		chartorune(Rune *rune, const char *str)
183	0	{
184	0	int c, c1, c2, c3;
185	0	int l;
186
187		/*
188		* one character sequence
189		* 00000-0007F => T1
190		*/
191	0	c = *(uchar*)str;
192	0	if(c < Tx) {
193	0	*rune = c;
194	0	return 1;
195	0	}
196
197		/*
198		* two character sequence
199		* 0080-07FF => T2 Tx
200		*/
201	0	c1 = *(uchar*)(str+1) ^ Tx;
202	0	if(c1 & Testx)
203	0	goto bad;
204	0	if(c < T3) {
205	0	if(c < T2)
206	0	goto bad;
207	0	l = ((c << Bitx) \| c1) & Rune2;
208	0	if(l <= Rune1)
209	0	goto bad;
210	0	*rune = l;
211	0	return 2;
212	0	}
213
214		/*
215		* three character sequence
216		* 0800-FFFF => T3 Tx Tx
217		*/
218	0	c2 = *(uchar*)(str+2) ^ Tx;
219	0	if(c2 & Testx)
220	0	goto bad;
221	0	if(c < T4) {
222	0	l = ((((c << Bitx) \| c1) << Bitx) \| c2) & Rune3;
223	0	if(l <= Rune2)
224	0	goto bad;
225	0	*rune = l;
226	0	return 3;
227	0	}
228
229		/*
230		* four character sequence (21-bit value)
231		* 10000-1FFFFF => T4 Tx Tx Tx
232		*/
233	0	c3 = *(uchar*)(str+3) ^ Tx;
234	0	if (c3 & Testx)
235	0	goto bad;
236	0	if (c < T5) {
237	0	l = ((((((c << Bitx) \| c1) << Bitx) \| c2) << Bitx) \| c3) & Rune4;
238	0	if (l <= Rune3)
239	0	goto bad;
240	0	*rune = l;
241	0	return 4;
242	0	}
243
244		/*
245		* Support for 5-byte or longer UTF-8 would go here, but
246		* since we don't have that, we'll just fall through to bad.
247		*/
248
249		/*
250		* bad decoding
251		*/
252	0	bad:
253	0	*rune = Bad;
254	0	return 1;
255	0	}
256
257		int
258	0	isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
259	0	*consumed = charntorune(rune, str, length);
260	0	return *rune != Runeerror \|\| *consumed == 3;
261	0	}
262
263		int
264		runetochar(char *str, const Rune *rune)
265	0	{
266		/* Runes are signed, so convert to unsigned for range check. */
267	0	unsigned long c;
268
269		/*
270		* one character sequence
271		* 00000-0007F => 00-7F
272		*/
273	0	c = *rune;
274	0	if(c <= Rune1) {
275	0	str[0] = c;
276	0	return 1;
277	0	}
278
279		/*
280		* two character sequence
281		* 0080-07FF => T2 Tx
282		*/
283	0	if(c <= Rune2) {
284	0	str[0] = T2 \| (c >> 1*Bitx);
285	0	str[1] = Tx \| (c & Maskx);
286	0	return 2;
287	0	}
288
289		/*
290		* If the Rune is out of range, convert it to the error rune.
291		* Do this test here because the error rune encodes to three bytes.
292		* Doing it earlier would duplicate work, since an out of range
293		* Rune wouldn't have fit in one or two bytes.
294		*/
295	0	if (c > Runemax)
296	0	c = Runeerror;
297
298		/*
299		* three character sequence
300		* 0800-FFFF => T3 Tx Tx
301		*/
302	0	if (c <= Rune3) {
303	0	str[0] = T3 \| (c >> 2*Bitx);
304	0	str[1] = Tx \| ((c >> 1*Bitx) & Maskx);
305	0	str[2] = Tx \| (c & Maskx);
306	0	return 3;
307	0	}
308
309		/*
310		* four character sequence (21-bit value)
311		* 10000-1FFFFF => T4 Tx Tx Tx
312		*/
313	0	str[0] = T4 \| (c >> 3*Bitx);
314	0	str[1] = Tx \| ((c >> 2*Bitx) & Maskx);
315	0	str[2] = Tx \| ((c >> 1*Bitx) & Maskx);
316	0	str[3] = Tx \| (c & Maskx);
317	0	return 4;
318	0	}
319
320		int
321		runelen(Rune rune)
322	0	{
323	0	char str[10];
324
325	0	return runetochar(str, &rune);
326	0	}
327
328		int
329		runenlen(const Rune *r, int nrune)
330	0	{
331	0	int nb, c;
332
333	0	nb = 0;
334	0	while(nrune--) {
335	0	c = *r++;
336	0	if (c <= Rune1)
337	0	nb++;
338	0	else if (c <= Rune2)
339	0	nb += 2;
340	0	else if (c <= Rune3)
341	0	nb += 3;
342	0	else /* assert(c <= Rune4) */
343	0	nb += 4;
344	0	}
345	0	return nb;
346	0	}
347
348		int
349		fullrune(const char *str, int n)
350	0	{
351	0	if (n > 0) {
352	0	int c = *(uchar*)str;
353	0	if (c < Tx)
354	0	return 1;
355	0	if (n > 1) {
356	0	if (c < T3)
357	0	return 1;
358	0	if (n > 2) {
359	0	if (c < T4 \|\| n > 3)
360	0	return 1;
361	0	}
362	0	}
363	0	}
364	0	return 0;
365	0	}