Skip to content

Commit 8934a80

Browse files
committed
Bug #14057034 : WASTED CPU CYCLES IN MY_UTF8_UNI WHERE
RESULTING MY_WC_T RESULT IS NOT USED Issue: the handler functions my_ismbchar_utf8 and my_well_formed_len_mb for charset utf8 call the Unicode conversion function just to validate a character and find its length. Because of this, the instructions that convert UTF-8 to Unicode are executed for no purpose. A similar issue exists with charset utf8mb4. Solution: reorganized the code so that the character-validation part of the Unicode conversion handler is extracted (duplicated) into a separate function. Hence my_ismbchar_utf8 and my_well_formed_len_mb will call the new function, which only validates and returns the length of the multi-byte (utf8) character. A similar fix is applied for charset utf8mb4.
1 parent aec0856 commit 8934a80

1 file changed

Lines changed: 229 additions & 35 deletions

File tree

strings/ctype-utf8.c

Lines changed: 229 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#define EILSEQ ENOENT
2828
#endif
2929

30+
#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40)
3031

3132
#define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci"
3233
#define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs"
@@ -57,6 +58,46 @@
5758
#define HAVE_UNIDATA
5859
#endif
5960

61+
62+
#if defined(HAVE_CHARSET_utf8) || defined(HAVE_CHARSET_utf8mb4)
63+
64+
static inline
65+
int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e)
66+
{
67+
uchar c;
68+
69+
DBUG_ASSERT(s < e);
70+
c= s[0];
71+
if (c < 0x80)
72+
return 1;
73+
74+
if (c < 0xc2)
75+
return MY_CS_ILSEQ;
76+
77+
if (c < 0xe0)
78+
{
79+
if (s+2 > e) /* We need 2 characters */
80+
return MY_CS_TOOSMALL2;
81+
82+
if (!(IS_CONTINUATION_BYTE(s[1])))
83+
return MY_CS_ILSEQ;
84+
85+
return 2;
86+
}
87+
88+
DBUG_ASSERT(c < 0xf0);
89+
if (s+3 > e) /* We need 3 characters */
90+
return MY_CS_TOOSMALL3;
91+
92+
if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
93+
(c >= 0xe1 || s[1] >= 0xa0)))
94+
return MY_CS_ILSEQ;
95+
96+
return 3;
97+
}
98+
99+
#endif /*HAVE_CHARSET_utf8 || HAVE_CHARSET_utf8mb4*/
100+
60101
#ifdef HAVE_UNIDATA
61102

62103
#include "my_uctype.h"
@@ -2287,7 +2328,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
22872328
if (s+2 > e) /* We need 2 characters */
22882329
return MY_CS_TOOSMALL2;
22892330

2290-
if (!((s[1] ^ 0x80) < 0x40))
2331+
if (!(IS_CONTINUATION_BYTE(s[1])))
22912332
return MY_CS_ILSEQ;
22922333

22932334
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@@ -2298,7 +2339,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
22982339
if (s+3 > e) /* We need 3 characters */
22992340
return MY_CS_TOOSMALL3;
23002341

2301-
if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
2342+
if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
23022343
(c >= 0xe1 || s[1] >= 0xa0)))
23032344
return MY_CS_ILSEQ;
23042345

@@ -2314,9 +2355,9 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
23142355
if (s+4 > e) /* We need 4 characters */
23152356
return MY_CS_TOOSMALL4;
23162357

2317-
if (!((s[1] ^ 0x80) < 0x40 &&
2318-
(s[2] ^ 0x80) < 0x40 &&
2319-
(s[3] ^ 0x80) < 0x40 &&
2358+
if (!(IS_CONTINUATION_BYTE(s[1]) &&
2359+
IS_CONTINUATION_BYTE(s[2]) &&
2360+
IS_CONTINUATION_BYTE(s[3]) &&
23202361
(c >= 0xf1 || s[1] >= 0x90)))
23212362
return MY_CS_ILSEQ;
23222363

@@ -2332,10 +2373,10 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
23322373
if (s+5 >e) /* We need 5 characters */
23332374
return MY_CS_TOOSMALL5;
23342375

2335-
if (!((s[1] ^ 0x80) < 0x40 &&
2336-
(s[2] ^ 0x80) < 0x40 &&
2337-
(s[3] ^ 0x80) < 0x40 &&
2338-
(s[4] ^ 0x80) < 0x40 &&
2376+
if (!(IS_CONTINUATION_BYTE(s[1]) &&
2377+
IS_CONTINUATION_BYTE(s[2]) &&
2378+
IS_CONTINUATION_BYTE(s[3]) &&
2379+
IS_CONTINUATION_BYTE(s[4]) &&
23392380
(c >= 0xf9 || s[1] >= 0x88)))
23402381
return MY_CS_ILSEQ;
23412382

@@ -2351,11 +2392,11 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
23512392
if ( s+6 >e ) /* We need 6 characters */
23522393
return MY_CS_TOOSMALL6;
23532394

2354-
if (!((s[1] ^ 0x80) < 0x40 &&
2355-
(s[2] ^ 0x80) < 0x40 &&
2356-
(s[3] ^ 0x80) < 0x40 &&
2357-
(s[4] ^ 0x80) < 0x40 &&
2358-
(s[5] ^ 0x80) < 0x40 &&
2395+
if (!(IS_CONTINUATION_BYTE(s[1]) &&
2396+
IS_CONTINUATION_BYTE(s[2]) &&
2397+
IS_CONTINUATION_BYTE(s[3]) &&
2398+
IS_CONTINUATION_BYTE(s[4]) &&
2399+
IS_CONTINUATION_BYTE(s[5]) &&
23592400
(c >= 0xfd || s[1] >= 0x84)))
23602401
return MY_CS_ILSEQ;
23612402

@@ -2399,11 +2440,11 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
23992440
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
24002441
return 2;
24012442
}
2402-
2443+
24032444
if (c < 0xf0)
24042445
{
2405-
if (!((s[1] ^ 0x80) < 0x40 &&
2406-
(s[2] ^ 0x80) < 0x40 &&
2446+
if (!(IS_CONTINUATION_BYTE(s[1]) &&
2447+
IS_CONTINUATION_BYTE(s[2]) &&
24072448
(c >= 0xe1 || s[1] >= 0xa0)))
24082449
return MY_CS_ILSEQ;
24092450

@@ -2892,10 +2933,90 @@ size_t my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
28922933
}
28932934

28942935

2936+
static
2937+
int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
2938+
const uchar *s, const uchar *e)
2939+
{
2940+
uchar c;
2941+
2942+
if (s >= e)
2943+
return MY_CS_TOOSMALL;
2944+
2945+
c= s[0];
2946+
if (c < 0xf0)
2947+
return my_valid_mbcharlen_utf8mb3(s, e);
2948+
2949+
#ifdef UNICODE_32BIT
2950+
if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32)
2951+
{
2952+
if (s+4 > e) /* We need 4 characters */
2953+
return MY_CS_TOOSMALL4;
2954+
2955+
if (!(IS_CONTINUATION_BYTE(s[1]) &&
2956+
IS_CONTINUATION_BYTE(s[2]) &&
2957+
IS_CONTINUATION_BYTE(s[3]) &&
2958+
(c >= 0xf1 || s[1] >= 0x90)))
2959+
return MY_CS_ILSEQ;
2960+
2961+
return 4;
2962+
}
2963+
if (c < 0xfc && sizeof(my_wc_t)*8 >= 32)
2964+
{
2965+
if (s+5 >e) /* We need 5 characters */
2966+
return MY_CS_TOOSMALL5;
2967+
2968+
if (!(IS_CONTINUATION_BYTE(s[1]) &&
2969+
IS_CONTINUATION_BYTE(s[2]) &&
2970+
IS_CONTINUATION_BYTE(s[3]) &&
2971+
IS_CONTINUATION_BYTE(s[4]) &&
2972+
(c >= 0xf9 || s[1] >= 0x88)))
2973+
return MY_CS_ILSEQ;
2974+
2975+
return 5;
2976+
}
2977+
if (c < 0xfe && sizeof(my_wc_t)*8 >= 32)
2978+
{
2979+
if ( s+6 >e ) /* We need 6 characters */
2980+
return MY_CS_TOOSMALL6;
2981+
2982+
if (!(IS_CONTINUATION_BYTE(s[1]) &&
2983+
IS_CONTINUATION_BYTE(s[2]) &&
2984+
IS_CONTINUATION_BYTE(s[3]) &&
2985+
IS_CONTINUATION_BYTE(s[4]) &&
2986+
IS_CONTINUATION_BYTE(s[5]) &&
2987+
(c >= 0xfd || s[1] >= 0x84)))
2988+
return MY_CS_ILSEQ;
2989+
2990+
return 6;
2991+
}
2992+
#endif
2993+
return MY_CS_ILSEQ;
2994+
}
2995+
2996+
static size_t
2997+
my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
2998+
size_t pos, int *error)
2999+
{
3000+
const char *b_start= b;
3001+
*error= 0;
3002+
while (pos)
3003+
{
3004+
int mb_len;
3005+
3006+
if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0)
3007+
{
3008+
*error= b < e ? 1 : 0;
3009+
break;
3010+
}
3011+
b+= mb_len;
3012+
pos--;
3013+
}
3014+
return (size_t) (b - b_start);
3015+
}
3016+
28953017
static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
28963018
{
2897-
my_wc_t wc;
2898-
int res= my_utf8_uni(cs,&wc, (const uchar*)b, (const uchar*)e);
3019+
int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e);
28993020
return (res>1) ? res : 0;
29003021
}
29013022

@@ -2944,7 +3065,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
29443065
my_mbcharlen_utf8,
29453066
my_numchars_mb,
29463067
my_charpos_mb,
2947-
my_well_formed_len_mb,
3068+
my_well_formed_len_utf8,
29483069
my_lengthsp_8bit,
29493070
my_numcells_mb,
29503071
my_utf8_uni,
@@ -4714,7 +4835,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
47144835
if (s + 2 > e) /* We need 2 characters */
47154836
return MY_CS_TOOSMALL2;
47164837

4717-
if (!((s[1] ^ 0x80) < 0x40))
4838+
if (!(IS_CONTINUATION_BYTE(s[1])))
47184839
return MY_CS_ILSEQ;
47194840

47204841
*pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@@ -4725,7 +4846,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
47254846
if (s + 3 > e) /* We need 3 characters */
47264847
return MY_CS_TOOSMALL3;
47274848

4728-
if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
4849+
if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
47294850
(c >= 0xe1 || s[1] >= 0xa0)))
47304851
return MY_CS_ILSEQ;
47314852

@@ -4758,9 +4879,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
47584879
[F4][80..8F][80..BF][80..BF]
47594880
*/
47604881

4761-
if (!((s[1] ^ 0x80) < 0x40 &&
4762-
(s[2] ^ 0x80) < 0x40 &&
4763-
(s[3] ^ 0x80) < 0x40 &&
4882+
if (!(IS_CONTINUATION_BYTE(s[1]) &&
4883+
IS_CONTINUATION_BYTE(s[2]) &&
4884+
IS_CONTINUATION_BYTE(s[3]) &&
47644885
(c >= 0xf1 || s[1] >= 0x90) &&
47654886
(c <= 0xf3 || s[1] <= 0x8F)))
47664887
return MY_CS_ILSEQ;
@@ -4796,17 +4917,17 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
47964917

47974918
if (c < 0xe0)
47984919
{
4799-
if (!((s[1] ^ 0x80) < 0x40))
4920+
if (!IS_CONTINUATION_BYTE(s[1]))
48004921
return MY_CS_ILSEQ;
48014922

48024923
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
48034924
return 2;
48044925
}
4805-
4926+
48064927
if (c < 0xf0)
48074928
{
4808-
if (!((s[1] ^ 0x80) < 0x40 &&
4809-
(s[2] ^ 0x80) < 0x40 &&
4929+
if (!(IS_CONTINUATION_BYTE(s[1]) &&
4930+
IS_CONTINUATION_BYTE(s[2]) &&
48104931
(c >= 0xe1 || s[1] >= 0xa0)))
48114932
return MY_CS_ILSEQ;
48124933
*pwc= ((my_wc_t) (c & 0x0f) << 12) |
@@ -4817,9 +4938,9 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
48174938
}
48184939
else if (c < 0xf5)
48194940
{
4820-
if (!((s[1] ^ 0x80) < 0x40 &&
4821-
(s[2] ^ 0x80) < 0x40 &&
4822-
(s[3] ^ 0x80) < 0x40 &&
4941+
if (!(IS_CONTINUATION_BYTE(s[1]) &&
4942+
IS_CONTINUATION_BYTE(s[2]) &&
4943+
IS_CONTINUATION_BYTE(s[3]) &&
48234944
(c >= 0xf1 || s[1] >= 0x90) &&
48244945
(c <= 0xf3 || s[1] <= 0x8F)))
48254946
return MY_CS_ILSEQ;
@@ -5308,11 +5429,84 @@ my_strnxfrmlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), size_t len)
53085429
}
53095430

53105431

5432+
static int
5433+
my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
5434+
const uchar *s, const uchar *e)
5435+
{
5436+
uchar c;
5437+
5438+
if (s >= e)
5439+
return MY_CS_TOOSMALL;
5440+
5441+
c= s[0];
5442+
if (c < 0xf0)
5443+
return my_valid_mbcharlen_utf8mb3(s, e);
5444+
5445+
if (c < 0xf5)
5446+
{
5447+
if (s + 4 > e) /* We need 4 characters */
5448+
return MY_CS_TOOSMALL4;
5449+
5450+
/*
5451+
UTF-8 quick four-byte mask:
5452+
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5453+
Encoding allows to encode U+00010000..U+001FFFFF
5454+
5455+
The maximum character defined in the Unicode standard is U+0010FFFF.
5456+
Higher characters U+00110000..U+001FFFFF are not used.
5457+
5458+
11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
5459+
11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
5460+
5461+
Valid codes:
5462+
[F0][90..BF][80..BF][80..BF]
5463+
[F1][80..BF][80..BF][80..BF]
5464+
[F2][80..BF][80..BF][80..BF]
5465+
[F3][80..BF][80..BF][80..BF]
5466+
[F4][80..8F][80..BF][80..BF]
5467+
*/
5468+
5469+
if (!(IS_CONTINUATION_BYTE(s[1]) &&
5470+
IS_CONTINUATION_BYTE(s[2]) &&
5471+
IS_CONTINUATION_BYTE(s[3]) &&
5472+
(c >= 0xf1 || s[1] >= 0x90) &&
5473+
(c <= 0xf3 || s[1] <= 0x8F)))
5474+
return MY_CS_ILSEQ;
5475+
5476+
return 4;
5477+
}
5478+
5479+
return MY_CS_ILSEQ;
5480+
}
5481+
5482+
5483+
static
5484+
size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
5485+
const char *b, const char *e,
5486+
size_t pos, int *error)
5487+
{
5488+
const char *b_start= b;
5489+
*error= 0;
5490+
while (pos)
5491+
{
5492+
int mb_len;
5493+
5494+
if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0)
5495+
{
5496+
*error= b < e ? 1 : 0;
5497+
break;
5498+
}
5499+
b+= mb_len;
5500+
pos--;
5501+
}
5502+
return (size_t) (b - b_start);
5503+
}
5504+
5505+
53115506
static uint
53125507
my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
53135508
{
5314-
my_wc_t wc;
5315-
int res= my_mb_wc_utf8mb4(cs,&wc, (const uchar*)b, (const uchar*)e);
5509+
int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e);
53165510
return (res > 1) ? res : 0;
53175511
}
53185512

@@ -5373,7 +5567,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
53735567
my_mbcharlen_utf8mb4,
53745568
my_numchars_mb,
53755569
my_charpos_mb,
5376-
my_well_formed_len_mb,
5570+
my_well_formed_len_utf8mb4,
53775571
my_lengthsp_8bit,
53785572
my_numcells_mb,
53795573
my_mb_wc_utf8mb4,

0 commit comments

Comments
 (0)