2727#define EILSEQ ENOENT
2828#endif
2929
/*
  True when c is a UTF-8 continuation byte, i.e. has the bit pattern
  10xxxxxx (0x80..0xBF). XOR-ing with 0x80 maps exactly that range onto
  0x00..0x3F, so a single comparison is enough.

  The argument is evaluated once. It is expected to be a non-negative
  byte value; the callers visible in this file pass elements of a
  uchar buffer.

  There must be no whitespace between the macro name and the opening
  parenthesis, otherwise the preprocessor treats this as an object-like
  macro and the parameter list becomes part of the expansion.
*/
#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40)
3031
3132#define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci"
3233#define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs"
5758#define HAVE_UNIDATA
5859#endif
5960
61+
62+ #if defined(HAVE_CHARSET_utf8 ) || defined(HAVE_CHARSET_utf8mb4 )
63+
64+ static inline
65+ int my_valid_mbcharlen_utf8mb3 (const uchar * s , const uchar * e )
66+ {
67+ uchar c ;
68+
69+ DBUG_ASSERT (s < e );
70+ c = s [0 ];
71+ if (c < 0x80 )
72+ return 1 ;
73+
74+ if (c < 0xc2 )
75+ return MY_CS_ILSEQ ;
76+
77+ if (c < 0xe0 )
78+ {
79+ if (s + 2 > e ) /* We need 2 characters */
80+ return MY_CS_TOOSMALL2 ;
81+
82+ if (!(IS_CONTINUATION_BYTE (s [1 ])))
83+ return MY_CS_ILSEQ ;
84+
85+ return 2 ;
86+ }
87+
88+ DBUG_ASSERT (c < 0xf0 );
89+ if (s + 3 > e ) /* We need 3 characters */
90+ return MY_CS_TOOSMALL3 ;
91+
92+ if (!(IS_CONTINUATION_BYTE (s [1 ]) && IS_CONTINUATION_BYTE (s [2 ]) &&
93+ (c >= 0xe1 || s [1 ] >= 0xa0 )))
94+ return MY_CS_ILSEQ ;
95+
96+ return 3 ;
97+ }
98+
99+ #endif /*HAVE_CHARSET_utf8 || HAVE_CHARSET_utf8mb4*/
100+
60101#ifdef HAVE_UNIDATA
61102
62103#include "my_uctype.h"
@@ -2287,7 +2328,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
22872328 if (s + 2 > e ) /* We need 2 characters */
22882329 return MY_CS_TOOSMALL2 ;
22892330
2290- if (!((s [1 ] ^ 0x80 ) < 0x40 ))
2331+ if (!(IS_CONTINUATION_BYTE (s [1 ]) ))
22912332 return MY_CS_ILSEQ ;
22922333
22932334 * pwc = ((my_wc_t ) (c & 0x1f ) << 6 ) | (my_wc_t ) (s [1 ] ^ 0x80 );
@@ -2298,7 +2339,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
22982339 if (s + 3 > e ) /* We need 3 characters */
22992340 return MY_CS_TOOSMALL3 ;
23002341
2301- if (!((s [1 ] ^ 0x80 ) < 0x40 && (s [2 ] ^ 0x80 ) < 0x40 &&
2342+ if (!(IS_CONTINUATION_BYTE (s [1 ]) && IS_CONTINUATION_BYTE (s [2 ]) &&
23022343 (c >= 0xe1 || s [1 ] >= 0xa0 )))
23032344 return MY_CS_ILSEQ ;
23042345
@@ -2314,9 +2355,9 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
23142355 if (s + 4 > e ) /* We need 4 characters */
23152356 return MY_CS_TOOSMALL4 ;
23162357
2317- if (!((s [1 ] ^ 0x80 ) < 0x40 &&
2318- (s [2 ] ^ 0x80 ) < 0x40 &&
2319- (s [3 ] ^ 0x80 ) < 0x40 &&
2358+ if (!(IS_CONTINUATION_BYTE (s [1 ]) &&
2359+ IS_CONTINUATION_BYTE (s [2 ]) &&
2360+ IS_CONTINUATION_BYTE (s [3 ]) &&
23202361 (c >= 0xf1 || s [1 ] >= 0x90 )))
23212362 return MY_CS_ILSEQ ;
23222363
@@ -2332,10 +2373,10 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
23322373 if (s + 5 > e ) /* We need 5 characters */
23332374 return MY_CS_TOOSMALL5 ;
23342375
2335- if (!((s [1 ] ^ 0x80 ) < 0x40 &&
2336- (s [2 ] ^ 0x80 ) < 0x40 &&
2337- (s [3 ] ^ 0x80 ) < 0x40 &&
2338- (s [4 ] ^ 0x80 ) < 0x40 &&
2376+ if (!(IS_CONTINUATION_BYTE (s [1 ]) &&
2377+ IS_CONTINUATION_BYTE (s [2 ]) &&
2378+ IS_CONTINUATION_BYTE (s [3 ]) &&
2379+ IS_CONTINUATION_BYTE (s [4 ]) &&
23392380 (c >= 0xf9 || s [1 ] >= 0x88 )))
23402381 return MY_CS_ILSEQ ;
23412382
@@ -2351,11 +2392,11 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
23512392 if ( s + 6 > e ) /* We need 6 characters */
23522393 return MY_CS_TOOSMALL6 ;
23532394
2354- if (!((s [1 ] ^ 0x80 ) < 0x40 &&
2355- (s [2 ] ^ 0x80 ) < 0x40 &&
2356- (s [3 ] ^ 0x80 ) < 0x40 &&
2357- (s [4 ] ^ 0x80 ) < 0x40 &&
2358- (s [5 ] ^ 0x80 ) < 0x40 &&
2395+ if (!(IS_CONTINUATION_BYTE (s [1 ]) &&
2396+ IS_CONTINUATION_BYTE (s [2 ]) &&
2397+ IS_CONTINUATION_BYTE (s [3 ]) &&
2398+ IS_CONTINUATION_BYTE (s [4 ]) &&
2399+ IS_CONTINUATION_BYTE (s [5 ]) &&
23592400 (c >= 0xfd || s [1 ] >= 0x84 )))
23602401 return MY_CS_ILSEQ ;
23612402
@@ -2399,11 +2440,11 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
23992440 * pwc = ((my_wc_t ) (c & 0x1f ) << 6 ) | (my_wc_t ) (s [1 ] ^ 0x80 );
24002441 return 2 ;
24012442 }
2402-
2443+
24032444 if (c < 0xf0 )
24042445 {
2405- if (!((s [1 ] ^ 0x80 ) < 0x40 &&
2406- (s [2 ] ^ 0x80 ) < 0x40 &&
2446+ if (!(IS_CONTINUATION_BYTE (s [1 ]) &&
2447+ IS_CONTINUATION_BYTE (s [2 ]) &&
24072448 (c >= 0xe1 || s [1 ] >= 0xa0 )))
24082449 return MY_CS_ILSEQ ;
24092450
@@ -2892,10 +2933,90 @@ size_t my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
28922933}
28932934
28942935
2936+ static
2937+ int my_valid_mbcharlen_utf8 (CHARSET_INFO * cs __attribute__((unused )),
2938+ const uchar * s , const uchar * e )
2939+ {
2940+ uchar c ;
2941+
2942+ if (s >= e )
2943+ return MY_CS_TOOSMALL ;
2944+
2945+ c = s [0 ];
2946+ if (c < 0xf0 )
2947+ return my_valid_mbcharlen_utf8mb3 (s , e );
2948+
2949+ #ifdef UNICODE_32BIT
2950+ if (c < 0xf8 && sizeof (my_wc_t )* 8 >= 32 )
2951+ {
2952+ if (s + 4 > e ) /* We need 4 characters */
2953+ return MY_CS_TOOSMALL4 ;
2954+
2955+ if (!(IS_CONTINUATION_BYTE (s [1 ]) &&
2956+ IS_CONTINUATION_BYTE (s [2 ]) &&
2957+ IS_CONTINUATION_BYTE (s [3 ]) &&
2958+ (c >= 0xf1 || s [1 ] >= 0x90 )))
2959+ return MY_CS_ILSEQ ;
2960+
2961+ return 4 ;
2962+ }
2963+ if (c < 0xfc && sizeof (my_wc_t )* 8 >= 32 )
2964+ {
2965+ if (s + 5 > e ) /* We need 5 characters */
2966+ return MY_CS_TOOSMALL5 ;
2967+
2968+ if (!(IS_CONTINUATION_BYTE (s [1 ]) &&
2969+ IS_CONTINUATION_BYTE (s [2 ]) &&
2970+ IS_CONTINUATION_BYTE (s [3 ]) &&
2971+ IS_CONTINUATION_BYTE (s [4 ]) &&
2972+ (c >= 0xf9 || s [1 ] >= 0x88 )))
2973+ return MY_CS_ILSEQ ;
2974+
2975+ return 5 ;
2976+ }
2977+ if (c < 0xfe && sizeof (my_wc_t )* 8 >= 32 )
2978+ {
2979+ if ( s + 6 > e ) /* We need 6 characters */
2980+ return MY_CS_TOOSMALL6 ;
2981+
2982+ if (!(IS_CONTINUATION_BYTE (s [1 ]) &&
2983+ IS_CONTINUATION_BYTE (s [2 ]) &&
2984+ IS_CONTINUATION_BYTE (s [3 ]) &&
2985+ IS_CONTINUATION_BYTE (s [4 ]) &&
2986+ IS_CONTINUATION_BYTE (s [5 ]) &&
2987+ (c >= 0xfd || s [1 ] >= 0x84 )))
2988+ return MY_CS_ILSEQ ;
2989+
2990+ return 6 ;
2991+ }
2992+ #endif
2993+ return MY_CS_ILSEQ ;
2994+ }
2995+
2996+ static size_t
2997+ my_well_formed_len_utf8 (CHARSET_INFO * cs , const char * b , const char * e ,
2998+ size_t pos , int * error )
2999+ {
3000+ const char * b_start = b ;
3001+ * error = 0 ;
3002+ while (pos )
3003+ {
3004+ int mb_len ;
3005+
3006+ if ((mb_len = my_valid_mbcharlen_utf8 (cs , (uchar * ) b , (uchar * ) e )) <= 0 )
3007+ {
3008+ * error = b < e ? 1 : 0 ;
3009+ break ;
3010+ }
3011+ b += mb_len ;
3012+ pos -- ;
3013+ }
3014+ return (size_t ) (b - b_start );
3015+ }
3016+
28953017static uint my_ismbchar_utf8 (CHARSET_INFO * cs ,const char * b , const char * e )
28963018{
2897- my_wc_t wc ;
2898- int res = my_utf8_uni (cs ,& wc , (const uchar * )b , (const uchar * )e );
3019+ int res = my_valid_mbcharlen_utf8 (cs , (const uchar * )b , (const uchar * )e );
28993020 return (res > 1 ) ? res : 0 ;
29003021}
29013022
@@ -2944,7 +3065,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
29443065 my_mbcharlen_utf8 ,
29453066 my_numchars_mb ,
29463067 my_charpos_mb ,
2947- my_well_formed_len_mb ,
3068+ my_well_formed_len_utf8 ,
29483069 my_lengthsp_8bit ,
29493070 my_numcells_mb ,
29503071 my_utf8_uni ,
@@ -4714,7 +4835,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
47144835 if (s + 2 > e ) /* We need 2 characters */
47154836 return MY_CS_TOOSMALL2 ;
47164837
4717- if (!((s [1 ] ^ 0x80 ) < 0x40 ))
4838+ if (!(IS_CONTINUATION_BYTE (s [1 ]) ))
47184839 return MY_CS_ILSEQ ;
47194840
47204841 * pwc = ((my_wc_t ) (c & 0x1f ) << 6 ) | (my_wc_t ) (s [1 ] ^ 0x80 );
@@ -4725,7 +4846,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
47254846 if (s + 3 > e ) /* We need 3 characters */
47264847 return MY_CS_TOOSMALL3 ;
47274848
4728- if (!((s [1 ] ^ 0x80 ) < 0x40 && (s [2 ] ^ 0x80 ) < 0x40 &&
4849+ if (!(IS_CONTINUATION_BYTE (s [1 ]) && IS_CONTINUATION_BYTE (s [2 ]) &&
47294850 (c >= 0xe1 || s [1 ] >= 0xa0 )))
47304851 return MY_CS_ILSEQ ;
47314852
@@ -4758,9 +4879,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
47584879 [F4][80..8F][80..BF][80..BF]
47594880 */
47604881
4761- if (!((s [1 ] ^ 0x80 ) < 0x40 &&
4762- (s [2 ] ^ 0x80 ) < 0x40 &&
4763- (s [3 ] ^ 0x80 ) < 0x40 &&
4882+ if (!(IS_CONTINUATION_BYTE (s [1 ]) &&
4883+ IS_CONTINUATION_BYTE (s [2 ]) &&
4884+ IS_CONTINUATION_BYTE (s [3 ]) &&
47644885 (c >= 0xf1 || s [1 ] >= 0x90 ) &&
47654886 (c <= 0xf3 || s [1 ] <= 0x8F )))
47664887 return MY_CS_ILSEQ ;
@@ -4796,17 +4917,17 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
47964917
47974918 if (c < 0xe0 )
47984919 {
4799- if (!(( s [1 ] ^ 0x80 ) < 0x40 ))
4920+ if (!IS_CONTINUATION_BYTE ( s [1 ]))
48004921 return MY_CS_ILSEQ ;
48014922
48024923 * pwc = ((my_wc_t ) (c & 0x1f ) << 6 ) | (my_wc_t ) (s [1 ] ^ 0x80 );
48034924 return 2 ;
48044925 }
4805-
4926+
48064927 if (c < 0xf0 )
48074928 {
4808- if (!((s [1 ] ^ 0x80 ) < 0x40 &&
4809- (s [2 ] ^ 0x80 ) < 0x40 &&
4929+ if (!(IS_CONTINUATION_BYTE (s [1 ]) &&
4930+ IS_CONTINUATION_BYTE (s [2 ]) &&
48104931 (c >= 0xe1 || s [1 ] >= 0xa0 )))
48114932 return MY_CS_ILSEQ ;
48124933 * pwc = ((my_wc_t ) (c & 0x0f ) << 12 ) |
@@ -4817,9 +4938,9 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
48174938 }
48184939 else if (c < 0xf5 )
48194940 {
4820- if (!((s [1 ] ^ 0x80 ) < 0x40 &&
4821- (s [2 ] ^ 0x80 ) < 0x40 &&
4822- (s [3 ] ^ 0x80 ) < 0x40 &&
4941+ if (!(IS_CONTINUATION_BYTE (s [1 ]) &&
4942+ IS_CONTINUATION_BYTE (s [2 ]) &&
4943+ IS_CONTINUATION_BYTE (s [3 ]) &&
48234944 (c >= 0xf1 || s [1 ] >= 0x90 ) &&
48244945 (c <= 0xf3 || s [1 ] <= 0x8F )))
48254946 return MY_CS_ILSEQ ;
@@ -5308,11 +5429,84 @@ my_strnxfrmlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), size_t len)
53085429}
53095430
53105431
5432+ static int
5433+ my_valid_mbcharlen_utf8mb4 (CHARSET_INFO * cs __attribute__((unused )),
5434+ const uchar * s , const uchar * e )
5435+ {
5436+ uchar c ;
5437+
5438+ if (s >= e )
5439+ return MY_CS_TOOSMALL ;
5440+
5441+ c = s [0 ];
5442+ if (c < 0xf0 )
5443+ return my_valid_mbcharlen_utf8mb3 (s , e );
5444+
5445+ if (c < 0xf5 )
5446+ {
5447+ if (s + 4 > e ) /* We need 4 characters */
5448+ return MY_CS_TOOSMALL4 ;
5449+
5450+ /*
5451+ UTF-8 quick four-byte mask:
5452+ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5453+ Encoding allows to encode U+00010000..U+001FFFFF
5454+
5455+ The maximum character defined in the Unicode standard is U+0010FFFF.
5456+ Higher characters U+00110000..U+001FFFFF are not used.
5457+
5458+ 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
5459+ 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
5460+
5461+ Valid codes:
5462+ [F0][90..BF][80..BF][80..BF]
5463+ [F1][80..BF][80..BF][80..BF]
5464+ [F2][80..BF][80..BF][80..BF]
5465+ [F3][80..BF][80..BF][80..BF]
5466+ [F4][80..8F][80..BF][80..BF]
5467+ */
5468+
5469+ if (!(IS_CONTINUATION_BYTE (s [1 ]) &&
5470+ IS_CONTINUATION_BYTE (s [2 ]) &&
5471+ IS_CONTINUATION_BYTE (s [3 ]) &&
5472+ (c >= 0xf1 || s [1 ] >= 0x90 ) &&
5473+ (c <= 0xf3 || s [1 ] <= 0x8F )))
5474+ return MY_CS_ILSEQ ;
5475+
5476+ return 4 ;
5477+ }
5478+
5479+ return MY_CS_ILSEQ ;
5480+ }
5481+
5482+
5483+ static
5484+ size_t my_well_formed_len_utf8mb4 (CHARSET_INFO * cs ,
5485+ const char * b , const char * e ,
5486+ size_t pos , int * error )
5487+ {
5488+ const char * b_start = b ;
5489+ * error = 0 ;
5490+ while (pos )
5491+ {
5492+ int mb_len ;
5493+
5494+ if ((mb_len = my_valid_mbcharlen_utf8mb4 (cs , (uchar * ) b , (uchar * ) e )) <= 0 )
5495+ {
5496+ * error = b < e ? 1 : 0 ;
5497+ break ;
5498+ }
5499+ b += mb_len ;
5500+ pos -- ;
5501+ }
5502+ return (size_t ) (b - b_start );
5503+ }
5504+
5505+
53115506static uint
53125507my_ismbchar_utf8mb4 (CHARSET_INFO * cs , const char * b , const char * e )
53135508{
5314- my_wc_t wc ;
5315- int res = my_mb_wc_utf8mb4 (cs ,& wc , (const uchar * )b , (const uchar * )e );
5509+ int res = my_valid_mbcharlen_utf8mb4 (cs , (const uchar * )b , (const uchar * )e );
53165510 return (res > 1 ) ? res : 0 ;
53175511}
53185512
@@ -5373,7 +5567,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
53735567 my_mbcharlen_utf8mb4 ,
53745568 my_numchars_mb ,
53755569 my_charpos_mb ,
5376- my_well_formed_len_mb ,
5570+ my_well_formed_len_utf8mb4 ,
53775571 my_lengthsp_8bit ,
53785572 my_numcells_mb ,
53795573 my_mb_wc_utf8mb4 ,
0 commit comments