1616import sys
1717import traceback
1818
19- QUESTION_MARK = chr (0x3f )
20-
2119# data from
2220# http://snoops.roy202.org/testerman/browser/trunk/plugins/codecs/gsm0338.py
2321
24- mapping = [
22+ def_regular_mapping = [
2523 ('\x00 ' , u'\u0040 ' ), # COMMERCIAL AT
26- ('\x00 ' , u'\u0000 ' ), # NULL (see note above)
24+ # ('\x00', u'\u0000'), # NULL (see note above)
2725 ('\x01 ' , u'\u00A3 ' ), # POUND SIGN
2826 ('\x02 ' , u'\u0024 ' ), # DOLLAR SIGN
2927 ('\x03 ' , u'\u00A5 ' ), # YEN SIGN
3230 ('\x06 ' , u'\u00F9 ' ), # LATIN SMALL LETTER U WITH GRAVE
3331 ('\x07 ' , u'\u00EC ' ), # LATIN SMALL LETTER I WITH GRAVE
3432 ('\x08 ' , u'\u00F2 ' ), # LATIN SMALL LETTER O WITH GRAVE
35- ('\x09 ' , u'\u00E7 ' ), # LATIN SMALL LETTER C WITH CEDILLA
3633 ('\x09 ' , u'\u00C7 ' ), # LATIN CAPITAL LETTER C WITH CEDILLA
34+ # The Unicode page suggests this is a mistake, but
35+ # it's still in the latest version of the spec and
36+ # our implementation has to be exact.
37+
3738 ('\x0A ' , u'\u000A ' ), # LINE FEED
3839 ('\x0B ' , u'\u00D8 ' ), # LATIN CAPITAL LETTER O WITH STROKE
3940 ('\x0C ' , u'\u00F8 ' ), # LATIN SMALL LETTER O WITH STROKE
8990 ('\x3F ' , u'\u003F ' ), # QUESTION MARK
9091 ('\x40 ' , u'\u00A1 ' ), # INVERTED EXCLAMATION MARK
9192 ('\x41 ' , u'\u0041 ' ), # LATIN CAPITAL LETTER A
92- ('\x41 ' , u'\u0391 ' ), # GREEK CAPITAL LETTER ALPHA
9393 ('\x42 ' , u'\u0042 ' ), # LATIN CAPITAL LETTER B
94- ('\x42 ' , u'\u0392 ' ), # GREEK CAPITAL LETTER BETA
9594 ('\x43 ' , u'\u0043 ' ), # LATIN CAPITAL LETTER C
9695 ('\x44 ' , u'\u0044 ' ), # LATIN CAPITAL LETTER D
9796 ('\x45 ' , u'\u0045 ' ), # LATIN CAPITAL LETTER E
98- ('\x45 ' , u'\u0395 ' ), # GREEK CAPITAL LETTER EPSILON
9997 ('\x46 ' , u'\u0046 ' ), # LATIN CAPITAL LETTER F
10098 ('\x47 ' , u'\u0047 ' ), # LATIN CAPITAL LETTER G
10199 ('\x48 ' , u'\u0048 ' ), # LATIN CAPITAL LETTER H
102- ('\x48 ' , u'\u0397 ' ), # GREEK CAPITAL LETTER ETA
103100 ('\x49 ' , u'\u0049 ' ), # LATIN CAPITAL LETTER I
104- ('\x49 ' , u'\u0399 ' ), # GREEK CAPITAL LETTER IOTA
105101 ('\x4A ' , u'\u004A ' ), # LATIN CAPITAL LETTER J
106102 ('\x4B ' , u'\u004B ' ), # LATIN CAPITAL LETTER K
107- ('\x4B ' , u'\u039A ' ), # GREEK CAPITAL LETTER KAPPA
108103 ('\x4C ' , u'\u004C ' ), # LATIN CAPITAL LETTER L
109104 ('\x4D ' , u'\u004D ' ), # LATIN CAPITAL LETTER M
110- ('\x4D ' , u'\u039C ' ), # GREEK CAPITAL LETTER MU
111105 ('\x4E ' , u'\u004E ' ), # LATIN CAPITAL LETTER N
112- ('\x4E ' , u'\u039D ' ), # GREEK CAPITAL LETTER NU
113106 ('\x4F ' , u'\u004F ' ), # LATIN CAPITAL LETTER O
114- ('\x4F ' , u'\u039F ' ), # GREEK CAPITAL LETTER OMICRON
115107 ('\x50 ' , u'\u0050 ' ), # LATIN CAPITAL LETTER P
116- ('\x50 ' , u'\u03A1 ' ), # GREEK CAPITAL LETTER RHO
117108 ('\x51 ' , u'\u0051 ' ), # LATIN CAPITAL LETTER Q
118109 ('\x52 ' , u'\u0052 ' ), # LATIN CAPITAL LETTER R
119110 ('\x53 ' , u'\u0053 ' ), # LATIN CAPITAL LETTER S
120111 ('\x54 ' , u'\u0054 ' ), # LATIN CAPITAL LETTER T
121- ('\x54 ' , u'\u03A4 ' ), # GREEK CAPITAL LETTER TAU
122112 ('\x55 ' , u'\u0055 ' ), # LATIN CAPITAL LETTER U
123113 ('\x56 ' , u'\u0056 ' ), # LATIN CAPITAL LETTER V
124114 ('\x57 ' , u'\u0057 ' ), # LATIN CAPITAL LETTER W
125115 ('\x58 ' , u'\u0058 ' ), # LATIN CAPITAL LETTER X
126- ('\x58 ' , u'\u03A7 ' ), # GREEK CAPITAL LETTER CHI
127116 ('\x59 ' , u'\u0059 ' ), # LATIN CAPITAL LETTER Y
128- ('\x59 ' , u'\u03A5 ' ), # GREEK CAPITAL LETTER UPSILON
129117 ('\x5A ' , u'\u005A ' ), # LATIN CAPITAL LETTER Z
130- ('\x5A ' , u'\u0396 ' ), # GREEK CAPITAL LETTER ZETA
131118 ('\x5B ' , u'\u00C4 ' ), # LATIN CAPITAL LETTER A WITH DIAERESIS
132119 ('\x5C ' , u'\u00D6 ' ), # LATIN CAPITAL LETTER O WITH DIAERESIS
133120 ('\x5D ' , u'\u00D1 ' ), # LATIN CAPITAL LETTER N WITH TILDE
168155]
169156
170157# Escaped characters
171- escaped_mapping = [
158+ def_escaped_mapping = [
172159 ('\x0A ' , u'\u000C ' ), # FORM FEED
173160 ('\x14 ' , u'\u005E ' ), # CIRCUMFLEX ACCENT
174161 ('\x28 ' , u'\u007B ' ), # LEFT CURLY BRACKET
181168 ('\x65 ' , u'\u20AC ' ), # EURO SIGN
182169]
183170
184- # unicode -> GSM 03.38
185- regular_encode_dict = dict ([(u , g ) for g , u in mapping ])
171+ # Replacement characters, default is question mark. Used when it is not too
172+ # important to ensure exact UTF-8 -> GSM -> UTF-8 equivalence, such as when
173+ # humans read and write SMS. But for USSD and other M2M applications it's
174+ # important to ensure the conversion is exact.
175+ def_replace_mapping = [
176+ ('\x09 ' , u'\u00E7 ' ), # LATIN SMALL LETTER C WITH CEDILLA
177+
178+ ('\x41 ' , u'\u0391 ' ), # GREEK CAPITAL LETTER ALPHA
179+ ('\x42 ' , u'\u0392 ' ), # GREEK CAPITAL LETTER BETA
180+ ('\x45 ' , u'\u0395 ' ), # GREEK CAPITAL LETTER EPSILON
181+ ('\x48 ' , u'\u0397 ' ), # GREEK CAPITAL LETTER ETA
182+ ('\x49 ' , u'\u0399 ' ), # GREEK CAPITAL LETTER IOTA
183+ ('\x4B ' , u'\u039A ' ), # GREEK CAPITAL LETTER KAPPA
184+ ('\x4D ' , u'\u039C ' ), # GREEK CAPITAL LETTER MU
185+ ('\x4E ' , u'\u039D ' ), # GREEK CAPITAL LETTER NU
186+ ('\x4F ' , u'\u039F ' ), # GREEK CAPITAL LETTER OMICRON
187+ ('\x50 ' , u'\u03A1 ' ), # GREEK CAPITAL LETTER RHO
188+ ('\x54 ' , u'\u03A4 ' ), # GREEK CAPITAL LETTER TAU
189+ ('\x58 ' , u'\u03A7 ' ), # GREEK CAPITAL LETTER CHI
190+ ('\x59 ' , u'\u03A5 ' ), # GREEK CAPITAL LETTER UPSILON
191+ ('\x5A ' , u'\u0396 ' ), # GREEK CAPITAL LETTER ZETA
192+ ]
193+
194+ QUESTION_MARK = chr (0x3f )
195+
196+ # unicode -> default GSM 03.38
197+ def_regular_encode_dict = dict ([(u , g ) for g , u in def_regular_mapping ])
186198
187- # unicode -> escaped GSM 03.38 characters
188- escape_encode_dict = dict ([(u , g ) for g , u in escaped_mapping ])
199+ # unicode -> default escaped GSM 03.38 characters
200+ def_escape_encode_dict = dict ([(u , g ) for g , u in def_escaped_mapping ])
189201
190- # GSM 03.38 -> unicode
191- # Only first corresponding unicode character is
192- # taken into account (see 0x41, etc)
193- regular_decode_dict = {}
194- for g , u in mapping :
195- if g not in regular_decode_dict :
196- regular_decode_dict [g ] = u
202+ # unicode -> default replacement characters
203+ def_replace_encode_dict = dict ([(u , g ) for g , u in def_replace_mapping ])
197204
198- escape_decode_dict = dict ([(g , u ) for g , u in escaped_mapping ])
205+ # default GSM 03.38 -> unicode
206+ # Note: We've removed the duplicates to be strictly TS 23.038 compliant
207+ def_regular_decode_dict = dict ([(g , u ) for g , u in def_regular_mapping ])
208+ def_escape_decode_dict = dict ([(g , u ) for g , u in def_escaped_mapping ])
199209
200210
201211def encode (input_ , errors = 'strict' ):
@@ -207,17 +217,18 @@ def encode(input_, errors='strict'):
207217 result = []
208218 for c in input_ :
209219 try :
210- result .append (regular_encode_dict [c ])
220+ result .append (def_regular_encode_dict [c ])
211221 except KeyError :
212- if c in escape_encode_dict :
222+ if c in def_escape_encode_dict :
213223 # OK, let's encode it as an escaped character
214224 result .append ('\x1b ' )
215- result .append (escape_encode_dict [c ])
225+ result .append (def_escape_encode_dict [c ])
216226 else :
217227 if errors == 'strict' :
218- raise UnicodeError ("Invalid SMS character" )
228+ raise UnicodeError ("Invalid GSM character" )
219229 elif errors == 'replace' :
220- result .append (QUESTION_MARK )
230+ result .append (
231+ def_replace_encode_dict .get (c , QUESTION_MARK ))
221232 elif errors == 'ignore' :
222233 pass
223234 else :
@@ -242,16 +253,16 @@ def decode(input_, errors='strict'):
242253 if index < len (input_ ):
243254 c = input_ [index ]
244255 index += 1
245- result .append (escape_decode_dict .get (c , u'\xa0 ' ))
256+ result .append (def_escape_decode_dict .get (c , u'\xa0 ' ))
246257 else :
247258 result .append (u'\xa0 ' )
248259 else :
249260 try :
250- result .append (regular_decode_dict [c ])
261+ result .append (def_regular_decode_dict [c ])
251262 except KeyError :
252263 # error handling: unassigned byte, must be > 0x7f
253264 if errors == 'strict' :
254- raise UnicodeError ("Unrecognized SMS character" )
265+ raise UnicodeError ("Unrecognized GSM character" )
255266 elif errors == 'replace' :
256267 result .append ('?' )
257268 elif errors == 'ignore' :
0 commit comments