Skip to content

Commit 9f85446

Browse files
committed
GSM encoding, ensure compliance with TS23.038
Change the mapping of Greek letters to similar-looking Latin characters so that it only occurs if errors='replace'. This should ensure that a round trip of UTF-8->GSM->UTF-8 is exact.
1 parent ca9c6f0 commit 9f85446

1 file changed

Lines changed: 51 additions & 40 deletions

File tree

messaging/sms/gsm0338.py

Lines changed: 51 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,12 @@
1616
import sys
1717
import traceback
1818

19-
QUESTION_MARK = chr(0x3f)
20-
2119
# data from
2220
# http://snoops.roy202.org/testerman/browser/trunk/plugins/codecs/gsm0338.py
2321

24-
mapping = [
22+
def_regular_mapping = [
2523
('\x00', u'\u0040'), # COMMERCIAL AT
26-
('\x00', u'\u0000'), # NULL (see note above)
24+
# ('\x00', u'\u0000'), # NULL (see note above)
2725
('\x01', u'\u00A3'), # POUND SIGN
2826
('\x02', u'\u0024'), # DOLLAR SIGN
2927
('\x03', u'\u00A5'), # YEN SIGN
@@ -32,8 +30,11 @@
3230
('\x06', u'\u00F9'), # LATIN SMALL LETTER U WITH GRAVE
3331
('\x07', u'\u00EC'), # LATIN SMALL LETTER I WITH GRAVE
3432
('\x08', u'\u00F2'), # LATIN SMALL LETTER O WITH GRAVE
35-
('\x09', u'\u00E7'), # LATIN SMALL LETTER C WITH CEDILLA
3633
('\x09', u'\u00C7'), # LATIN CAPITAL LETTER C WITH CEDILLA
34+
# The Unicode page suggests this is a mistake, but
35+
# it's still in the latest version of the spec and
36+
# our implementation has to be exact.
37+
3738
('\x0A', u'\u000A'), # LINE FEED
3839
('\x0B', u'\u00D8'), # LATIN CAPITAL LETTER O WITH STROKE
3940
('\x0C', u'\u00F8'), # LATIN SMALL LETTER O WITH STROKE
@@ -89,45 +90,31 @@
8990
('\x3F', u'\u003F'), # QUESTION MARK
9091
('\x40', u'\u00A1'), # INVERTED EXCLAMATION MARK
9192
('\x41', u'\u0041'), # LATIN CAPITAL LETTER A
92-
('\x41', u'\u0391'), # GREEK CAPITAL LETTER ALPHA
9393
('\x42', u'\u0042'), # LATIN CAPITAL LETTER B
94-
('\x42', u'\u0392'), # GREEK CAPITAL LETTER BETA
9594
('\x43', u'\u0043'), # LATIN CAPITAL LETTER C
9695
('\x44', u'\u0044'), # LATIN CAPITAL LETTER D
9796
('\x45', u'\u0045'), # LATIN CAPITAL LETTER E
98-
('\x45', u'\u0395'), # GREEK CAPITAL LETTER EPSILON
9997
('\x46', u'\u0046'), # LATIN CAPITAL LETTER F
10098
('\x47', u'\u0047'), # LATIN CAPITAL LETTER G
10199
('\x48', u'\u0048'), # LATIN CAPITAL LETTER H
102-
('\x48', u'\u0397'), # GREEK CAPITAL LETTER ETA
103100
('\x49', u'\u0049'), # LATIN CAPITAL LETTER I
104-
('\x49', u'\u0399'), # GREEK CAPITAL LETTER IOTA
105101
('\x4A', u'\u004A'), # LATIN CAPITAL LETTER J
106102
('\x4B', u'\u004B'), # LATIN CAPITAL LETTER K
107-
('\x4B', u'\u039A'), # GREEK CAPITAL LETTER KAPPA
108103
('\x4C', u'\u004C'), # LATIN CAPITAL LETTER L
109104
('\x4D', u'\u004D'), # LATIN CAPITAL LETTER M
110-
('\x4D', u'\u039C'), # GREEK CAPITAL LETTER MU
111105
('\x4E', u'\u004E'), # LATIN CAPITAL LETTER N
112-
('\x4E', u'\u039D'), # GREEK CAPITAL LETTER NU
113106
('\x4F', u'\u004F'), # LATIN CAPITAL LETTER O
114-
('\x4F', u'\u039F'), # GREEK CAPITAL LETTER OMICRON
115107
('\x50', u'\u0050'), # LATIN CAPITAL LETTER P
116-
('\x50', u'\u03A1'), # GREEK CAPITAL LETTER RHO
117108
('\x51', u'\u0051'), # LATIN CAPITAL LETTER Q
118109
('\x52', u'\u0052'), # LATIN CAPITAL LETTER R
119110
('\x53', u'\u0053'), # LATIN CAPITAL LETTER S
120111
('\x54', u'\u0054'), # LATIN CAPITAL LETTER T
121-
('\x54', u'\u03A4'), # GREEK CAPITAL LETTER TAU
122112
('\x55', u'\u0055'), # LATIN CAPITAL LETTER U
123113
('\x56', u'\u0056'), # LATIN CAPITAL LETTER V
124114
('\x57', u'\u0057'), # LATIN CAPITAL LETTER W
125115
('\x58', u'\u0058'), # LATIN CAPITAL LETTER X
126-
('\x58', u'\u03A7'), # GREEK CAPITAL LETTER CHI
127116
('\x59', u'\u0059'), # LATIN CAPITAL LETTER Y
128-
('\x59', u'\u03A5'), # GREEK CAPITAL LETTER UPSILON
129117
('\x5A', u'\u005A'), # LATIN CAPITAL LETTER Z
130-
('\x5A', u'\u0396'), # GREEK CAPITAL LETTER ZETA
131118
('\x5B', u'\u00C4'), # LATIN CAPITAL LETTER A WITH DIAERESIS
132119
('\x5C', u'\u00D6'), # LATIN CAPITAL LETTER O WITH DIAERESIS
133120
('\x5D', u'\u00D1'), # LATIN CAPITAL LETTER N WITH TILDE
@@ -168,7 +155,7 @@
168155
]
169156

170157
# Escaped characters
171-
escaped_mapping = [
158+
def_escaped_mapping = [
172159
('\x0A', u'\u000C'), # FORM FEED
173160
('\x14', u'\u005E'), # CIRCUMFLEX ACCENT
174161
('\x28', u'\u007B'), # LEFT CURLY BRACKET
@@ -181,21 +168,44 @@
181168
('\x65', u'\u20AC'), # EURO SIGN
182169
]
183170

184-
# unicode -> GSM 03.38
185-
regular_encode_dict = dict([(u, g) for g, u in mapping])
171+
# Replacement characters, default is question mark. Used when it is not too
172+
# important to ensure exact UTF-8 -> GSM -> UTF-8 equivalence, such as when
173+
# humans read and write SMS. But for USSD and other M2M applications it's
174+
# important to ensure the conversion is exact.
175+
def_replace_mapping = [
176+
('\x09', u'\u00E7'), # LATIN SMALL LETTER C WITH CEDILLA
177+
178+
('\x41', u'\u0391'), # GREEK CAPITAL LETTER ALPHA
179+
('\x42', u'\u0392'), # GREEK CAPITAL LETTER BETA
180+
('\x45', u'\u0395'), # GREEK CAPITAL LETTER EPSILON
181+
('\x48', u'\u0397'), # GREEK CAPITAL LETTER ETA
182+
('\x49', u'\u0399'), # GREEK CAPITAL LETTER IOTA
183+
('\x4B', u'\u039A'), # GREEK CAPITAL LETTER KAPPA
184+
('\x4D', u'\u039C'), # GREEK CAPITAL LETTER MU
185+
('\x4E', u'\u039D'), # GREEK CAPITAL LETTER NU
186+
('\x4F', u'\u039F'), # GREEK CAPITAL LETTER OMICRON
187+
('\x50', u'\u03A1'), # GREEK CAPITAL LETTER RHO
188+
('\x54', u'\u03A4'), # GREEK CAPITAL LETTER TAU
189+
('\x58', u'\u03A7'), # GREEK CAPITAL LETTER CHI
190+
('\x59', u'\u03A5'), # GREEK CAPITAL LETTER UPSILON
191+
('\x5A', u'\u0396'), # GREEK CAPITAL LETTER ZETA
192+
]
193+
194+
QUESTION_MARK = chr(0x3f)
195+
196+
# unicode -> default GSM 03.38
197+
def_regular_encode_dict = dict([(u, g) for g, u in def_regular_mapping])
186198

187-
# unicode -> escaped GSM 03.38 characters
188-
escape_encode_dict = dict([(u, g) for g, u in escaped_mapping])
199+
# unicode -> default escaped GSM 03.38 characters
200+
def_escape_encode_dict = dict([(u, g) for g, u in def_escaped_mapping])
189201

190-
# GSM 03.38 -> unicode
191-
# Only first corresponding unicode character is
192-
# taken into account (see 0x41, etc)
193-
regular_decode_dict = {}
194-
for g, u in mapping:
195-
if g not in regular_decode_dict:
196-
regular_decode_dict[g] = u
202+
# unicode -> default replacement characters
203+
def_replace_encode_dict = dict([(u, g) for g, u in def_replace_mapping])
197204

198-
escape_decode_dict = dict([(g, u) for g, u in escaped_mapping])
205+
# default GSM 03.38 -> unicode
206+
# Note: We've removed the duplicates to be strict TS23.038 compliant
207+
def_regular_decode_dict = dict([(g, u) for g, u in def_regular_mapping])
208+
def_escape_decode_dict = dict([(g, u) for g, u in def_escaped_mapping])
199209

200210

201211
def encode(input_, errors='strict'):
@@ -207,17 +217,18 @@ def encode(input_, errors='strict'):
207217
result = []
208218
for c in input_:
209219
try:
210-
result.append(regular_encode_dict[c])
220+
result.append(def_regular_encode_dict[c])
211221
except KeyError:
212-
if c in escape_encode_dict:
222+
if c in def_escape_encode_dict:
213223
# OK, let's encode it as an escaped character
214224
result.append('\x1b')
215-
result.append(escape_encode_dict[c])
225+
result.append(def_escape_encode_dict[c])
216226
else:
217227
if errors == 'strict':
218-
raise UnicodeError("Invalid SMS character")
228+
raise UnicodeError("Invalid GSM character")
219229
elif errors == 'replace':
220-
result.append(QUESTION_MARK)
230+
result.append(
231+
def_replace_encode_dict.get(c, QUESTION_MARK))
221232
elif errors == 'ignore':
222233
pass
223234
else:
@@ -242,16 +253,16 @@ def decode(input_, errors='strict'):
242253
if index < len(input_):
243254
c = input_[index]
244255
index += 1
245-
result.append(escape_decode_dict.get(c, u'\xa0'))
256+
result.append(def_escape_decode_dict.get(c, u'\xa0'))
246257
else:
247258
result.append(u'\xa0')
248259
else:
249260
try:
250-
result.append(regular_decode_dict[c])
261+
result.append(def_regular_decode_dict[c])
251262
except KeyError:
252263
# error handling: unassigned byte, must be > 0x7f
253264
if errors == 'strict':
254-
raise UnicodeError("Unrecognized SMS character")
265+
raise UnicodeError("Unrecognized GSM character")
255266
elif errors == 'replace':
256267
result.append('?')
257268
elif errors == 'ignore':

0 commit comments

Comments
 (0)