Skip to content

Commit 40c4bee

Browse files
committed
string_decoder: added support for UTF-16LE
Fixes nodejs#3223.
1 parent 5871c81 commit 40c4bee

File tree

2 files changed

+99
-40
lines changed

2 files changed

+99
-40
lines changed

lib/string_decoder.js

Lines changed: 63 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -21,22 +21,32 @@
2121

2222
var StringDecoder = exports.StringDecoder = function(encoding) {
2323
this.encoding = (encoding || 'utf8').toLowerCase().replace(/[-_]/, '');
24-
if (this.encoding === 'utf8') {
25-
this.charBuffer = new Buffer(6);
26-
this.charReceived = 0;
27-
this.charLength = 0;
24+
switch (this.encoding) {
25+
case 'utf8':
26+
// CESU-8 encodes each half of a surrogate pair as 3 bytes
27+
this.surrogateSize = 3;
28+
break;
29+
case 'ucs2':
30+
case 'utf16le':
31+
// UTF-16 encodes each half of a surrogate pair as 2 bytes
32+
this.surrogateSize = 2;
33+
this.detectIncompleteChar = utf16DetectIncompleteChar;
34+
break;
35+
default:
36+
this.write = passThroughWrite;
37+
return;
2838
}
39+
40+
this.charBuffer = new Buffer(6);
41+
this.charReceived = 0;
42+
this.charLength = 0;
2943
};
3044

3145

3246
StringDecoder.prototype.write = function(buffer) {
33-
// If not utf8...
34-
if (this.encoding !== 'utf8') {
35-
return buffer.toString(this.encoding);
36-
}
37-
3847
var charStr = '';
3948
var offset = 0;
49+
4050
// if our last write ended with an incomplete multibyte character
4151
while (this.charLength) {
4252
// determine how many remaining bytes this buffer has to offer for this char
@@ -55,16 +65,14 @@ StringDecoder.prototype.write = function(buffer) {
5565
}
5666

5767
// get the character that was split
58-
charStr = this.charBuffer.slice(0, this.charLength).toString();
68+
charStr = this.charBuffer.slice(0, this.charLength).toString(this.encoding);
5969

6070
// a lead surrogate (D800-DBFF) without its trail surrogate is also an incomplete character
61-
if (this.charLength === 3) {
62-
var charCode = charStr.charCodeAt(0);
63-
if (charCode >= 0xD800 && charCode <= 0xDBFF) {
64-
charStr = '';
65-
this.charLength += 3; // size of trail surrogate (DC00-DFFF)
66-
continue;
67-
}
71+
var charCode = charStr.charCodeAt(charStr.length - 1);
72+
if (charCode >= 0xD800 && charCode <= 0xDBFF) {
73+
this.charLength += this.surrogateSize;
74+
charStr = '';
75+
continue;
6876
}
6977
this.charReceived = this.charLength = 0;
7078

@@ -76,7 +84,35 @@ StringDecoder.prototype.write = function(buffer) {
7684
break;
7785
}
7886

87+
var lenIncomplete = this.detectIncompleteChar(buffer);
88+
89+
var end = buffer.length;
90+
if (this.charLength) {
91+
// buffer the incomplete character bytes we got
92+
buffer.copy(this.charBuffer, 0, buffer.length - lenIncomplete, end);
93+
this.charReceived = lenIncomplete;
94+
end -= lenIncomplete;
95+
}
96+
97+
charStr += buffer.toString(this.encoding, 0, end);
98+
99+
var end = charStr.length - 1;
100+
var charCode = charStr.charCodeAt(end);
101+
// a lead surrogate (D800-DBFF) without its trail surrogate is also an incomplete character
102+
if (charCode >= 0xD800 && charCode <= 0xDBFF) {
103+
var size = this.surrogateSize;
104+
this.charLength += size;
105+
this.charReceived += size;
106+
this.charBuffer.copy(this.charBuffer, size, 0, size);
107+
this.charBuffer.write(charStr.charAt(charStr.length - 1), this.encoding);
108+
return charStr.substring(0, end);
109+
}
110+
111+
// or just emit the charStr
112+
return charStr;
113+
};
79114

115+
StringDecoder.prototype.detectIncompleteChar = function(buffer) {
80116
// determine how many bytes we have to check at the end of this buffer
81117
var i = (buffer.length >= 3) ? 3 : buffer.length;
82118

@@ -106,28 +142,15 @@ StringDecoder.prototype.write = function(buffer) {
106142
}
107143
}
108144

109-
var end = buffer.length;
110-
if (this.charLength) {
111-
// buffer the incomplete character bytes we got
112-
buffer.copy(this.charBuffer, 0, buffer.length - i, buffer.length);
113-
this.charReceived = i;
114-
end -= i;
115-
}
116-
117-
charStr += buffer.toString('utf8', 0, end);
145+
return i;
146+
};
118147

119-
// lead surrogate (D800-DBFF) is also the incomplete character
120-
end = charStr.length - 1;
121-
var charCode = charStr.charCodeAt(end);
122-
if (charCode >= 0xD800 && charCode <= 0xDBFF) {
123-
// CESU-8 represents each of Surrogate Pair by 3-bytes
124-
this.charLength += 3
125-
this.charReceived += 3
126-
this.charBuffer.copy(this.charBuffer, 3, 0, 3);
127-
this.charBuffer.write(charStr.charAt(end));
128-
return charStr.substring(0, end);
129-
}
148+
// Replacement for write() that the StringDecoder constructor installs on the
// instance for every encoding other than utf8, ucs2 and utf16le. It decodes
// the whole buffer in one step using this.encoding and reads or writes no
// other decoder state.
function passThroughWrite(buffer) {
149+
return buffer.toString(this.encoding);
150+
}
130151

131-
// or just emit the charStr
132-
return charStr;
133-
};
152+
// detectIncompleteChar() variant that the constructor installs for ucs2 and
// utf16le. Code units are 2 bytes wide, so an odd-length buffer ends with one
// dangling byte. Side effects: sets this.charReceived to the number of
// dangling bytes (0 or 1) and this.charLength to 2 when a code unit is
// incomplete, otherwise 0. Returns the number of trailing bytes (0 or 1) that
// the caller should hold back from decoding.
function utf16DetectIncompleteChar(buffer) {
153+
var incomplete = this.charReceived = buffer.length % 2;
154+
this.charLength = incomplete ? 2 : 0;
155+
return incomplete;
156+
}

test/simple/test-string-decoder.js

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,42 @@ s += decoder.write(buffer.slice(0, 6));
8989
assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16)
9090

9191

92+
// UCS-2
93+
decoder = new StringDecoder('ucs2');
94+
buffer = new Buffer('ab', 'ucs2');
95+
assert.equal(decoder.write(buffer), 'ab'); // 2 complete chars
96+
buffer = new Buffer('abc', 'ucs2');
97+
assert.equal(decoder.write(buffer.slice(0, 3)), 'a'); // 'a' and first of 'b'
98+
assert.equal(decoder.write(buffer.slice(3, 6)), 'bc'); // second of 'b' and 'c'
99+
100+
101+
// UTF-16LE
102+
buffer = new Buffer('3DD84DDC', 'hex'); // THUMBS UP SIGN (in UTF-16LE)
103+
var s = '';
104+
s += decoder.write(buffer.slice(0, 1));
105+
s += decoder.write(buffer.slice(1, 2)); // complete lead surrogate
106+
assert.equal(s, '');
107+
s += decoder.write(buffer.slice(2, 3));
108+
s += decoder.write(buffer.slice(3, 4)); // complete trail surrogate
109+
assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16)
110+
111+
var s = '';
112+
s += decoder.write(buffer.slice(0, 2)); // complete lead surrogate
113+
assert.equal(s, '');
114+
s += decoder.write(buffer.slice(2, 4)); // complete trail surrogate
115+
assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16)
116+
117+
var s = '';
118+
s += decoder.write(buffer.slice(0, 3)); // lead surrogate plus first byte of trail surrogate
119+
assert.equal(s, '');
120+
s += decoder.write(buffer.slice(3, 4)); // last byte completes the trail surrogate
121+
assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16)
122+
123+
var s = '';
124+
s += decoder.write(buffer.slice(0, 4));
125+
assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16)
126+
127+
92128
// A mixed ascii and non-ascii string
93129
// Test stolen from deps/v8/test/cctest/test-strings.cc
94130
// U+02E4 -> CB A4

0 commit comments

Comments
 (0)