Skip to content

Commit c58957b

Browse files
author
Jonathan Marshall
committed
Fixes reading of .xlsx files whose strings contain escaped Unicode character sequences such as _x000D_. Closes #51
1 parent 9e390ce commit c58957b

File tree

1 file changed

+26
-2
lines changed

1 file changed

+26
-2
lines changed

src/XlsxString.h

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,30 @@
33

44
#include <Rcpp.h>
55
#include "rapidxml.h"
6+
#include <R_ext/GraphicsDevice.h> // Rf_ucstoutf8 is exported in R_ext/GraphicsDevice.h
7+
8+
// unescape an ST_Xstring. See 22.9.2.19 [p3786]
9+
inline std::string unescape(const std::string &s) {
10+
std::string out;
11+
out.reserve(s.size());
12+
13+
for (size_t i = 0; i < s.size(); i++) {
14+
if (i+6 < s.size() && s[i] == '_' && s[i+1] == 'x'
15+
&& isxdigit(s[i+2]) && isxdigit(s[i+3])
16+
&& isxdigit(s[i+4]) && isxdigit(s[i+5]) && s[i+6] == '_') {
17+
// extract character
18+
unsigned int ch = strtoul(&s[i+2], NULL, 16);
19+
char utf8[16]; // 16 from definition of Rf_ucstoutf8
20+
Rf_ucstoutf8(utf8, ch);
21+
out += utf8;
22+
i += 6; // skip to the final '_'
23+
} else {
24+
out.push_back(s[i]);
25+
}
26+
}
27+
28+
return out;
29+
}
630

731
// Parser for <si> and <is> inlineStr tags CT_Rst [p3893]
832
// returns true if a string is found, false if missing.
@@ -30,7 +54,7 @@ inline bool parseString(const rapidxml::xml_node<>* string, std::string *out) {
3054
//
3155
// We read the <t> tag, if present, first, then concatenate any <r> tags.
3256
// All Excel 2010 sheets will read correctly under this regime.
33-
*out = std::string(t->value());
57+
*out = unescape(t->value());
3458
found = true;
3559
}
3660
// iterate over all r elements
@@ -40,7 +64,7 @@ inline bool parseString(const rapidxml::xml_node<>* string, std::string *out) {
4064
// but MacOSX preview just ignores chunks with no t element present
4165
const rapidxml::xml_node<>* t = r->first_node("t");
4266
if (t != NULL) {
43-
*out += t->value();
67+
*out += unescape(t->value());
4468
found = true;
4569
}
4670
}

0 commit comments

Comments
 (0)