33
44#include < Rcpp.h>
55#include " rapidxml.h"
6+ #include < R_ext/GraphicsDevice.h> // Rf_ucstoutf8 is exported in R_ext/GraphicsDevice.h
7+
8+ // unescape an ST_Xstring. See 22.9.2.19 [p3786]
9+ inline std::string unescape (const std::string &s) {
10+ std::string out;
11+ out.reserve (s.size ());
12+
13+ for (size_t i = 0 ; i < s.size (); i++) {
14+ if (i+6 < s.size () && s[i] == ' _' && s[i+1 ] == ' x'
15+ && isxdigit (s[i+2 ]) && isxdigit (s[i+3 ])
16+ && isxdigit (s[i+4 ]) && isxdigit (s[i+5 ]) && s[i+6 ] == ' _' ) {
17+ // extract character
18+ unsigned int ch = strtoul (&s[i+2 ], NULL , 16 );
19+ char utf8[16 ]; // 16 from definition of Rf_ucstoutf8
20+ Rf_ucstoutf8 (utf8, ch);
21+ out += utf8;
22+ i += 6 ; // skip to the final '_'
23+ } else {
24+ out.push_back (s[i]);
25+ }
26+ }
27+
28+ return out;
29+ }
630
731// Parser for <si> and <is> inlineStr tags CT_Rst [p3893]
832// returns true if a string is found, false if missing.
@@ -30,7 +54,7 @@ inline bool parseString(const rapidxml::xml_node<>* string, std::string *out) {
3054 //
3155 // We read the <t> tag, if present, first, then concatenate any <r> tags.
3256 // All Excel 2010 sheets will read correctly under this regime.
33- *out = std::string (t->value ());
57+ *out = unescape (t->value ());
3458 found = true ;
3559 }
3660 // iterate over all r elements
@@ -40,7 +64,7 @@ inline bool parseString(const rapidxml::xml_node<>* string, std::string *out) {
4064 // but MacOSX preview just ignores chunks with no t element present
4165 const rapidxml::xml_node<>* t = r->first_node (" t" );
4266 if (t != NULL ) {
43- *out += t->value ();
67+ *out += unescape ( t->value () );
4468 found = true ;
4569 }
4670 }
0 commit comments