Skip to content

Commit 494ed05

Browse files
authored
Handle XML namespaces in worksheets (#101)
* Use local_name to ignore namespaces when enumerating a sheet's rows. * Include the XML namespace when constructing the shared strings dictionary. * Respect the configured namespace when parsing a sheet's rows: instead of using local_name, which throws away the namespace prefix, identify the configured namespace prefix (if there is one) and use that when looking for nodes in the SAX parsing loop.
1 parent f8856ee commit 494ed05

File tree

7 files changed

+178
-8
lines changed

7 files changed

+178
-8
lines changed

lib/creek/shared_strings.rb

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ module Creek
55

66
class Creek::SharedStrings
77

8+
SPREADSHEETML_URI = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
9+
810
attr_reader :book, :dictionary
911

1012
def initialize book
@@ -27,9 +29,17 @@ def parse_shared_string_from_document(xml)
2729

2830
def self.parse_shared_string_from_document(xml)
2931
dictionary = Hash.new
30-
31-
xml.css('si').each_with_index do |si, idx|
32-
text_nodes = si.css('>t, r t')
32+
namespace = xml.namespaces.detect{|_key, uri| uri == SPREADSHEETML_URI }
33+
prefix = if namespace && namespace[0].start_with?('xmlns:')
34+
namespace[0].delete_prefix('xmlns:') + '|'
35+
else
36+
''
37+
end
38+
node_selector = "#{prefix}si"
39+
text_selector = ">#{prefix}t, #{prefix}r #{prefix}t"
40+
41+
xml.css(node_selector).each_with_index do |si, idx|
42+
text_nodes = si.css(text_selector)
3343
if text_nodes.count == 1 # plain text node
3444
dictionary[idx] = Creek::Styles::Converter.unescape_string(text_nodes.first.content)
3545
else # rich text nodes with text fragments

lib/creek/sheet.rb

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ class Creek::Sheet
88
include Creek::Utils
99

1010
HEADERS_ROW_NUMBER = '1'
11+
SPREADSHEETML_URI = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
1112

1213
attr_accessor :with_headers
1314
attr_reader :book,
@@ -97,13 +98,22 @@ def rows_generator include_meta_data=false, use_simple_rows_format=false
9798
cell_type = nil
9899
cell_style_idx = nil
99100
@book.files.file.open(path) do |xml|
101+
prefix = ''
100102
Nokogiri::XML::Reader.from_io(xml).each do |node|
101-
if node.name == 'row' && node.node_type == opener
103+
if prefix.empty? && node.namespaces.any?
104+
namespace = node.namespaces.detect{|_key, uri| uri == SPREADSHEETML_URI }
105+
prefix = if namespace && namespace[0].start_with?('xmlns:')
106+
namespace[0].delete_prefix('xmlns:') + ':'
107+
else
108+
''
109+
end
110+
end
111+
if node.name == "#{prefix}row" && node.node_type == opener
102112
row = node.attributes
103113
row['cells'] = {}
104114
cells = {}
105115
y << (include_meta_data ? row : cells) if node.self_closing?
106-
elsif node.name == 'row' && node.node_type == closer
116+
elsif node.name == "#{prefix}row" && node.node_type == closer
107117
processed_cells = fill_in_empty_cells(cells, row['r'], cell, use_simple_rows_format)
108118
@headers = processed_cells if with_headers && row['r'] == HEADERS_ROW_NUMBER
109119

@@ -117,11 +127,11 @@ def rows_generator include_meta_data=false, use_simple_rows_format=false
117127

118128
row['cells'] = processed_cells
119129
y << (include_meta_data ? row : processed_cells)
120-
elsif node.name == 'c' && node.node_type == opener
130+
elsif node.name == "#{prefix}c" && node.node_type == opener
121131
cell_type = node.attributes['t']
122132
cell_style_idx = node.attributes['s']
123133
cell = node.attributes['r']
124-
elsif %w[v t].include?(node.name) && node.node_type == opener
134+
elsif ["#{prefix}v", "#{prefix}t"].include?(node.name) && node.node_type == opener
125135
unless cell.nil?
126136
node.read
127137
cells[cell] = convert(node.value, cell_type, cell_style_idx)
8.54 KB
Binary file not shown.
8.2 KB
Binary file not shown.

spec/fixtures/sst_namespaced.xml

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
2+
<sst xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="6" uniqueCount="5">
3+
<x:si>
4+
<x:t>Cell A1</x:t>
5+
</x:si>
6+
<x:si>
7+
<x:t>Cell B1</x:t>
8+
</x:si>
9+
<x:si>
10+
<x:t>My Cell</x:t>
11+
</x:si>
12+
<x:si>
13+
<x:r>
14+
<x:rPr>
15+
<x:sz val="11"/>
16+
<x:color rgb="FFFF0000"/>
17+
<x:rFont val="Calibri"/>
18+
<x:family val="2"/>
19+
<x:scheme val="minor"/>
20+
</x:rPr>
21+
<x:t>Cell</x:t>
22+
</x:r>
23+
<x:r>
24+
<x:rPr>
25+
<x:sz val="11"/>
26+
<x:color theme="1"/>
27+
<x:rFont val="Calibri"/>
28+
<x:family val="2"/>
29+
<x:scheme val="minor"/>
30+
</x:rPr>
31+
<x:t xml:space="preserve"> </x:t>
32+
</x:r>
33+
<x:r>
34+
<x:rPr>
35+
<x:b/>
36+
<x:sz val="11"/>
37+
<x:color theme="1"/>
38+
<x:rFont val="Calibri"/>
39+
<x:family val="2"/>
40+
<x:scheme val="minor"/>
41+
</x:rPr>
42+
<x:t>A2</x:t>
43+
</x:r>
44+
</x:si>
45+
<x:si>
46+
<x:r>
47+
<x:rPr>
48+
<x:sz val="11"/>
49+
<x:color rgb="FF00B0F0"/>
50+
<x:rFont val="Calibri"/>
51+
<x:family val="2"/>
52+
<x:scheme val="minor"/>
53+
</x:rPr>
54+
<x:t>Cell</x:t>
55+
</x:r>
56+
<x:r>
57+
<x:rPr>
58+
<x:sz val="11"/>
59+
<x:color theme="1"/>
60+
<x:rFont val="Calibri"/>
61+
<x:family val="2"/>
62+
<x:scheme val="minor"/>
63+
</x:rPr>
64+
<x:t xml:space="preserve"> </x:t>
65+
</x:r>
66+
<x:r>
67+
<x:rPr>
68+
<x:i/>
69+
<x:sz val="11"/>
70+
<x:color theme="1"/>
71+
<x:rFont val="Calibri"/>
72+
<x:family val="2"/>
73+
<x:scheme val="minor"/>
74+
</x:rPr>
75+
<x:t>B2</x:t>
76+
</x:r>
77+
</x:si>
78+
<x:si>
79+
<x:t>Cell with_x000D_escaped_x000D_characters</x:t>
80+
</x:si>
81+
<x:si>
82+
<x:t>吉田兼好</x:t>
83+
<x:rPh sb="0" eb="2">
84+
<x:t xml:space="preserve">ヨシダ </x:t>
85+
</x:rPh>
86+
<x:rPh sb="2" eb="4">
87+
<x:t xml:space="preserve">ケンコウ </x:t>
88+
</x:rPh>
89+
<x:phoneticPr fontId="1"/>
90+
</x:si>
91+
</x:sst>

spec/shared_string_spec.rb

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,21 @@
1717
expect(dictionary[6]).to eq('吉田兼好')
1818
end
1919

20-
end
20+
context 'when the nodes are namespaced' do
21+
it 'parses the dictionary correctly' do
22+
shared_strings_xml_file = File.open('spec/fixtures/sst_namespaced.xml')
23+
doc = Nokogiri::XML(shared_strings_xml_file)
24+
dictionary = Creek::SharedStrings.parse_shared_string_from_document(doc)
25+
26+
expect(dictionary.keys.size).to eq(7)
27+
expect(dictionary[0]).to eq('Cell A1')
28+
expect(dictionary[1]).to eq('Cell B1')
29+
expect(dictionary[2]).to eq('My Cell')
30+
expect(dictionary[3]).to eq('Cell A2')
31+
expect(dictionary[4]).to eq('Cell B2')
32+
expect(dictionary[5]).to eq("Cell with\rescaped\rcharacters")
33+
expect(dictionary[6]).to eq('吉田兼好')
34+
end
35+
36+
end
37+
end

spec/sheet_spec.rb

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,16 @@ def load_cell(rows, cell_name)
7474
expect(load_cell(rows, 'A10')).to eq(0.15)
7575
end
7676
end
77+
78+
context 'when nodes are namespaced' do
79+
let(:namespaced_book) { Creek::Book.new('spec/fixtures/sample_namespaced.xlsx') }
80+
let(:namespaced_sheet) { Creek::Sheet.new(namespaced_book, 'Sheet 1', 1, '', '', '1', sheetfile) }
81+
82+
it 'parses rows correctly' do
83+
rows = namespaced_sheet.rows.map { |r| r }
84+
expect(load_cell(rows, 'A10')).to eq(0.15)
85+
end
86+
end
7787
end
7888

7989
describe '#images_at' do
@@ -130,5 +140,37 @@ def load_cell(rows, cell_name)
130140
expect(row['HeaderC']).to eq 'value3'
131141
end
132142
end
143+
144+
context 'when nodes are namespaced' do
145+
let(:namespaced_book) { Creek::Book.new('spec/fixtures/sample-with-headers_namespaced.xlsx') }
146+
let(:sheet) { Creek::Sheet.new(namespaced_book, 'Sheet 1', 1, '', '', '1', sheetfile) }
147+
148+
it 'returns values by letters' do
149+
expect(subject['A']).to eq 'value1'
150+
expect(subject['B']).to eq 'value2'
151+
end
152+
153+
context 'when enable with_headers property' do
154+
before { sheet.with_headers = true }
155+
156+
it 'returns values by headers name' do
157+
expect(subject['HeaderA']).to eq 'value1'
158+
expect(subject['HeaderB']).to eq 'value2'
159+
expect(subject['HeaderC']).to eq 'value3'
160+
end
161+
162+
it 'returns headers correctly when called multiple times' do
163+
row = sheet.simple_rows.to_a[1]
164+
expect(row['HeaderA']).to eq 'value1'
165+
expect(row['HeaderB']).to eq 'value2'
166+
expect(row['HeaderC']).to eq 'value3'
167+
168+
row = sheet.simple_rows.to_a[1]
169+
expect(row['HeaderA']).to eq 'value1'
170+
expect(row['HeaderB']).to eq 'value2'
171+
expect(row['HeaderC']).to eq 'value3'
172+
end
173+
end
174+
end
133175
end
134176
end

0 commit comments

Comments
 (0)