Skip to content

Commit 97a34c2

Browse files
committed
Merge pull request #5051
aedc74d contrib: make linearize-data.py cope with out-of-order blocks (Wladimir J. van der Laan)
2 parents 6860a55 + aedc74d commit 97a34c2

File tree

2 files changed

+162
-98
lines changed

2 files changed

+162
-98
lines changed

contrib/linearize/example-linearize.cfg

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,5 @@ output_file=/home/example/Downloads/bootstrap.dat
1515
hashlist=hashlist.txt
1616
split_year=1
1717

18+
# Maximum size in bytes of the in-memory cache for out-of-order blocks
19+
out_of_order_cache_sz = 100000000

contrib/linearize/linearize-data.py

Lines changed: 160 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22
#
33
# linearize-data.py: Construct a linear, no-fork version of the chain.
44
#
5-
# Copyright (c) 2013 The Bitcoin developers
5+
# Copyright (c) 2013-2014 The Bitcoin developers
66
# Distributed under the MIT/X11 software license, see the accompanying
77
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
88
#
99

10+
from __future__ import print_function, division
1011
import json
1112
import struct
1213
import re
@@ -17,10 +18,10 @@
1718
import hashlib
1819
import datetime
1920
import time
21+
from collections import namedtuple
2022

2123
settings = {}
2224

23-
2425
def uint32(x):
    """Return ``x`` truncated to an unsigned 32-bit integer.

    Only the low 32 bits of ``x`` are kept, so negative inputs wrap around
    (``uint32(-1) == 0xffffffff``).
    """
    # The plain literal (no Python 2-only ``L`` suffix) yields the same value
    # on Python 2, which promotes to long automatically, and also parses on
    # Python 3.
    return x & 0xffffffff
2627

@@ -78,116 +79,174 @@ def get_block_hashes(settings):
7879

7980
return blkindex
8081

81-
def mkblockset(blkindex):
82+
def mkblockmap(blkindex):
    """Build a lookup from block hash to block height.

    ``blkindex`` is an ordered sequence of block hashes; the height of a hash
    is its position in that sequence.  If a hash appears more than once, the
    last position wins.

    Returns a dict mapping each hash to its height.
    """
    # ``blk_hash`` rather than ``hash`` so the builtin is not shadowed.
    return {blk_hash: height for height, blk_hash in enumerate(blkindex)}
8687

87-
def copydata(settings, blkindex, blkset):
88-
inFn = 0
89-
inF = None
90-
outFn = 0
91-
outsz = 0
92-
outF = None
93-
outFname = None
94-
blkCount = 0
95-
96-
lastDate = datetime.datetime(2000, 1, 1)
97-
highTS = 1408893517 - 315360000
98-
timestampSplit = False
99-
fileOutput = True
100-
setFileTime = False
101-
maxOutSz = settings['max_out_sz']
102-
if 'output' in settings:
103-
fileOutput = False
104-
if settings['file_timestamp'] != 0:
105-
setFileTime = True
106-
if settings['split_timestamp'] != 0:
107-
timestampSplit = True
108-
109-
while True:
110-
if not inF:
111-
fname = "%s/blk%05d.dat" % (settings['input'], inFn)
112-
print("Input file" + fname)
113-
try:
114-
inF = open(fname, "rb")
115-
except IOError:
116-
print "Done"
117-
return
118-
119-
inhdr = inF.read(8)
120-
if (not inhdr or (inhdr[0] == "\0")):
121-
inF.close()
122-
inF = None
123-
inFn = inFn + 1
124-
continue
125-
126-
inMagic = inhdr[:4]
127-
if (inMagic != settings['netmagic']):
128-
print("Invalid magic:" + inMagic)
129-
return
130-
inLenLE = inhdr[4:]
131-
su = struct.unpack("<I", inLenLE)
132-
inLen = su[0]
133-
rawblock = inF.read(inLen)
134-
blk_hdr = rawblock[:80]
135-
136-
hash_str = calc_hash_str(blk_hdr)
137-
if not hash_str in blkset:
138-
print("Skipping unknown block " + hash_str)
139-
continue
140-
141-
if blkindex[blkCount] != hash_str:
142-
print("Out of order block.")
143-
print("Expected " + blkindex[blkCount])
144-
print("Got " + hash_str)
145-
sys.exit(1)
146-
147-
if not fileOutput and ((outsz + inLen) > maxOutSz):
148-
outF.close()
149-
if setFileTime:
88+
# Block header and extent on disk
89+
# Fields, as populated by BlockDataCopier.run():
#   fn     - number of the input blk file the block was found in
#   offset - file position just after the 80-byte block header, i.e. where
#            the remaining block data starts
#   inhdr  - the 8 bytes read before the block (4-byte magic + 4-byte length)
#   blkhdr - the 80-byte block header
#   size   - length of the block data without the 80-byte header
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
90+
91+
class BlockDataCopier:
    """Copy blocks from the input blk*.dat files to the output in height order.

    Input files are scanned sequentially.  A block whose height is the next
    one to be written is copied straight through; a block that arrives early
    is remembered by its on-disk extent (and, while the cache has room, by
    its data) and written as soon as all of its predecessors are out.

    Parameters:
        settings -- dict of configuration values.  Keys read here:
                    'input', 'output' or 'output_file', 'max_out_sz',
                    'file_timestamp', 'split_timestamp', 'netmagic' and
                    'out_of_order_cache_sz'.
        blkindex -- ordered sequence of block hashes (position == height).
        blkmap   -- dict mapping block hash to height.
    """

    def __init__(self, settings, blkindex, blkmap):
        self.settings = settings
        self.blkindex = blkindex
        self.blkmap = blkmap

        self.inFn = 0             # number of the input file being read
        self.inF = None           # currently open input file, if any
        self.outFn = 0            # number of the output file being written
        self.outsz = 0            # bytes written to the current output file
        self.outF = None          # currently open output file, if any
        self.outFname = None      # name of the currently open output file
        self.blkCountIn = 0       # known blocks seen in the input so far
        self.blkCountOut = 0      # blocks written; also the next height due

        self.lastDate = datetime.datetime(2000, 1, 1)
        # NOTE(review): magic constant; 315360000 s is 3650 days.  It is used
        # as the starting value for the highest block timestamp seen, which is
        # applied as the mtime of finished output files -- confirm intent.
        self.highTS = 1408893517 - 315360000
        self.timestampSplit = False
        self.fileOutput = True    # True: single 'output_file'; False: 'output' dir
        self.setFileTime = False
        self.maxOutSz = settings['max_out_sz']
        if 'output' in settings:
            self.fileOutput = False
        if settings['file_timestamp'] != 0:
            self.setFileTime = True
        if settings['split_timestamp'] != 0:
            self.timestampSplit = True
        # Extents and cache for out-of-order blocks, both keyed by height
        self.blockExtents = {}
        self.outOfOrderData = {}
        self.outOfOrderSize = 0   # running total size for items in outOfOrderData

    def _rollOutputFile(self):
        """Finish the open output file and advance to the next file number."""
        self.outF.close()
        if self.setFileTime:
            os.utime(self.outFname, (int(time.time()), self.highTS))
        self.outF = None
        self.outFname = None
        self.outFn = self.outFn + 1
        self.outsz = 0

    def _closeFiles(self):
        """Close any input/output file that is still open."""
        if self.inF:
            self.inF.close()
            self.inF = None
        if self.outF:
            self.outF.close()
            self.outF = None

    def writeBlock(self, inhdr, blk_hdr, rawblock):
        """Append one block to the output, starting a new file when needed.

        Parameters:
            inhdr    -- the 8-byte magic + length prefix read from the input.
            blk_hdr  -- the 80-byte block header.
            rawblock -- the block data following the header.

        A new output file is started when writing to a directory and the
        block would push the current file past ``max_out_sz``, or when
        timestamp splitting is enabled and the block's date is later than
        any seen before.
        """
        blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)

        # Only roll over when a file is actually open, so a block larger than
        # max_out_sz is written to a fresh file instead of crashing.
        if (not self.fileOutput and self.outF
                and (self.outsz + blockSizeOnDisk) > self.maxOutSz):
            self._rollOutputFile()

        (blkDate, blkTS) = get_blk_dt(blk_hdr)
        if self.timestampSplit and (blkDate > self.lastDate):
            print("New month " + blkDate.strftime("%Y-%m") + " @ " + calc_hash_str(blk_hdr))
            self.lastDate = blkDate
            if self.outF:
                self._rollOutputFile()

        if not self.outF:
            if self.fileOutput:
                self.outFname = self.settings['output_file']
            else:
                self.outFname = "%s/blk%05d.dat" % (self.settings['output'], self.outFn)
            print("Output file" + self.outFname)
            self.outF = open(self.outFname, "wb")

        self.outF.write(inhdr)
        self.outF.write(blk_hdr)
        self.outF.write(rawblock)
        self.outsz = self.outsz + blockSizeOnDisk

        self.blkCountOut = self.blkCountOut + 1
        if blkTS > self.highTS:
            self.highTS = blkTS

        if (self.blkCountOut % 1000) == 0:
            print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
                  (self.blkCountIn, self.blkCountOut, len(self.blkindex),
                   100.0 * self.blkCountOut / len(self.blkindex)))

    def inFileName(self, fn):
        """Return the path of input block file number ``fn``."""
        return "%s/blk%05d.dat" % (self.settings['input'], fn)

    def fetchBlock(self, extent):
        '''Fetch block contents from disk given extents'''
        with open(self.inFileName(extent.fn), "rb") as f:
            f.seek(extent.offset)
            return f.read(extent.size)

    def copyOneBlock(self):
        '''Find the next block to be written in the input, and copy it to the output.'''
        extent = self.blockExtents.pop(self.blkCountOut)
        if self.blkCountOut in self.outOfOrderData:
            # If the data is cached, use it from memory and remove from the cache
            rawblock = self.outOfOrderData.pop(self.blkCountOut)
            self.outOfOrderSize -= len(rawblock)
        else:  # Otherwise look up data on disk
            rawblock = self.fetchBlock(extent)

        self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

    def run(self):
        """Scan the input files and write every indexed block in height order.

        Returns early (after printing a message) if the input files run out
        before all blocks were written, or if a block prefix carries an
        unexpected magic value.  Any file still open is closed on exit.
        """
        try:
            while self.blkCountOut < len(self.blkindex):
                if not self.inF:
                    fname = self.inFileName(self.inFn)
                    print("Input file" + fname)
                    try:
                        self.inF = open(fname, "rb")
                    except IOError:
                        print("Premature end of block data")
                        return

                inhdr = self.inF.read(8)
                # End of file, or a prefix starting with a NUL byte
                # (presumably preallocated padding): move to the next file.
                # Slicing keeps the comparison valid for bytes on Python 3.
                if (not inhdr or (inhdr[0:1] == b"\0")):
                    self.inF.close()
                    self.inF = None
                    self.inFn = self.inFn + 1
                    continue

                inMagic = inhdr[:4]
                if (inMagic != self.settings['netmagic']):
                    print("Invalid magic:" + inMagic)
                    return
                inLenLE = inhdr[4:]
                su = struct.unpack("<I", inLenLE)
                inLen = su[0] - 80  # length without header
                blk_hdr = self.inF.read(80)
                inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

                hash_str = calc_hash_str(blk_hdr)
                if hash_str not in self.blkmap:
                    print("Skipping unknown block " + hash_str)
                    self.inF.seek(inLen, os.SEEK_CUR)
                    continue

                blkHeight = self.blkmap[hash_str]
                self.blkCountIn += 1

                if self.blkCountOut == blkHeight:
                    # If in-order block, just copy
                    rawblock = self.inF.read(inLen)
                    self.writeBlock(inhdr, blk_hdr, rawblock)

                    # See if we can catch up to prior out-of-order blocks
                    while self.blkCountOut in self.blockExtents:
                        self.copyOneBlock()

                else:  # If out-of-order, skip over block data for now
                    self.blockExtents[blkHeight] = inExtent
                    if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
                        # If there is space in the cache, read the data.
                        # Reading the data in file sequence instead of seeking
                        # and fetching it later is preferred, but we don't
                        # want to fill up memory.  The limit is checked before
                        # the read, so the cache may overshoot by one block.
                        self.outOfOrderData[blkHeight] = self.inF.read(inLen)
                        self.outOfOrderSize += inLen
                    else:  # If no space in cache, seek forward
                        self.inF.seek(inLen, os.SEEK_CUR)

            print("Done (%i blocks written)" % (self.blkCountOut))
        finally:
            self._closeFiles()
187246

188247
if __name__ == '__main__':
189248
if len(sys.argv) != 2:
190-
print "Usage: linearize-data.py CONFIG-FILE"
249+
print("Usage: linearize-data.py CONFIG-FILE")
191250
sys.exit(1)
192251

193252
f = open(sys.argv[1])
@@ -216,22 +275,25 @@ def copydata(settings, blkindex, blkset):
216275
settings['split_timestamp'] = 0
217276
if 'max_out_sz' not in settings:
218277
settings['max_out_sz'] = 1000L * 1000 * 1000
278+
if 'out_of_order_cache_sz' not in settings:
279+
settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
219280

220281
settings['max_out_sz'] = long(settings['max_out_sz'])
221282
settings['split_timestamp'] = int(settings['split_timestamp'])
222283
settings['file_timestamp'] = int(settings['file_timestamp'])
223284
settings['netmagic'] = settings['netmagic'].decode('hex')
285+
settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
224286

225287
if 'output_file' not in settings and 'output' not in settings:
226288
print("Missing output file / directory")
227289
sys.exit(1)
228290

229291
blkindex = get_block_hashes(settings)
230-
blkset = mkblockset(blkindex)
292+
blkmap = mkblockmap(blkindex)
231293

232-
if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkset:
294+
if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkmap:
233295
print("not found")
234296
else:
235-
copydata(settings, blkindex, blkset)
297+
BlockDataCopier(settings, blkindex, blkmap).run()
236298

237299

0 commit comments

Comments
 (0)