@@ -2,11 +2,12 @@
 #
 # linearize-data.py: Construct a linear, no-fork version of the chain.
 #
-# Copyright (c) 2013 The Bitcoin developers
+# Copyright (c) 2013-2014 The Bitcoin developers
 # Distributed under the MIT/X11 software license, see the accompanying
 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
 #
 
+from __future__ import print_function, division
 import json
 import struct
 import re
@@ -17,10 +18,10 @@
 import hashlib
 import datetime
 import time
+from collections import namedtuple
 
 settings = {}
 
-
 def uint32(x):
 	return x & 0xffffffffL
 
@@ -78,116 +79,174 @@ def get_block_hashes(settings):
 
 	return blkindex
 
-def mkblockset(blkindex):
+def mkblockmap(blkindex):
 	blkmap = {}
-	for hash in blkindex:
-		blkmap[hash] = True
+	for height,hash in enumerate(blkindex):
+		blkmap[hash] = height
 	return blkmap
 
-def copydata(settings, blkindex, blkset):
-	inFn = 0
-	inF = None
-	outFn = 0
-	outsz = 0
-	outF = None
-	outFname = None
-	blkCount = 0
-
-	lastDate = datetime.datetime(2000, 1, 1)
-	highTS = 1408893517 - 315360000
-	timestampSplit = False
-	fileOutput = True
-	setFileTime = False
-	maxOutSz = settings['max_out_sz']
-	if 'output' in settings:
-		fileOutput = False
-	if settings['file_timestamp'] != 0:
-		setFileTime = True
-	if settings['split_timestamp'] != 0:
-		timestampSplit = True
-
-	while True:
-		if not inF:
-			fname = "%s/blk%05d.dat" % (settings['input'], inFn)
-			print("Input file" + fname)
-			try:
-				inF = open(fname, "rb")
-			except IOError:
-				print "Done"
-				return
-
-		inhdr = inF.read(8)
-		if (not inhdr or (inhdr[0] == "\0")):
-			inF.close()
-			inF = None
-			inFn = inFn + 1
-			continue
-
-		inMagic = inhdr[:4]
-		if (inMagic != settings['netmagic']):
-			print("Invalid magic:" + inMagic)
-			return
-		inLenLE = inhdr[4:]
-		su = struct.unpack("<I", inLenLE)
-		inLen = su[0]
-		rawblock = inF.read(inLen)
-		blk_hdr = rawblock[:80]
-
-		hash_str = calc_hash_str(blk_hdr)
-		if not hash_str in blkset:
-			print("Skipping unknown block " + hash_str)
-			continue
-
-		if blkindex[blkCount] != hash_str:
-			print("Out of order block.")
-			print("Expected " + blkindex[blkCount])
-			print("Got " + hash_str)
-			sys.exit(1)
-
-		if not fileOutput and ((outsz + inLen) > maxOutSz):
-			outF.close()
-			if setFileTime:
+# Block header and extent on disk
+BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
+
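+# BlockDataCopier scans the input blk*.dat files once, in file order. Blocks
+# that appear on disk ahead of their height are recorded by extent (and cached
+# in memory while the cache stays under out_of_order_cache_sz) until the
+# output position catches up to them.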
+class BlockDataCopier:
+	def __init__(self, settings, blkindex, blkmap):
+		self.settings = settings
+		self.blkindex = blkindex
+		self.blkmap = blkmap
+
+		self.inFn = 0
+		self.inF = None
+		self.outFn = 0
+		self.outsz = 0
+		self.outF = None
+		self.outFname = None
+		self.blkCountIn = 0
+		self.blkCountOut = 0
+
+		self.lastDate = datetime.datetime(2000, 1, 1)
+		self.highTS = 1408893517 - 315360000
+		self.timestampSplit = False
+		self.fileOutput = True
+		self.setFileTime = False
+		self.maxOutSz = settings['max_out_sz']
+		if 'output' in settings:
+			self.fileOutput = False
+		if settings['file_timestamp'] != 0:
+			self.setFileTime = True
+		if settings['split_timestamp'] != 0:
+			self.timestampSplit = True
+		# Extents and cache for out-of-order blocks
+		self.blockExtents = {}
+		self.outOfOrderData = {}
+		self.outOfOrderSize = 0 # running total size for items in outOfOrderData
+
+	def writeBlock(self, inhdr, blk_hdr, rawblock):
+		if not self.fileOutput and ((self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)) > self.maxOutSz):
+			self.outF.close()
+			if self.setFileTime:
-				os.utime(outFname, (int(time.time()), highTS))
-			outF = None
-			outFname = None
-			outFn = outFn + 1
-			outsz = 0
+				os.utime(self.outFname, (int(time.time()), self.highTS))
+			self.outF = None
+			self.outFname = None
+			self.outFn = self.outFn + 1
+			self.outsz = 0
 
 		(blkDate, blkTS) = get_blk_dt(blk_hdr)
-		if timestampSplit and (blkDate > lastDate):
-			print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
-			lastDate = blkDate
-			if outF:
-				outF.close()
-				if setFileTime:
-					os.utime(outFname, (int(time.time()), highTS))
-				outF = None
-				outFname = None
-				outFn = outFn + 1
-				outsz = 0
-
-		if not outF:
-			if fileOutput:
-				outFname = settings['output_file']
+		if self.timestampSplit and (blkDate > self.lastDate):
+			print("New month " + blkDate.strftime("%Y-%m") + " @ " + calc_hash_str(blk_hdr))
+			self.lastDate = blkDate
+			if self.outF:
+				self.outF.close()
+				if self.setFileTime:
+					os.utime(self.outFname, (int(time.time()), self.highTS))
+				self.outF = None
+				self.outFname = None
+				self.outFn = self.outFn + 1
+				self.outsz = 0
+
+		if not self.outF:
+			if self.fileOutput:
+				self.outFname = self.settings['output_file']
 			else:
-				outFname = "%s/blk%05d.dat" % (settings['output'], outFn)
-			print("Output file" + outFname)
+				self.outFname = "%s/blk%05d.dat" % (self.settings['output'], self.outFn)
+			print("Output file " + self.outFname)
-			outF = open(outFname, "wb")
-
-		outF.write(inhdr)
-		outF.write(rawblock)
-		outsz = outsz + inLen + 8
-
-		blkCount = blkCount + 1
-		if blkTS > highTS:
-			highTS = blkTS
-
-		if (blkCount % 1000) == 0:
-			print("Wrote " + str(blkCount) + " blocks")
+			self.outF = open(self.outFname, "wb")
+
+		self.outF.write(inhdr)
+		self.outF.write(blk_hdr)
+		self.outF.write(rawblock)
+		self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)
+
+		self.blkCountOut = self.blkCountOut + 1
+		if blkTS > self.highTS:
+			self.highTS = blkTS
+
+		if (self.blkCountOut % 1000) == 0:
+			print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
+				(self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))
+
+	def inFileName(self, fn):
+		return "%s/blk%05d.dat" % (self.settings['input'], fn)
+
+	def fetchBlock(self, extent):
+		'''Fetch block contents from disk given extents'''
+		with open(self.inFileName(extent.fn), "rb") as f:
+			f.seek(extent.offset)
+			return f.read(extent.size)
+
+	def copyOneBlock(self):
+		'''Find the next block to be written in the input, and copy it to the output.'''
+		extent = self.blockExtents.pop(self.blkCountOut)
+		if self.blkCountOut in self.outOfOrderData:
+			# If the data is cached, use it from memory and remove from the cache
+			rawblock = self.outOfOrderData.pop(self.blkCountOut)
+			self.outOfOrderSize -= len(rawblock)
+		else: # Otherwise look up data on disk
+			rawblock = self.fetchBlock(extent)
+
+		self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)
+
+	def run(self):
+		while self.blkCountOut < len(self.blkindex):
+			if not self.inF:
+				fname = self.inFileName(self.inFn)
+				print("Input file " + fname)
+				try:
+					self.inF = open(fname, "rb")
+				except IOError:
+					print("Premature end of block data")
+					return
+
+			inhdr = self.inF.read(8)
+			if (not inhdr or (inhdr[0] == "\0")):
+				self.inF.close()
+				self.inF = None
+				self.inFn = self.inFn + 1
+				continue
+
+			inMagic = inhdr[:4]
+			if (inMagic != self.settings['netmagic']):
+				print("Invalid magic: " + inMagic)
+				return
+			inLenLE = inhdr[4:]
+			su = struct.unpack("<I", inLenLE)
+			inLen = su[0] - 80 # length without header
+			blk_hdr = self.inF.read(80)
+			inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)
+
+			hash_str = calc_hash_str(blk_hdr)
+			if not hash_str in self.blkmap:
+				print("Skipping unknown block " + hash_str)
+				self.inF.seek(inLen, os.SEEK_CUR)
+				continue
+
+			blkHeight = self.blkmap[hash_str]
+			self.blkCountIn += 1
+
+			if self.blkCountOut == blkHeight:
+				# If in-order block, just copy
+				rawblock = self.inF.read(inLen)
+				self.writeBlock(inhdr, blk_hdr, rawblock)
+
+				# See if we can catch up to prior out-of-order blocks
+				while self.blkCountOut in self.blockExtents:
+					self.copyOneBlock()
+
+			else: # If out-of-order, skip over block data for now
+				self.blockExtents[blkHeight] = inExtent
+				if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
+					# If there is space in the cache, read the data
+					# Reading the data in file sequence instead of seeking and fetching it later is preferred,
+					# but we don't want to fill up memory
+					self.outOfOrderData[blkHeight] = self.inF.read(inLen)
+					self.outOfOrderSize += inLen
+				else: # If no space in cache, seek forward
+					self.inF.seek(inLen, os.SEEK_CUR)
+
+		print("Done (%i blocks written)" % (self.blkCountOut))
 
 if __name__ == '__main__':
 	if len(sys.argv) != 2:
-		print "Usage: linearize-data.py CONFIG-FILE"
+		print("Usage: linearize-data.py CONFIG-FILE")
 		sys.exit(1)
 
 	f = open(sys.argv[1])
@@ -216,22 +275,25 @@ def copydata(settings, blkindex, blkset):
 		settings['split_timestamp'] = 0
 	if 'max_out_sz' not in settings:
 		settings['max_out_sz'] = 1000L * 1000 * 1000
+	if 'out_of_order_cache_sz' not in settings:
+		settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
 
 	settings['max_out_sz'] = long(settings['max_out_sz'])
 	settings['split_timestamp'] = int(settings['split_timestamp'])
 	settings['file_timestamp'] = int(settings['file_timestamp'])
 	settings['netmagic'] = settings['netmagic'].decode('hex')
+	settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
 
 	if 'output_file' not in settings and 'output' not in settings:
 		print("Missing output file / directory")
 		sys.exit(1)
 
 	blkindex = get_block_hashes(settings)
-	blkset = mkblockset(blkindex)
+	blkmap = mkblockmap(blkindex)
 
-	if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkset:
+	if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkmap:
 		print("not found")
 	else:
-		copydata(settings, blkindex, blkset)
+		BlockDataCopier(settings, blkindex, blkmap).run()
 
 
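The new code calls helpers defined earlier in linearize-data.py, outside this hunk: get_block_hashes() returns the block hashes in height order (its source is outside this excerpt), calc_hash_str() hashes an 80-byte block header, and get_blk_dt() extracts its timestamp. As a sketch only, assuming the standard Bitcoin header layout (the block hash is the byte-reversed double SHA-256 of the header, and the timestamp is the little-endian uint32 at byte offset 68), the two header helpers could look like the following; the file's actual definitions may differ:

	# Sketch of the header helpers assumed by the diff (Python 2, like the script).
	import hashlib
	import struct
	import datetime

	def calc_hash_str(blk_hdr):
		# Double SHA-256 of the 80-byte header, shown byte-reversed as hex
		h = hashlib.sha256(hashlib.sha256(blk_hdr).digest()).digest()
		return h[::-1].encode('hex')

	def get_blk_dt(blk_hdr):
		# Header timestamp: little-endian uint32 at byte offset 68;
		# return the month (for the split_timestamp logic) and the raw timestamp
		(nTime,) = struct.unpack("<I", blk_hdr[68:68+4])
		dt = datetime.datetime.fromtimestamp(nTime)
		return (datetime.datetime(dt.year, dt.month, 1), nTime)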
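Usage is unchanged: python linearize-data.py CONFIG-FILE. A hypothetical config exercising the keys the script reads above, including the new out_of_order_cache_sz knob (key=value format assumed; the netmagic value is mainnet's, the hashlist key is assumed for get_block_hashes, and all paths and sizes are illustrative):

	# Hypothetical example configuration -- adjust paths for your setup.
	netmagic=f9beb4d9
	input=/home/example/.bitcoin/blocks
	hashlist=hashlist.txt
	output_file=/home/example/bootstrap.dat
	max_out_sz=1000000000
	split_timestamp=0
	file_timestamp=0
	out_of_order_cache_sz=100000000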