Created
March 17, 2014 22:49
-
-
Save acdha/9610005 to your computer and use it in GitHub Desktop.
Fragment of code used to process images with Tesseract OCR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _parse_hocr_bbox(title):
    """Return the ``bbox`` property found in an hOCR ``title`` attribute.

    hOCR packs several ``;``-separated properties into ``title`` (for
    example ``bbox 0 0 640 480; x_wconf 93``), so the value is split on
    ``;`` first and only the property whose name is exactly ``bbox`` is
    used.

    Returns the bbox values as a list of ints (at most four), or ``None``
    when the title holds no ``bbox`` property.
    """
    for prop in title.split(";"):
        fields = prop.split()
        if fields and fields[0] == 'bbox':
            # A real list (not ``map``) so the result can be indexed and
            # JSON-serialised on Python 3 as well as Python 2.
            return [int(value) for value in fields[1:5]]
    return None


def ocr_file(filename, languages, output_base, temp_dir):
    """Run Tesseract on ``filename`` and store the derived OCR artifacts.

    Four files are written to ``OCR_STORAGE`` under
    ``<item_id>/<group>/<index>``: the Tesseract log (``.log``), the
    bz2-compressed hOCR (``.html.bz2``), the bz2-compressed plain text
    (``.txt.bz2``) and the bz2-compressed JSON word coordinates
    (``.word_coordinates.json.bz2``).

    NOTE(review): ``item_id``, ``group`` and ``index`` are not defined in
    this fragment -- presumably they come from the enclosing scope; confirm
    before reusing this function elsewhere.

    :param filename: image path handed to the ``tesseract`` command.
    :param languages: iterable of language codes, joined with ``+`` for
        Tesseract's ``-l`` option.
    :param output_base: output base name passed to Tesseract; the hOCR is
        then read from ``<temp_dir>/<output_base>.html``.
    :param temp_dir: working directory Tesseract is run in.
    :returns: ``None``.  Returns early, without storing word coordinates,
        when the page element carries no bounding box.
    :raises subprocess.CalledProcessError: if Tesseract exits non-zero.
    :raises AssertionError: if the hOCR does not have exactly one page,
        the page box does not start at the origin, or a word element has
        no bounding box.
    """
    log.info("Launching tesseract on %s", filename)

    # stderr is folded into stdout so the stored log holds everything
    # Tesseract printed.
    output = subprocess.check_output(['tesseract', filename, output_base,
                                      '-l', '+'.join(languages), TESSERACT_CONFIG],
                                     cwd=temp_dir,
                                     stderr=subprocess.STDOUT)

    with OCR_STORAGE.open('%s/%s/%s.log' % (item_id, group, index), 'w') as log_f:
        log_f.write(output)

    log.info("Processing hOCR output")

    hocr_file = os.path.join(temp_dir, '%s.html' % output_base)

    with open(hocr_file, 'rb') as f:
        hocr_bytes = f.read()

    OCR_STORAGE.save('%s/%s/%s.html.bz2' % (item_id, group, index),
                     ContentFile(bz2.compress(hocr_bytes)))

    log.info("Extracting plain text")

    # Kludge around https://bugs.launchpad.net/ubuntu/+source/tesseract/+bug/1094145
    # The lossy decode/encode round trip replaces invalid UTF-8 sequences
    # before lxml parses the document.
    html = lxml.html.document_fromstring(hocr_bytes.decode("utf-8", "replace").encode("utf-8"),
                                         parser=UTF8_PARSER)

    # Extract the text for Solr: one entry per <p>, empty ones dropped.
    paragraphs = []
    for p in html.cssselect('p'):
        paragraphs.append(u" ".join(i.text for i in p.iterdescendants() if i.text).strip())

    text = u"\n\n".join(filter(None, paragraphs))

    OCR_STORAGE.save('%s/%s/%s.txt.bz2' % (item_id, group, index),
                     ContentFile(bz2.compress(text.encode("utf-8"))))

    log.info("Extracting word coordinates")

    pages = html.cssselect('.ocr_page')
    assert len(pages) == 1, "expected exactly one .ocr_page, found %d" % len(pages)

    page_bbox = _parse_hocr_bbox(pages[0].attrib['title'])
    if page_bbox is None:
        log.warning('Page did not contain bounding box information - no word coordinates!')
        return

    # The page box is expected to start at the origin, so its far corner
    # doubles as the page width and height.
    assert page_bbox[0] == 0, "page bbox does not start at x=0: %r" % (page_bbox,)
    assert page_bbox[1] == 0, "page bbox does not start at y=0: %r" % (page_bbox,)
    page_width = page_bbox[2]
    page_height = page_bbox[3]

    # Map each recognised term to every bounding box where it occurs.
    word_coords = defaultdict(list)

    for word in html.cssselect('.ocrx_word,.ocr_word'):
        term = inner_text(word)
        word_bbox = _parse_hocr_bbox(word.attrib['title'])
        assert word_bbox is not None, "word element has no bbox: %r" % (word.attrib['title'],)
        word_coords[term].append(word_bbox)

    coordinates = {"height": page_height, "width": page_width,
                   "words": word_coords}

    coord_file = "%s/%s/%s.word_coordinates.json.bz2" % (item_id, group, index)
    # bz2.compress needs bytes; encoding the JSON text is harmless on
    # Python 2 and required on Python 3.
    coord_data = bz2.compress(simplejson.dumps(coordinates).encode("utf-8"))
    OCR_STORAGE.save(coord_file, ContentFile(coord_data))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment