# marker_app.py
# Streamlit demo app for marker (this copy is forked from VikParuchuri/marker).
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["IN_STREAMLIT"] = "true"
os.environ["PDFTEXT_CPU_WORKERS"] = "1"
import base64
import io
import re
import tempfile
from typing import Any, Dict, List
import pypdfium2
import streamlit as st
from surya.languages import CODE_TO_LANGUAGE
from marker.convert import convert_single_pdf
from marker.models import load_all_models
@st.cache_resource()
def load_models():
    """Load the marker models once and let Streamlit reuse them.

    The function is wrapped in ``st.cache_resource`` so the result of
    ``load_all_models()`` is kept by Streamlit instead of being rebuilt on
    every call.

    Returns:
        Whatever ``load_all_models()`` returns, unchanged.
    """
    return load_all_models()
def convert_pdf(
fname: str, langs: List[str] | None, max_pages: int | None, start_page: int | None, ocr_all_pages: bool
) -> (str, Dict[str, Any], dict):
full_text, images, out_meta = convert_single_pdf(
fname, model_lst, max_pages=max_pages, langs=langs, start_page=start_page, ocr_all_pages=ocr_all_pages
)
return full_text, images, out_meta
def open_pdf(pdf_file):
    """Open an in-memory PDF with pypdfium2.

    Args:
        pdf_file: Object exposing ``getvalue()`` that returns the PDF bytes
            (in this script, the file returned by the Streamlit uploader).

    Returns:
        A ``pypdfium2.PdfDocument`` built from a ``BytesIO`` holding those bytes.
    """
    stream = io.BytesIO(pdf_file.getvalue())
    return pypdfium2.PdfDocument(stream)
def img_to_html(img, img_alt):
    """Render an image as a self-contained HTML ``<img>`` tag.

    The image is saved as PNG into memory and embedded as a base64 ``data:`` URI,
    so the tag needs no external file.

    Args:
        img: Object with a ``save(fp, format=...)`` method (e.g. a PIL image).
        img_alt: Alternative text for the tag.

    Returns:
        The ``<img ...>`` tag as a string.
    """
    from html import escape  # function-scope import keeps the block self-contained

    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode()

    # The alt text is taken from converted document content and the tag is later
    # rendered with unsafe_allow_html=True, so escape it (quotes included) to keep
    # it from breaking out of the attribute.
    safe_alt = escape(str(img_alt), quote=True)
    return f'<img src="data:image/png;base64,{encoded}" alt="{safe_alt}" style="max-width: 100%;">'
def markdown_insert_images(markdown, images):
    """Replace markdown image tags with inline HTML images.

    Every ``![alt](path ...)`` tag whose ``path`` is a key of ``images`` is
    replaced (all textual occurrences) by the HTML produced by ``img_to_html``.
    Tags whose path is not in ``images`` are left untouched.

    Args:
        markdown: Markdown text to process.
        images: Mapping from image path (as written in the markdown) to an image
            object accepted by ``img_to_html``.

    Returns:
        The markdown text with the known image tags substituted.
    """
    pattern = r'(!\[(?P<image_title>[^\]]+)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))'

    # Collect the tags from the original text first; ``markdown`` is rebound below.
    found = [
        (m.group(1), m.group("image_title"), m.group("image_path"))
        for m in re.finditer(pattern, markdown)
    ]

    done = set()
    for tag_text, alt_text, path in found:
        # str.replace already handled every occurrence of an identical tag, so
        # encoding the image again would be wasted work.
        if tag_text in done or path not in images:
            continue
        markdown = markdown.replace(tag_text, img_to_html(images[path], alt_text))
        done.add(tag_text)
    return markdown
@st.cache_data()
def get_page_image(pdf_file, page_num, dpi=96):
    """Render one page of an uploaded PDF as an RGB image.

    Wrapped in ``st.cache_data`` so Streamlit can reuse the result.

    Args:
        pdf_file: Uploaded PDF, as accepted by ``open_pdf``.
        page_num: 1-based page number; it is converted to a 0-based index
            for pypdfium2.
        dpi: Target resolution. The render scale is ``dpi / 72``
            (72 presumably being PDF points per inch).

    Returns:
        The rendered page converted to mode ``"RGB"``.
    """
    doc = open_pdf(pdf_file)
    renderer = doc.render(
        pypdfium2.PdfBitmap.to_pil,
        page_indices=[page_num - 1],
        scale=dpi / 72,
    )
    # Only one index was requested, so the first rendered item is the page.
    page_image = list(renderer)[0]
    return page_image.convert("RGB")
@st.cache_data()
def page_count(pdf_file):
    """Return the number of pages in an uploaded PDF.

    Wrapped in ``st.cache_data`` so Streamlit can reuse the result.

    Args:
        pdf_file: Uploaded PDF, as accepted by ``open_pdf``.

    Returns:
        ``len()`` of the opened pypdfium2 document.
    """
    doc = open_pdf(pdf_file)
    return len(doc)
# ---- Page layout and shared resources ----
st.set_page_config(layout="wide")
col1, col2 = st.columns([0.5, 0.5])

# convert_pdf() reads this module-level name, so it must be assigned before conversion runs.
model_lst = load_models()

st.markdown(
    """
# Marker Demo
This app will let you try marker, a PDF -> Markdown converter. It works with any languages, and extracts images, tables, equations, etc.
Find the project [here](https://github.com/VikParuchuri/marker).
"""
)

# ---- Sidebar inputs ----
in_file = st.sidebar.file_uploader("PDF file:", type=["pdf"])
languages = st.sidebar.multiselect(
    "Languages",
    sorted(CODE_TO_LANGUAGE.values()),
    default=[],
    max_selections=4,
    help="Select the languages in the pdf (if known) to improve OCR accuracy. Optional.",
)
start_page = st.sidebar.number_input("Start page", min_value=1, value=1, help="Optional start page number")
max_pages = st.sidebar.number_input(
    "Max pages to parse", min_value=1, value=10, help="Optional maximum number of pages to convert"
)
ocr_all_pages = st.sidebar.checkbox(
    "Force OCR on all pages", help="Force OCR on all pages, even if they are images", value=False
)

# Nothing to preview or convert until a file has been uploaded.
if in_file is None:
    st.stop()

# ---- Left column: page preview ----
with col1:
    # Use a distinct name so the page_count() function is not overwritten by its own result.
    num_pages = page_count(in_file)
    page_number = st.number_input(f"Page number out of {num_pages}:", min_value=1, value=1, max_value=num_pages)
    pil_image = get_page_image(in_file, page_number)
    st.image(pil_image, caption="PDF file (preview)", use_column_width=True)

run_marker = st.sidebar.button("Run Marker")
if not run_marker:
    st.stop()

# Run Marker
# NOTE(review): the temp file is reopened by name while still open here; the tempfile docs say this
# is restricted on Windows with the default delete behaviour — confirm if Windows must be supported.
with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
    temp_pdf.write(in_file.getvalue())
    # Make sure the bytes are on disk before marker opens the file by path.
    temp_pdf.flush()
    temp_pdf.seek(0)
    filename = temp_pdf.name
    # The sidebar start page is 1-based; the value is shifted down by one before it is passed on.
    md_text, images, out_meta = convert_pdf(filename, languages, max_pages, start_page - 1, ocr_all_pages)

md_text = markdown_insert_images(md_text, images)

# ---- Right column: converted output ----
with col2:
    # The text now contains inline <img> tags, hence unsafe_allow_html.
    st.markdown(md_text, unsafe_allow_html=True)