Skip to content

Commit

Permalink
working on html compression
Browse files Browse the repository at this point in the history
  • Loading branch information
eyaler committed Oct 6, 2022
1 parent 364d133 commit 2ca0d7c
Show file tree
Hide file tree
Showing 12 changed files with 81 additions and 66 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ The pipeline includes efficient binary-to-text alternatives to Base64 which are
| Project Gutenberg plain text utf8 | txt | 63.7 kB | 3.2 MB |
| [paq8px_v206fix1](http://www.mattmahoney.net/dc/text.html#1250) -12RT (excluding decoder) | paq | 13.3 kB (21%) | 575 kB (18%) |
| 7-Zip 22.01 9 Ultra PPMd (excluding decoder) | 7z | 20.8 kB (32%) | 746 kB (23%) |
| 7-Zip 22.01 9 Ultra PPMd (self extracting) | exe | 232 kB (364%) | 958 kB (29%) |
| 7-Zip 22.01 9 Ultra PPMd (self-extracting) | exe | 232 kB (364%) | 958 kB (29%) |
| [Roadroller](https://github.com/lifthrasiir/roadroller) 2.1.0 -O2 | js | 26.5 kB (42%) | 1.0 MB (30%) |
| **ZTML Base125** | html (utf8) | 26.5 kB (42%) `mtf=0` | 916 kB (28%) `mtf=80` |
| **ZTML crEnc** | html (cp1252) | 23.8 kB (37%) `mtf=0` | 818 kB (25%) `mtf=80` |
Expand All @@ -42,6 +42,8 @@ and [example_image.py](example_image.py) for an inline image encoding example.
Outputs of these runs can be accessed at [eyalgruss.com/ztml](https://eyalgruss.com/ztml).
On top of the built-in validations for Chrome, Edge and Firefox, these were also manually tested on macOS Monterey 12.5 Safari 15.6 and iOS 16.0 Safari.

A quick and dirty way to compress an existing single-page website with embedded inline media is to use `raw=True`.

### Caveats
1. Files larger than a few MB might not work on [iOS Safari](https://pqina.nl/blog/canvas-area-exceeds-the-maximum-limit) or [macOS Safari 15](https://bugs.webkit.org/show_bug.cgi?id=230855).
2. This solution favors compression rate over compression and decompression times. Use `mtf=None` for faster decompression of large files.
Expand All @@ -50,7 +52,7 @@ On top of the built-in validations for Chrome, Edge and Firefox, these were also
### ZTML pipeline breakdown
1. [Text normalization](ztml/text_prep.py) (irreversible; reduce whitespace, substitute unicode punctuation)
2. [Text condensation](ztml/text_prep.py) (reversible; lowercase with automatic capitalization, substitute common strings as: the, qu)
3. [Burrows–Wheeler + Move-to-front transforms](ztml/bwt_mtf.py) on text with some optional variants, including some new ones (beneficial for large texts)
3. [Burrows–Wheeler + Move-to-front transforms](ztml/bwt_mtf.py) on text with some optional variants, including some new ones (beneficial for large texts with higher mtf settings)
4. [Huffman encoding](ztml/huffman.py) (with a [codebook-free decoder](https://researchgate.net/publication/3159499_On_the_implementation_of_minimum_redundancy_prefix_codes), beneficial even as followed by DEFLATE)
5. [Burrows–Wheeler transform](ztml/bwt_mtf.py) on bits (beneficial for large texts)
6. [PNG / DEFLATE compression](ztml/deflate.py) (allowing [native decompression](https://web.archive.org/web/20090220141811/http://blog.nihilogic.dk/2008/05/compression-using-canvas-and-png.html
Expand Down
2 changes: 1 addition & 1 deletion TODO.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Todo

### Usability
- Support encoding video/audio
- Support encoding video/audio/fonts/...
- Support encoding multiple media elements
- Linux installation instructions / Enable validation in Google Colab
- Make into a PIP library and start doing versioning
Expand Down
17 changes: 10 additions & 7 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@
books = [30123, 2600]
mtf_variants = [0, 80]
output_folder = 'output'
skip_exists = True
skip_download_exists = True
element_id = ''


assert len(books) == len(mtf_variants)
error = False
for item, mtf in zip(books, mtf_variants):
item_start_time = time()
filenames = dict(raw=f'{item}.txt',
Expand All @@ -27,7 +28,7 @@
filenames = {k: os.path.join(output_folder, v) for k, v in filenames.items()}

# If missing, download an example file from the web
if not skip_exists or not os.path.exists(filenames['raw']):
if not skip_download_exists or not os.path.exists(filenames['raw']):
from gutenberg.acquire.text import load_etext
with open(filenames['raw'], 'wb') as f:
f.write(load_etext(item).encode())
Expand All @@ -37,16 +38,18 @@

cnt = 0
for label, filename in filenames.items():
ext = os.path.splitext(filename)[-1][1:]
if ext not in ['js', 'html']:
if label == 'raw':
continue
file = ztml.ztml(data, filename, mtf=mtf, bin2txt=label.split('_', 1)[0], element_id=element_id)
file = ztml.ztml(data, filename, mtf=mtf, bin2txt=label.rsplit('_', 1)[0], element_id=element_id)
cnt += 1

print(f'{cnt} encodings of {item} took {(time()-item_start_time) / 60 :.1f} min.')

# Compare file sizes and validate data is recovered
validation.validate_files(filenames, by='id' if element_id else '', element=element_id)
error |= validation.validate_files(filenames, by='id' if element_id else '', element=element_id)
print()

print(f'Total of {len(books)} books took {(time()-start_time) / 60 :.1f} min.')
if error:
print('Error: some renderings timed out')
else:
print(f'Total of {len(books)} books took {(time()-start_time) / 60 :.1f} min.')
17 changes: 10 additions & 7 deletions example_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@
'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.webp'
]
output_folder = 'output'
skip_exists = True
skip_download_exists = True
element_id = ''


error = False
for url in image_urls:
item_start_time = time()
item = url.rsplit('/', 1)[-1]
Expand All @@ -32,7 +33,7 @@
filenames = {k: os.path.join(output_folder, v) for k, v in filenames.items()}

# If missing, download an example file from the web
if not skip_exists or not os.path.exists(filenames['raw']):
if not skip_download_exists or not os.path.exists(filenames['raw']):
with urlopen(url) as fin, open(filenames['raw'], 'wb') as fout:
fout.write(fin.read())

Expand All @@ -41,16 +42,18 @@

cnt = 0
for label, filename in filenames.items():
ext = os.path.splitext(filename)[-1][1:]
if ext not in ['js', 'html']:
if label == 'raw':
continue
file = ztml.ztml(data, filename, bin2txt=label.split('_', 1)[0], element_id=element_id, image=True)
file = ztml.ztml(data, filename, bin2txt=label.rsplit('_', 1)[0], element_id=element_id, image=True)
cnt += 1

print(f'{cnt} encodings of {item} took {(time()-item_start_time) / 60 :.1f} min.')

# Compare file sizes and validate data is recovered
validation.validate_files(filenames, by='id' if element_id else '', element=element_id, image=True)
error |= validation.validate_files(filenames, by='id' if element_id else '', element=element_id, image=True)
print()

print(f'Total of {len(image_urls)} images took {(time()-start_time) / 60 :.1f} min.')
if error:
print('Error: some renderings timed out')
else:
print(f'Total of {len(image_urls)} images took {(time()-start_time) / 60 :.1f} min.')
2 changes: 1 addition & 1 deletion misc/minibook.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@


with urlopen('https://xem.github.io/miniBook/example') as f:
out, result = ztml.ztml(f.read(), f'index.html', mtf=80, raw=True, validate=True, ignore_regex='</xmp>')
out, result = ztml.ztml(f.read(), f'index.html', mtf=80, raw=True, validate=True)
print(f'{len(out):,} B')
assert not result
9 changes: 6 additions & 3 deletions ztml/bwt_mtf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
"""Burrows-Wheeler and Move-to-front transforms
Implementation follows pydivsufsort tests, to unnecessitate adding an EOF token.
Applies alphabet reordering by default to concentrate the vowels together.
BWT implementation follows pydivsufsort tests, to obviate adding an EOF token.
MTF includes new variants (50-90), where larger texts benefit from higher mtf settings.
Additional BWT on bits (after entropy coding) was found beneficial for large texts.
References:
https://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf
Expand All @@ -22,10 +25,10 @@
from . import default_vars


mtf_variants = [None, 0, 1, 2, 50, 52, 60, 70, 80, 90]
default_mtf = 0
order1 = 'AOUIEVWXYZaouievwxyz'
order2 = 'VWXYZAOUIEvwxyzaouie'
mtf_variants = [None, 0, 1, 2, 50, 52, 60, 70, 80, 90]
default_mtf = 0


reorder_table = str.maketrans(order1, order2)
Expand Down
6 changes: 3 additions & 3 deletions ztml/default_vars.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
bytearray = 'o'
bitarray = 'b'
bwt_func = '$'
bitarray = 'b'
bytearray = 'o'
content = 't'
image = 'i'
payload = 's'
content = 't'
6 changes: 3 additions & 3 deletions ztml/huffman.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ def encode(text: str,

def get_js_decoder(charset: str,
canonical_table: str,
text_var: str = default_vars.content,
bitarray_var: str = default_vars.bitarray,
text_var: str = default_vars.content,
) -> str:
charset = charset.replace('\\', '\\\\').replace('\0', '\\0').replace('\n', '\\n').replace('\r', '\\r').replace("'", "\\'")
return f'''s=[...'{charset}']
Expand All @@ -76,10 +76,10 @@ def get_js_decoder(charset: str,


def encode_and_get_js_decoder(text: str,
text_var: str = default_vars.content,
bitarray_var: str = default_vars.bitarray,
text_var: str = default_vars.content,
validate: bool = True,
verbose: bool = False
) -> Tuple[List[int], str]:
bits, charset, canonical_table, _ = encode(text, validate, verbose)
return bits, get_js_decoder(charset, canonical_table, text_var, bitarray_var)
return bits, get_js_decoder(charset, canonical_table, bitarray_var, text_var)
2 changes: 0 additions & 2 deletions ztml/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,6 @@
text = all_chars
if encoding == 'utf8':
text = ''.join(c for c in text if ord(c) < bwt_mtf.surrogate_lo or ord(c) > bwt_mtf.surrogate_hi)
if raw:
text = ''.join(c for c in text if c not in ['\0', '\r'])
if mtf is not None:
text = ''.join(c for c in text if ord(c) <= bwt_mtf.max_ord_for_mtf)
with open(input_filename, 'wb') as f:
Expand Down
2 changes: 1 addition & 1 deletion ztml/text_prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def encode_with_fallbacks(text: str,
if the_fallback:
if theless == text:
the = False
if the and ' ' in text:
if the and regex.search('(^| ) ', text, regex.MULTILINE):
the = False
if verbose:
print(f'Falling back to the={the}', file=sys.stderr)
Expand Down
61 changes: 32 additions & 29 deletions ztml/validation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from base64 import b64decode
from contextlib import ExitStack, redirect_stdout
import html
import os
import sys
from tempfile import NamedTemporaryFile
Expand Down Expand Up @@ -79,13 +78,14 @@ def get_browser(browser: BrowserType, stack: Optional[ExitStack] = None) -> WebD


def render_html(file: AnyStr,
browser: BrowserType = default_browser,
timeout: int = default_timeout,
by: str = default_by,
element: str = default_element,
raw: bool = False,
image: bool = False,
browser: BrowserType = default_browser,
timeout: int = default_timeout,
bytearray_var: str = default_vars.bytearray,
content_var: str = default_vars.content
) -> Optional[AnyStr]:
if not by:
by = default_by
Expand Down Expand Up @@ -119,15 +119,11 @@ def render_html(file: AnyStr,
image_data = [v for k, v in sorted(image_data.items(), key=lambda x: int(x[0]))]
return bytes(image_data)
if raw:
by = By.TAG_NAME
element = 'body'
sleep(1)
sleep(0.1)
wait.until(lambda x: x.find_element(by, element).text)
text_property = 'innerHTML' if raw else 'innerText'
out = browser.find_element(by, element).get_property(text_property)
if raw:
out = html.unescape(out)
return out
return browser.execute_script(f'return {content_var}')
return browser.find_element(by, element).get_property('innerText')
except TimeoutException:
return None
except Exception:
Expand All @@ -152,18 +148,19 @@ def find_first_diff(render: AnyStr, data: AnyStr, verbose: bool = True) -> int:
def validate_html(file: AnyStr,
data: AnyStr,
caps: str = text_prep.default_caps,
ignore_regex: str = '',
unicode_A: int = 0,
browser: BrowserType = default_browser,
timeout: int = default_timeout,
by: str = default_by,
element: str = default_element,
raw: bool = False,
image: bool = False,
browser: BrowserType = default_browser,
timeout: int = default_timeout,
ignore_regex: str = '',
unicode_A: int = 0,
bytearray_var: str = default_vars.bytearray,
content_var: str = default_vars.content,
verbose: bool = True
) -> Optional[bool]:
render = render_html(file, browser, timeout, by, element, raw, image, bytearray_var)
render = render_html(file, by, element, raw, image, browser, timeout, bytearray_var, content_var)
if render is None:
return None
if not image:
Expand All @@ -173,6 +170,7 @@ def validate_html(file: AnyStr,
data = data.upper()
elif caps == 'simple':
data = text_prep.decode_caps_simple(data.lower())
if not image and not raw:
render = regex.sub(ignore_regex, '', render)
if unicode_A:
render = regex.sub('[^\\p{Z}\\p{C}]', lambda m: chr(ord(m[0]) - unicode_A + 65 + (6 if ord(m[0]) - unicode_A + 65 > 90 else 0)), render)
Expand All @@ -189,42 +187,44 @@ def validate_files(filenames: Mapping[str, str],
unix_newline: bool = True,
fix_punct: bool = False,
caps: str = text_prep.default_caps,
ignore_regex: str = '',
unicode_A: int = 0,
by: str = default_by,
element: str = default_element,
raw: bool = False,
image: bool = False,
bytearray_var: str = default_vars.bytearray,
browsers: Optional[Union[BrowserType, Iterable[BrowserType]]] = None,
timeout: int = default_timeout,
ignore_regex: str = '',
unicode_A: int = 0,
payload_var: str = default_vars.payload,
bytearray_var: str = default_vars.bytearray,
content_var: str = default_vars.content,
validate: bool = True,
verbose: bool = True
) -> None:
) -> bool:
error = False
if browsers is None:
browsers = list(drivers)
elif isinstance(browsers, (str, WebDriver)):
browsers = [browsers]
with ExitStack() as stack:
browsers = [get_browser(browser, stack) for browser in browsers]
if validate:
browsers = [get_browser(browser, stack) for browser in browsers]
raw_size = None
base64_size = None
for label, filename in filenames.items():
for label, filename in sorted(filenames.items(), key=lambda x: (x[0] != 'raw', x[0] != 'base64_html')):
ext = os.path.splitext(filename)[-1][1:]
if raw_size is not None and ext != 'html' or not os.path.exists(filename):
continue
size = os.path.getsize(filename)
if data is None:
assert ext != 'html', filename
if ext.lower() in ['bmp', 'gif', 'jpeg', 'jpg', 'png', 'webp']:
image = True
with open(filename, 'rb') as f:
data = f.read()
if not image:
data = text_prep.normalize(data.decode(), reduce_whitespace, unix_newline, fix_punct) # Assumes first text file is utf8. Otherwise, you can pass the text argument
if raw_size is None:
raw_size = size if ext != 'html' else len(data.encode())
raw_size = size if label == 'raw' else len(data.encode())
if label == 'base64_html':
base64_size = size * 3 / 4
if verbose:
Expand All @@ -233,7 +233,7 @@ def validate_files(filenames: Mapping[str, str],
stats.append(f'ratio={round(size / raw_size * 100, 1)}%')
if base64_size:
stats.append(f'overhead={round((size/base64_size-1) * 100, 1)}%')
if ext == 'html' and label != 'base64_html':
if ext == 'html' and label not in ['raw', 'base64_html']:
with open(filename, 'rb') as f:
script = f.read()
script = script.replace(max(regex.finditer(webify.get_literals_regex(payload_var).encode(), script),
Expand All @@ -248,14 +248,16 @@ def validate_files(filenames: Mapping[str, str],
kb = size / 1024
if kb >= 0.1:
stats = f' = {round(kb, 1):,} kB' + stats
print(f"{full_path(filename)} {size:,} B{stats}", end='' if validate and ext == 'html' else None, file=sys.stderr)
if validate and ext == 'html':
print(f"{full_path(filename)} {size:,} B{stats}", end='' if validate and ext == 'html' and label != 'raw' else None, file=sys.stderr)
if validate and ext == 'html' and label != 'raw':
for i, browser in enumerate(browsers):
start_time = time()
valid = validate_html(filename, data, caps, ignore_regex, unicode_A,
browser, timeout, by, element, raw, image,
bytearray_var, verbose)
valid = validate_html(filename, data, caps, by, element, raw, image,
browser, timeout, ignore_regex, unicode_A,
bytearray_var, content_var, verbose)
assert valid is not False, filename
if not valid:
error = True
if verbose:
if not i:
print(f' rendering secs:', end='', file=sys.stderr)
Expand All @@ -264,3 +266,4 @@ def validate_files(filenames: Mapping[str, str],
print(file=sys.stderr)
if verbose and validate:
print('Note: above rendering times from Selenium are much longer than actual browser rendering.', file=sys.stderr)
return error
Loading

0 comments on commit 2ca0d7c

Please sign in to comment.