Skip to content

Commit

Permalink
working on html compression
Browse files Browse the repository at this point in the history
  • Loading branch information
eyaler committed Oct 6, 2022
1 parent 364d133 commit 2ca0d7c
Show file tree
Hide file tree
Showing 12 changed files with 81 additions and 66 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ The pipeline includes efficient binary-to-text alternatives to Base64 which are
| Project Gutenberg plain text utf8 | txt | 63.7 kB | 3.2 MB |
| [paq8px_v206fix1](http://www.mattmahoney.net/dc/text.html#1250) -12RT (excluding decoder) | paq | 13.3 kB (21%) | 575 kB (18%) |
| 7-Zip 22.01 9 Ultra PPMd (excluding decoder) | 7z | 20.8 kB (32%) | 746 kB (23%) |
| 7-Zip 22.01 9 Ultra PPMd (self extracting) | exe | 232 kB (364%) | 958 kB (29%) |
| 7-Zip 22.01 9 Ultra PPMd (self-extracting) | exe | 232 kB (364%) | 958 kB (29%) |
| [Roadroller](https://github.com/lifthrasiir/roadroller) 2.1.0 -O2 | js | 26.5 kB (42%) | 1.0 MB (30%) |
| **ZTML Base125** | html (utf8) | 26.5 kB (42%) `mtf=0` | 916 kB (28%) `mtf=80` |
| **ZTML crEnc** | html (cp1252) | 23.8 kB (37%) `mtf=0` | 818 kB (25%) `mtf=80` |
Expand All @@ -42,6 +42,8 @@ and [example_image.py](example_image.py) for an inline image encoding example.
Outputs of these runs can be accessed at [eyalgruss.com/ztml](https://eyalgruss.com/ztml).
On top of the built-in validations for Chrome, Edge and Firefox, these were also manually tested on macOS Monterey 12.5 Safari 15.6 and iOS 16.0 Safari.

A quick and dirty way to compress an existing single-page website with embedded inline media is to use `raw=True`.

### Caveats
1. Files larger than a few MB might not work on [iOS Safari](https://pqina.nl/blog/canvas-area-exceeds-the-maximum-limit) or [macOS Safari 15](https://bugs.webkit.org/show_bug.cgi?id=230855).
2. This solution favors compression rate over compression and decompression times. Use `mtf=None` for faster decompression of large files.
Expand All @@ -50,7 +52,7 @@ On top of the built-in validations for Chrome, Edge and Firefox, these were also
### ZTML pipeline breakdown
1. [Text normalization](ztml/text_prep.py) (irreversible; reduce whitespace, substitute unicode punctuation)
2. [Text condensation](ztml/text_prep.py) (reversible; lowercase with automatic capitalization, substitute common strings as: the, qu)
3. [Burrows–Wheeler + Move-to-front transforms](ztml/bwt_mtf.py) on text with some optional variants, including some new ones (beneficial for large texts)
3. [Burrows–Wheeler + Move-to-front transforms](ztml/bwt_mtf.py) on text with some optional variants, including some new ones (beneficial for large texts with higher mtf settings)
4. [Huffman encoding](ztml/huffman.py) (with a [codebook-free decoder](https://researchgate.net/publication/3159499_On_the_implementation_of_minimum_redundancy_prefix_codes), beneficial even as followed by DEFLATE)
5. [Burrows–Wheeler transform](ztml/bwt_mtf.py) on bits (beneficial for large texts)
6. [PNG / DEFLATE compression](ztml/deflate.py) (allowing [native decompression](https://web.archive.org/web/20090220141811/http://blog.nihilogic.dk/2008/05/compression-using-canvas-and-png.html
Expand Down
2 changes: 1 addition & 1 deletion TODO.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Todo

### Usability
- Support encoding video/audio
- Support encoding video/audio/fonts/...
- Support encoding multiple media elements
- Linux installation instructions / Enable validation in Google Colab
- Make into a PIP library and start doing versioning
Expand Down
17 changes: 10 additions & 7 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@
books = [30123, 2600]
mtf_variants = [0, 80]
output_folder = 'output'
skip_exists = True
skip_download_exists = True
element_id = ''


assert len(books) == len(mtf_variants)
error = False
for item, mtf in zip(books, mtf_variants):
item_start_time = time()
filenames = dict(raw=f'{item}.txt',
Expand All @@ -27,7 +28,7 @@
filenames = {k: os.path.join(output_folder, v) for k, v in filenames.items()}

# If missing, download an example file from the web
if not skip_exists or not os.path.exists(filenames['raw']):
if not skip_download_exists or not os.path.exists(filenames['raw']):
from gutenberg.acquire.text import load_etext
with open(filenames['raw'], 'wb') as f:
f.write(load_etext(item).encode())
Expand All @@ -37,16 +38,18 @@

cnt = 0
for label, filename in filenames.items():
ext = os.path.splitext(filename)[-1][1:]
if ext not in ['js', 'html']:
if label == 'raw':
continue
file = ztml.ztml(data, filename, mtf=mtf, bin2txt=label.split('_', 1)[0], element_id=element_id)
file = ztml.ztml(data, filename, mtf=mtf, bin2txt=label.rsplit('_', 1)[0], element_id=element_id)
cnt += 1

print(f'{cnt} encodings of {item} took {(time()-item_start_time) / 60 :.1f} min.')

# Compare file sizes and validate data is recovered
validation.validate_files(filenames, by='id' if element_id else '', element=element_id)
error |= validation.validate_files(filenames, by='id' if element_id else '', element=element_id)
print()

print(f'Total of {len(books)} books took {(time()-start_time) / 60 :.1f} min.')
if error:
print('Error: some renderings timed out')
else:
print(f'Total of {len(books)} books took {(time()-start_time) / 60 :.1f} min.')
17 changes: 10 additions & 7 deletions example_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@
'http://wiesmann.codiferes.net/share/bitmaps/test_pattern.webp'
]
output_folder = 'output'
skip_exists = True
skip_download_exists = True
element_id = ''


error = False
for url in image_urls:
item_start_time = time()
item = url.rsplit('/', 1)[-1]
Expand All @@ -32,7 +33,7 @@
filenames = {k: os.path.join(output_folder, v) for k, v in filenames.items()}

# If missing, download an example file from the web
if not skip_exists or not os.path.exists(filenames['raw']):
if not skip_download_exists or not os.path.exists(filenames['raw']):
with urlopen(url) as fin, open(filenames['raw'], 'wb') as fout:
fout.write(fin.read())

Expand All @@ -41,16 +42,18 @@

cnt = 0
for label, filename in filenames.items():
ext = os.path.splitext(filename)[-1][1:]
if ext not in ['js', 'html']:
if label == 'raw':
continue
file = ztml.ztml(data, filename, bin2txt=label.split('_', 1)[0], element_id=element_id, image=True)
file = ztml.ztml(data, filename, bin2txt=label.rsplit('_', 1)[0], element_id=element_id, image=True)
cnt += 1

print(f'{cnt} encodings of {item} took {(time()-item_start_time) / 60 :.1f} min.')

# Compare file sizes and validate data is recovered
validation.validate_files(filenames, by='id' if element_id else '', element=element_id, image=True)
error |= validation.validate_files(filenames, by='id' if element_id else '', element=element_id, image=True)
print()

print(f'Total of {len(image_urls)} images took {(time()-start_time) / 60 :.1f} min.')
if error:
print('Error: some renderings timed out')
else:
print(f'Total of {len(image_urls)} images took {(time()-start_time) / 60 :.1f} min.')
2 changes: 1 addition & 1 deletion misc/minibook.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@


with urlopen('https://xem.github.io/miniBook/example') as f:
out, result = ztml.ztml(f.read(), f'index.html', mtf=80, raw=True, validate=True, ignore_regex='</xmp>')
out, result = ztml.ztml(f.read(), f'index.html', mtf=80, raw=True, validate=True)
print(f'{len(out):,} B')
assert not result
9 changes: 6 additions & 3 deletions ztml/bwt_mtf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
"""Burrows-Wheeler and Move-to-front transforms
Implementation follows pydivsufsort tests, to unnecessitate adding an EOF token.
Applies alphabet reordering by default to concentrate the vowels together.
BWT implementation follows pydivsufsort tests, to obviate adding an EOF token.
MTF includes new variants (50-90), where larger texts benefit from higher mtf settings.
Additional BWT on bits (after entropy coding) was found beneficial for large texts.
References:
https://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf
Expand All @@ -22,10 +25,10 @@
from . import default_vars


mtf_variants = [None, 0, 1, 2, 50, 52, 60, 70, 80, 90]
default_mtf = 0
order1 = 'AOUIEVWXYZaouievwxyz'
order2 = 'VWXYZAOUIEvwxyzaouie'
mtf_variants = [None, 0, 1, 2, 50, 52, 60, 70, 80, 90]
default_mtf = 0


reorder_table = str.maketrans(order1, order2)
Expand Down
6 changes: 3 additions & 3 deletions ztml/default_vars.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
bytearray = 'o'
bitarray = 'b'
bwt_func = '$'
bitarray = 'b'
bytearray = 'o'
content = 't'
image = 'i'
payload = 's'
content = 't'
6 changes: 3 additions & 3 deletions ztml/huffman.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ def encode(text: str,

def get_js_decoder(charset: str,
canonical_table: str,
text_var: str = default_vars.content,
bitarray_var: str = default_vars.bitarray,
text_var: str = default_vars.content,
) -> str:
charset = charset.replace('\\', '\\\\').replace('\0', '\\0').replace('\n', '\\n').replace('\r', '\\r').replace("'", "\\'")
return f'''s=[...'{charset}']
Expand All @@ -76,10 +76,10 @@ def get_js_decoder(charset: str,


def encode_and_get_js_decoder(text: str,
text_var: str = default_vars.content,
bitarray_var: str = default_vars.bitarray,
text_var: str = default_vars.content,
validate: bool = True,
verbose: bool = False
) -> Tuple[List[int], str]:
bits, charset, canonical_table, _ = encode(text, validate, verbose)
return bits, get_js_decoder(charset, canonical_table, text_var, bitarray_var)
return bits, get_js_decoder(charset, canonical_table, bitarray_var, text_var)
2 changes: 0 additions & 2 deletions ztml/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,6 @@
text = all_chars
if encoding == 'utf8':
text = ''.join(c for c in text if ord(c) < bwt_mtf.surrogate_lo or ord(c) > bwt_mtf.surrogate_hi)
if raw:
text = ''.join(c for c in text if c not in ['\0', '\r'])
if mtf is not None:
text = ''.join(c for c in text if ord(c) <= bwt_mtf.max_ord_for_mtf)
with open(input_filename, 'wb') as f:
Expand Down
2 changes: 1 addition & 1 deletion ztml/text_prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def encode_with_fallbacks(text: str,
if the_fallback:
if theless == text:
the = False
if the and ' ' in text:
if the and regex.search('(^| ) ', text, regex.MULTILINE):
the = False
if verbose:
print(f'Falling back to the={the}', file=sys.stderr)
Expand Down
61 changes: 32 additions & 29 deletions ztml/validation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from base64 import b64decode
from contextlib import ExitStack, redirect_stdout
import html
import os
import sys
from tempfile import NamedTemporaryFile
Expand Down Expand Up @@ -79,13 +78,14 @@ def get_browser(browser: BrowserType, stack: Optional[ExitStack] = None) -> WebD


def render_html(file: AnyStr,
browser: BrowserType = default_browser,
timeout: int = default_timeout,
by: str = default_by,
element: str = default_element,
raw: bool = False,
image: bool = False,
browser: BrowserType = default_browser,
timeout: int = default_timeout,
bytearray_var: str = default_vars.bytearray,
content_var: str = default_vars.content
) -> Optional[AnyStr]:
if not by:
by = default_by
Expand Down Expand Up @@ -119,15 +119,11 @@ def render_html(file: AnyStr,
image_data = [v for k, v in sorted(image_data.items(), key=lambda x: int(x[0]))]
return bytes(image_data)
if raw:
by = By.TAG_NAME
element = 'body'
sleep(1)
sleep(0.1)
wait.until(lambda x: x.find_element(by, element).text)
text_property = 'innerHTML' if raw else 'innerText'
out = browser.find_element(by, element).get_property(text_property)
if raw:
out = html.unescape(out)
return out
return browser.execute_script(f'return {content_var}')
return browser.find_element(by, element).get_property('innerText')
except TimeoutException:
return None
except Exception:
Expand All @@ -152,18 +148,19 @@ def find_first_diff(render: AnyStr, data: AnyStr, verbose: bool = True) -> int:
def validate_html(file: AnyStr,
data: AnyStr,
caps: str = text_prep.default_caps,
ignore_regex: str = '',
unicode_A: int = 0,
browser: BrowserType = default_browser,
timeout: int = default_timeout,
by: str = default_by,
element: str = default_element,
raw: bool = False,
image: bool = False,
browser: BrowserType = default_browser,
timeout: int = default_timeout,
ignore_regex: str = '',
unicode_A: int = 0,
bytearray_var: str = default_vars.bytearray,
content_var: str = default_vars.content,
verbose: bool = True
) -> Optional[bool]:
render = render_html(file, browser, timeout, by, element, raw, image, bytearray_var)
render = render_html(file, by, element, raw, image, browser, timeout, bytearray_var, content_var)
if render is None:
return None
if not image:
Expand All @@ -173,6 +170,7 @@ def validate_html(file: AnyStr,
data = data.upper()
elif caps == 'simple':
data = text_prep.decode_caps_simple(data.lower())
if not image and not raw:
render = regex.sub(ignore_regex, '', render)
if unicode_A:
render = regex.sub('[^\\p{Z}\\p{C}]', lambda m: chr(ord(m[0]) - unicode_A + 65 + (6 if ord(m[0]) - unicode_A + 65 > 90 else 0)), render)
Expand All @@ -189,42 +187,44 @@ def validate_files(filenames: Mapping[str, str],
unix_newline: bool = True,
fix_punct: bool = False,
caps: str = text_prep.default_caps,
ignore_regex: str = '',
unicode_A: int = 0,
by: str = default_by,
element: str = default_element,
raw: bool = False,
image: bool = False,
bytearray_var: str = default_vars.bytearray,
browsers: Optional[Union[BrowserType, Iterable[BrowserType]]] = None,
timeout: int = default_timeout,
ignore_regex: str = '',
unicode_A: int = 0,
payload_var: str = default_vars.payload,
bytearray_var: str = default_vars.bytearray,
content_var: str = default_vars.content,
validate: bool = True,
verbose: bool = True
) -> None:
) -> bool:
error = False
if browsers is None:
browsers = list(drivers)
elif isinstance(browsers, (str, WebDriver)):
browsers = [browsers]
with ExitStack() as stack:
browsers = [get_browser(browser, stack) for browser in browsers]
if validate:
browsers = [get_browser(browser, stack) for browser in browsers]
raw_size = None
base64_size = None
for label, filename in filenames.items():
for label, filename in sorted(filenames.items(), key=lambda x: (x[0] != 'raw', x[0] != 'base64_html')):
ext = os.path.splitext(filename)[-1][1:]
if raw_size is not None and ext != 'html' or not os.path.exists(filename):
continue
size = os.path.getsize(filename)
if data is None:
assert ext != 'html', filename
if ext.lower() in ['bmp', 'gif', 'jpeg', 'jpg', 'png', 'webp']:
image = True
with open(filename, 'rb') as f:
data = f.read()
if not image:
data = text_prep.normalize(data.decode(), reduce_whitespace, unix_newline, fix_punct) # Assumes first text file is utf8. Otherwise, you can pass the text argument
if raw_size is None:
raw_size = size if ext != 'html' else len(data.encode())
raw_size = size if label == 'raw' else len(data.encode())
if label == 'base64_html':
base64_size = size * 3 / 4
if verbose:
Expand All @@ -233,7 +233,7 @@ def validate_files(filenames: Mapping[str, str],
stats.append(f'ratio={round(size / raw_size * 100, 1)}%')
if base64_size:
stats.append(f'overhead={round((size/base64_size-1) * 100, 1)}%')
if ext == 'html' and label != 'base64_html':
if ext == 'html' and label not in ['raw', 'base64_html']:
with open(filename, 'rb') as f:
script = f.read()
script = script.replace(max(regex.finditer(webify.get_literals_regex(payload_var).encode(), script),
Expand All @@ -248,14 +248,16 @@ def validate_files(filenames: Mapping[str, str],
kb = size / 1024
if kb >= 0.1:
stats = f' = {round(kb, 1):,} kB' + stats
print(f"{full_path(filename)} {size:,} B{stats}", end='' if validate and ext == 'html' else None, file=sys.stderr)
if validate and ext == 'html':
print(f"{full_path(filename)} {size:,} B{stats}", end='' if validate and ext == 'html' and label != 'raw' else None, file=sys.stderr)
if validate and ext == 'html' and label != 'raw':
for i, browser in enumerate(browsers):
start_time = time()
valid = validate_html(filename, data, caps, ignore_regex, unicode_A,
browser, timeout, by, element, raw, image,
bytearray_var, verbose)
valid = validate_html(filename, data, caps, by, element, raw, image,
browser, timeout, ignore_regex, unicode_A,
bytearray_var, content_var, verbose)
assert valid is not False, filename
if not valid:
error = True
if verbose:
if not i:
print(f' rendering secs:', end='', file=sys.stderr)
Expand All @@ -264,3 +266,4 @@ def validate_files(filenames: Mapping[str, str],
print(file=sys.stderr)
if verbose and validate:
print('Note: above rendering times from Selenium are much longer than actual browser rendering.', file=sys.stderr)
return error
Loading

0 comments on commit 2ca0d7c

Please sign in to comment.