-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
base125.py
160 lines (135 loc) · 5.54 KB
/
base125.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""Base125 encoding based on Base122 and optimized for inline HTML / JS text compression and image encoding
If we must use utf8 encoding for HTML or JS, crEnc will not work.
Instead, we can use this original and unnecessarily-optimized version of the variable length Base122.
The original byte stream is split into 7 bit chunks,
which are encoded as a single byte: 0xxxxxxx, to comply with utf8 code point scheme.
We only use 125 byte values out of 128 (excluding CR, backslash and `)
and encode the remaining three with a double byte scheme: 110ssxxx 10xxxxxx,
where ss is 01, 10 or 11, and 9 bits are left for next data.
Alternatively, if these are the final 7 bits, we instead encode as: 1100010x 10xxxxxx.
As, we embed in JS template literals quotes ``, we further escape ${ with backslash.
The overhead is ~ 8/7 * 253/256 + 16/11 * 3/256 - 1 ~ 14.7% (compared to 33.3% for Base64).
The decoder further takes care of HTML character override for NUL.
An optimal global character modular offset can be added to minimize escaping, similar to dynEncode (disabled by default).
A minimalistic JS decoder code is generated.
References:
https://en.wikipedia.org/wiki/Binary-to-text_encoding
https://blog.kevinalbs.com/base122
https://github.com/kevinAlbs/Base122
https://github.com/eshaz/simple-yenc#what-is-dynencode
"""
from typing import Optional, Tuple
if not __package__:
import default_vars
else:
# noinspection PyPackages
from . import default_vars
illegal = ['', 13, 92, 96]
def encode(data: bytes, offset: int = 0, validate: bool = True) -> bytes:
cur_index = 0
cur_bit = 0 # Points to current bit needed
out = bytearray()
# Get 7 or 9 bits of input data. Returns None if there is no input left
def get_bits(length: int) -> Optional[int]:
nonlocal cur_index, cur_bit
if cur_index >= len(data):
return None
# Shift, mask, unshift to get first part. Align it to a 7 or 9 bit chunk
first_part = (255>>cur_bit & data[cur_index]+offset & 255) << cur_bit
diff = 8 - length
if diff > 0:
first_part >>= diff
else:
first_part <<= -diff
# Check if we need to go to the next byte for more bits
cur_bit += length
if cur_bit < 8:
return first_part # Do not need next byte
cur_bit -= 8
cur_index += 1
# Now we want bits [0..cur_bit] of the next byte if it exists
if cur_index >= len(data):
return first_part
# Align it
second_part = (0xff00>>cur_bit & data[cur_index]+offset & 255) >> 8-cur_bit
return first_part | second_part
while True:
# Grab 7 bits
bits = get_bits(7)
if bits is None:
break
try:
illegal_index = illegal.index(bits)
# Since this will be a two-byte character, get the next chunk of 9 bits
next_bits = get_bits(9)
if next_bits is None:
b1 = 4
next_bits = bits
else:
b1 = illegal_index << 3
# Push first 3 bits onto first byte, remaining 6 onto second
out.extend([192 | b1 | next_bits>>6, 128 | next_bits&63])
except ValueError:
out.append(bits)
if validate:
decoded = decode(out, offset)
assert decoded == data, (len(decoded), len(data), decoded[:30], data[:30])
return out.replace(b'${', b'\\${')
def optimize_encode(data: bytes,
validate: bool = True
) -> Tuple[bytes, int, int]:
best_offset = 0
for offset in range(256):
length = len(encode(data, offset, validate=False))
if offset == 0:
best_length = length0 = length
if length < best_length:
best_length = length
best_offset = offset
out = encode(data, best_offset, validate)
return out, best_offset, length0 - best_length
def decode(data: bytes, offset: int = 0) -> bytes:
out = bytearray()
next_byte = 0
k = 0
def push_bits(bits: int, length: int = 7) -> None:
nonlocal next_byte, k
next_byte |= bits << (length < 8) >> k >> (length > 8)
k += length
if k > 7:
out.append((next_byte&255)-offset & 255)
k -= 8
next_byte = bits << 8-k
for byte in data.decode():
b = ord(byte)
if b > 127:
ss = b >> 9
if ss:
push_bits(illegal[ss])
push_bits(b<<2*(not ss) & 511, 9)
else:
push_bits(b)
return out
def get_js_decoder(data: bytes,
offset: Optional[int] = 0,
output_var: str = default_vars.bytearray,
validate: bool = True
) -> bytes:
if offset is None:
encoded, offset, saved = optimize_encode(data, validate) # Time-consuming op.
else:
encoded = encode(data, offset, validate)
illegal_str = ','.join(str(i) for i in illegal)
first_part = f'''k=n=0
p=(b,l=7)=>(n|=b<<(l<8)>>k>>(l>8),k+=l,k>7?(v=n{-offset or ''},k-=8,n=b<<8-k,v):[])
{output_var}=new Uint8Array([...`'''
last_part = f'`].flatMap(c=>(i=c.charCodeAt()%65533,i>127?(e=i>>9,[e?p([{illegal_str}][e]):[],p(i<<2*!e&511,9)].flat()):p(i))))\n'
return first_part.encode() + encoded + last_part.encode()
def test() -> None:
for i in range(100):
for j in range(100):
for offset in [0, 1]:
for symbol in [b'\r', b'\\', b'`']:
encode(b'\0'*i + symbol*j, offset, validate=True)
if __name__ == '__main__':
test()