-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbase125.py
163 lines (139 loc) · 5.52 KB
/
base125.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
"""Base125 encoding based on Base122 and optimized for inline HTML / JS text compression
If we must use utf8 encoding for HTML or JS, crEnc will not work.
Instead, we can use this unnecessarily optimized version of the variable length Base122.
The original byte stream is split into 7 bit chunks,
which are encoded as a single byte: 0xxxxxxx, to comply with utf8 code point scheme.
We only use 125 byte values out of 128 (excluding CR, backslash and backtick)
and encode the remaining three in a double byte scheme: 110ssxxx 10xxxxxx,
where ss is 01, 10 or 11, and 9 bits are left for next data.
Alternatively, if the excluded value is the final 7-bit chunk, it is instead encoded as: 1100010x 10xxxxxx.
We then embed the result in a JS template literal (between backticks), after escaping ${ with a backslash.
An optimal overall offset can be added to minimize escaping as suggested in dynEncode.
The decoder further takes care of HTML character override for NUL.
A minimalistic JS decoder code is generated.
The overhead is ~ 8/log2(125)-1 ~ 15% (compared to 33.3% for Base64).
References:
https://blog.kevinalbs.com/base122
https://github.com/kevinAlbs/Base122
https://github.com/eshaz/simple-yenc#what-is-dynencode
"""
from typing import Optional, Tuple
if not __package__:
import default_vars
else:
from . import default_vars
# 7-bit values that are never written as a single output byte: CR (13),
# backslash (92) and backtick (96). The list position (1..3) is the selector
# stored in the two-byte sequence that replaces such a value; index 0 is a
# placeholder because selector 0 is the form used for a final 7-bit chunk.
# The list is also rendered into the generated JS decoder, where the empty
# first entry becomes an array hole.
illegal = ['', 13, 92, 96]
def encode(data: bytes, offset: int = 0, validate: bool = True) -> bytes:
    """Encode *data* as a Base125 stream ready for a JS template literal.

    The input is read as a bit stream, 7 bits at a time. A 7-bit value that
    is not in ``illegal`` is written as one byte. A value that is in
    ``illegal`` is written as a two-byte UTF-8 sequence that carries its
    selector plus the next 9 bits of input, or, when no input is left, the
    value itself in the ``1100010x 10xxxxxx`` form.

    Args:
        data: Raw bytes to encode.
        offset: Added to every input byte (modulo 256) before chunking.
        validate: When true, decode the result and compare it with *data*.

    Returns:
        The encoded stream with every ``${`` preceded by a backslash. The
        object returned is the bytearray produced by ``bytearray.replace``.

    Raises:
        AssertionError: If *validate* is true and the round trip differs.
    """
    cur_index = 0
    # Bits of data[cur_index] already consumed. It is 8 right after a 9-bit
    # read that started at bit 7: the current byte is then fully used and the
    # next 7-bit read moves on (emitting a zero chunk if it was the last byte).
    cur_bit = 0
    out = bytearray()
    # Value -> selector lookup, so the common "not illegal" case is a plain
    # dict miss instead of a raised and caught ValueError per chunk.
    selector_of = {value: position for position, value in enumerate(illegal)}

    def get_bits(length: int) -> Optional[int]:
        """Return the next 7 or 9 input bits, or None when no input is left."""
        nonlocal cur_index, cur_bit
        if cur_index >= len(data):
            return None
        # Mask off the consumed bits and left-align the rest within 8 bits,
        # then align that to a 7 or 9 bit chunk.
        current = (data[cur_index] + offset) & 255
        first_part = ((255 >> cur_bit) & current) << cur_bit
        diff = 8 - length
        if diff > 0:
            first_part >>= diff
        else:
            first_part <<= -diff
        # Check if we need to go to the next byte for more bits
        cur_bit += length
        if cur_bit < 8:
            return first_part  # Do not need next byte
        cur_bit -= 8
        cur_index += 1
        # Now we want the top cur_bit bits of the next byte if it exists
        if cur_index >= len(data):
            return first_part
        following = (data[cur_index] + offset) & 255
        second_part = ((0xFF00 >> cur_bit) & following) >> (8 - cur_bit)
        return first_part | second_part

    while True:
        bits = get_bits(7)
        if bits is None:
            break
        selector = selector_of.get(bits)
        if selector is None:
            out.append(bits)
            continue
        # Two-byte character: take the next chunk of 9 bits along with it
        next_bits = get_bits(9)
        if next_bits is None:
            # Nothing left: store the illegal value itself under selector 0
            b1 = 4
            next_bits = bits
        else:
            b1 = selector << 3
        # Push first 3 bits onto first byte, remaining 6 onto second
        out.extend([192 | b1 | next_bits >> 6, 128 | next_bits & 63])

    if validate:
        decoded = decode(out, offset)
        # Raised explicitly so the check still runs under ``python -O``
        if decoded != data:
            raise AssertionError(
                (len(decoded), len(data), decoded[:30], data[:30]))
    return out.replace(b'${', b'\\${')
def optimize_encode(data: bytes,
                    validate: bool = True
                    ) -> Tuple[bytes, int, int]:
    """Encode *data* with the offset that yields the shortest output.

    Every offset from 0 to 255 is tried without validation; ties go to the
    smallest offset. The winning offset is then encoded once more, with
    *validate* passed through.

    Returns:
        A tuple ``(encoded, best_offset, saved)`` where ``saved`` is the
        number of output bytes gained compared with offset 0.
    """
    candidates = range(256)
    lengths = [len(encode(data, candidate, validate=False))
               for candidate in candidates]
    # min() keeps the first of several equal minima, i.e. the lowest offset
    best_offset = min(candidates, key=lengths.__getitem__)
    encoded = encode(data, best_offset, validate)
    return encoded, best_offset, lengths[0] - lengths[best_offset]
def decode(data: bytes, offset: int = 0) -> bytes:
    """Decode a Base125 stream back into the original bytes.

    Accepts both the raw stream and the escaped form returned by ``encode``
    (where ``${`` is preceded by a backslash).

    Args:
        data: UTF-8 encoded Base125 stream.
        offset: The offset that was used for encoding; it is subtracted
            (modulo 256) from every decoded byte.

    Returns:
        The decoded bytes, as a bytearray.
    """
    out = bytearray()
    next_byte = 0  # Accumulator for the output byte being assembled
    k = 0  # Bits currently held in next_byte

    def push_bits(bits: int, length: int = 7) -> None:
        """Feed a 7 or 9 bit chunk into the accumulator."""
        nonlocal next_byte, k
        # Align the chunk to 8 bits (7-bit: shift left by one, 9-bit: drop
        # the lowest bit), then shift it past the k bits already held.
        next_byte |= bits << (length < 8) >> k >> (length > 8)
        k += length
        if k > 7:
            out.append(((next_byte & 255) - offset) % 256)
            k -= 8
            # Keep the bits of this chunk that did not fit; anything above
            # bit 7 is discarded by the mask on the next append.
            next_byte = bits << 8 - k

    # A raw stream never contains a backslash (92 is one of the excluded
    # values), so this only removes the escaping that encode() adds.
    text = data.replace(b'\\${', b'${').decode()
    for char in text:
        code = ord(char)
        if code > 127:
            # Two-byte character: selector in the bits above the low 9
            ss = code >> 9
            if ss:
                push_bits(illegal[ss])
            # Selector 0 carries a final 7-bit value in the low bits of the
            # code point; shift it up to the top of the 9-bit chunk.
            push_bits(code << 2 * (not ss) & 511, 9)
        else:
            push_bits(code)
    return out
def get_js_decoder(data: bytes,
                   offset: Optional[int] = 0,
                   output_var: str = default_vars.bytearray,
                   validate: bool = True
                   ) -> bytes:
    """Return JS source that embeds *data* and decodes it at run time.

    The output assigns the encoded stream to ``s`` as a template literal,
    followed by a minimal decoder that leaves the decoded bytes in a
    ``Uint8Array`` named *output_var*.

    Args:
        data: Raw bytes to embed.
        offset: Byte offset passed to ``encode``. ``None`` searches for the
            best offset with ``optimize_encode``.
        output_var: Name of the JS variable that receives the decoded bytes.
        validate: Passed through to the encoder.

    Returns:
        The JS source, as bytes.
    """
    if offset is None:
        encoded, offset, _ = optimize_encode(data, validate)  # Time-consuming op.
    else:
        encoded = encode(data, offset, validate)
    illegal_str = ','.join(str(i) for i in illegal)
    # Term appended to ``n`` to undo the offset. It is formatted with an
    # explicit sign so a negative offset gives "+N" rather than a bare number
    # glued onto the identifier; zero needs no term at all.
    undo_offset = f'{-offset:+d}' if offset else ''
    last_part = f'''`
{output_var}=new Uint8Array(s.length*2)
j=k=n=0
p=(b,l=7)=>{{n|=b<<(l<8)>>k>>(l>8);k+=l;k>7&&({output_var}[j++]=n{undo_offset},k-=8,n=b<<8-k)}}
for(c of s)(i=c.charCodeAt()%65533)>127?(e=i>>9,e&&p([{illegal_str}][e]),p(i<<2*!e&511,9)):p(i)
{output_var}={output_var}.slice(0,j)
'''
    return b's=`' + encoded + last_part.encode()
def test() -> None:
    """Round-trip runs of NUL bytes followed by runs of CR, backslash or backtick.

    Every combination of 0..99 NULs and 0..99 filler bytes is encoded with
    validation enabled, first with offset 0 and then with offset 1.
    """
    fillers = (b'\r', b'\\', b'`')
    for nul_count in range(100):
        prefix = b'\0' * nul_count
        for fill_count in range(100):
            for shift in (0, 1):
                for filler in fillers:
                    encode(prefix + filler * fill_count,
                           offset=shift, validate=True)
# Run the round-trip self-test when the module is executed as a script.
if __name__ == '__main__':
    test()