micropython: add micropython component
components/language/micropython/py/makecompresseddata.py (new file, 205 lines)
@@ -0,0 +1,205 @@
from __future__ import print_function

import collections
import re
import sys

import gzip
import zlib


_COMPRESSED_MARKER = 0xFF
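# (The marker above is prepended, as the octal escape \377, to each compressed
# message emitted via MP_MATCH_COMPRESSED() in main() below; presumably this is
# how the runtime tells compressed messages apart from uncompressed ones.)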


def check_non_ascii(msg):
    for c in msg:
        if ord(c) >= 0x80:
            print(
                'Unable to generate compressed data: message "{}" contains a non-ascii character "{}".'.format(
                    msg, c
                ),
                file=sys.stderr,
            )
            sys.exit(1)


# Replace <char><space> with <char | 0x80>.
# Trivial scheme to demo/test.
def space_compression(error_strings):
    for line in error_strings:
        check_non_ascii(line)
        result = ""
        for i in range(len(line)):
            if i > 0 and line[i] == " ":
                result = result[:-1]
                result += "\\{:03o}".format(ord(line[i - 1]))
            else:
                result += line[i]
        error_strings[line] = result
    return None
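
# For example, space_compression() turns the (hypothetical) message "bad input"
# into 'ba\144input': the space is dropped and the preceding 'd' is re-emitted
# as the octal escape \144, so each space saves one byte in the compiled string.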

# Replace common words with <0x80 | index>.
# Index is into a table of words stored as aaaaa<0x80|a>bbb<0x80|b>...
# Replaced words are assumed to have spaces either side to avoid having to store the spaces in the compressed strings.
def word_compression(error_strings):
    topn = collections.Counter()

    for line in error_strings.keys():
        check_non_ascii(line)
        for word in line.split(" "):
            topn[word] += 1

    # Order not just by frequency, but by expected saving. i.e. prefer a longer string that is used less frequently.
    # Use the word itself for ties so that compression is deterministic.
    def bytes_saved(item):
        w, n = item
        return -((len(w) + 1) * (n - 1)), w

    top128 = sorted(topn.items(), key=bytes_saved)[:128]

    index = [w for w, _ in top128]
    index_lookup = {w: i for i, w in enumerate(index)}

    for line in error_strings.keys():
        result = ""
        need_space = False
        for word in line.split(" "):
            if word in index_lookup:
                result += "\\{:03o}".format(0b10000000 | index_lookup[word])
                need_space = False
            else:
                if need_space:
                    result += " "
                need_space = True
                result += word
        error_strings[line] = result.strip()

    return "".join(w[:-1] + "\\{:03o}".format(0b10000000 | ord(w[-1])) for w in index)


# Replace chars in text with variable length bit sequence.
# For comparison only (the table is not emitted).
def huffman_compression(error_strings):
    # https://github.com/tannewt/huffman
    import huffman

    all_strings = "".join(error_strings)
    cb = huffman.codebook(collections.Counter(all_strings).items())

    for line in error_strings:
        b = "1"
        for c in line:
            b += cb[c]
        n = len(b)
        if n % 8 != 0:
            n += 8 - (n % 8)
        result = ""
        for i in range(0, n, 8):
            result += "\\{:03o}".format(int(b[i : i + 8], 2))
        if len(result) > len(line) * 4:
            result = line
        error_strings[line] = result

    # TODO: This would be the prefix lengths and the table ordering.
    return "_" * (10 + len(cb))


# Replace common N-letter sequences with <0x80 | index>, where
# the common sequences are stored in a separate table.
# This isn't very useful, need a smarter way to find top-ngrams.
def ngram_compression(error_strings):
    topn = collections.Counter()
    N = 2

    for line in error_strings.keys():
        check_non_ascii(line)
        if len(line) < N:
            continue
        for i in range(0, len(line) - N, N):
            topn[line[i : i + N]] += 1

    def bytes_saved(item):
        w, n = item
        return -(len(w) * (n - 1))

    top128 = sorted(topn.items(), key=bytes_saved)[:128]

    index = [w for w, _ in top128]
    index_lookup = {w: i for i, w in enumerate(index)}

    for line in error_strings.keys():
        result = ""
        for i in range(0, len(line) - N + 1, N):
            word = line[i : i + N]
            if word in index_lookup:
                result += "\\{:03o}".format(0b10000000 | index_lookup[word])
            else:
                result += word
        if len(line) % N != 0:
            result += line[len(line) - len(line) % N :]
        error_strings[line] = result.strip()

    return "".join(index)


def main(collected_path, fn):
    error_strings = collections.OrderedDict()
    max_uncompressed_len = 0
    num_uses = 0

    # Read in all MP_ERROR_TEXT strings.
    with open(collected_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            num_uses += 1
            error_strings[line] = None
            max_uncompressed_len = max(max_uncompressed_len, len(line))

    # So that objexcept.c can figure out how big the buffer needs to be.
    print("#define MP_MAX_UNCOMPRESSED_TEXT_LEN ({})".format(max_uncompressed_len))

    # Run the compression.
    compressed_data = fn(error_strings)

    # Print the data table.
    print('MP_COMPRESSED_DATA("{}")'.format(compressed_data))

    # Print the replacements.
    for uncomp, comp in error_strings.items():
        if uncomp == comp:
            prefix = ""
        else:
            prefix = "\\{:03o}".format(_COMPRESSED_MARKER)
        print('MP_MATCH_COMPRESSED("{}", "{}{}")'.format(uncomp, prefix, comp))

    # Used to calculate the "true" length of the (escaped) compressed strings.
    def unescape(s):
        return re.sub(r"\\\d\d\d", "!", s)

    # Stats. Note this doesn't include the cost of the decompressor code.
    uncomp_len = sum(len(s) + 1 for s in error_strings.keys())
    comp_len = sum(1 + len(unescape(s)) + 1 for s in error_strings.values())
    data_len = len(compressed_data) + 1 if compressed_data else 0
    print("// Total input length: {}".format(uncomp_len))
    print("// Total compressed length: {}".format(comp_len))
    print("// Total data length: {}".format(data_len))
    print("// Predicted saving: {}".format(uncomp_len - comp_len - data_len))

    # Somewhat meaningless comparison to zlib/gzip.
    all_input_bytes = "\\0".join(error_strings.keys()).encode()
    print()
    if hasattr(gzip, "compress"):
        gzip_len = len(gzip.compress(all_input_bytes)) + num_uses * 4
        print("// gzip length: {}".format(gzip_len))
        print("// Percentage of gzip: {:.1f}%".format(100 * (comp_len + data_len) / gzip_len))
    if hasattr(zlib, "compress"):
        zlib_len = len(zlib.compress(all_input_bytes)) + num_uses * 4
        print("// zlib length: {}".format(zlib_len))
        print("// Percentage of zlib: {:.1f}%".format(100 * (comp_len + data_len) / zlib_len))


if __name__ == "__main__":
    main(sys.argv[1], word_compression)
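
# Illustrative usage (the paths below are assumptions, not part of this file):
# the build first collects every MP_ERROR_TEXT() message into one text file,
# one message per line, then runs something like
#     python makecompresseddata.py build/compressed.collected > build/compressed.data.h
# to generate a header of MP_COMPRESSED_DATA(...) / MP_MATCH_COMPRESSED(...)
# lines, with the size statistics appended as // comments.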