From 3a81d00c314c17cea05d773931c9053848135f28 Mon Sep 17 00:00:00 2001
From: Jonathan Dieter
Date: Thu, 9 Aug 2018 13:42:24 +0200
Subject: [PATCH] Add contrib program to generate dictionary

Signed-off-by: Jonathan Dieter
---
 contrib/gen_xml_dictionary | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100755 contrib/gen_xml_dictionary

diff --git a/contrib/gen_xml_dictionary b/contrib/gen_xml_dictionary
new file mode 100755
index 0000000..e69bcac
--- /dev/null
+++ b/contrib/gen_xml_dictionary
@@ -0,0 +1,84 @@
+#!/usr/bin/python3
+"""Create a zstd dictionary from files that are split into chunks.
+
+Every input file is read, runs of 32 or more hex digits are removed from it,
+and the result is split on a caller-supplied string.  Each piece is written
+to its own sample file in a temporary directory, and "zstd --train" is then
+run over all of the samples to produce the dictionary.
+"""
+
+import re
+import sys
+import os.path
+import os
+import subprocess
+import argparse
+import shutil
+import tempfile
+
+# Match any run of 32 or more lowercase hex digits.
+CHECKSUM_REGEX = re.compile(r"[0-9a-f]{32,}")
+
+
+def parse_args():
+    """Parse the command line and return the resulting namespace."""
+    parser = argparse.ArgumentParser(description="Creates a zstd dictionary from a file that will be chunked")
+    parser.add_argument("split_string", help="String to use to split the file(s)")
+    parser.add_argument("file", nargs="+", help="File(s) to use to generate the dictionary")
+    parser.add_argument("-s", "--size", action="store", type=int, default=112640, help="Dictionary size")
+    return parser.parse_args()
+
+
+def dictionary_name(path):
+    """Return the dictionary file name derived from *path*.
+
+    The name is the part of the basename before its first "." plus ".dict".
+    If nothing precedes the first ".", the whole basename is used instead.
+    """
+    base = os.path.basename(path)
+    return (base.split(".")[0] or base) + ".dict"
+
+
+def write_samples(path, split_string, sample_dir):
+    """Split *path* on *split_string* and write one sample file per piece.
+
+    Samples are named "<basename>.<six-digit index>" inside *sample_dir*.
+    """
+    with open(path, 'r') as source:
+        data = CHECKSUM_REGEX.sub("", source.read())
+    base = os.path.basename(path)
+    for count, piece in enumerate(data.split(split_string)):
+        sample = os.path.join(sample_dir, "%s.%06i" % (base, count))
+        with open(sample, 'w') as out:
+            # str.split() drops the separator, so it is written back in front
+            # of every piece (including the first one).
+            out.write(split_string)
+            out.write(piece)
+
+
+def main():
+    """Build the dictionary and return the process exit status."""
+    args = parse_args()
+    # The dictionary is named after the first input file only; the name has
+    # no directory component.
+    dict_file = dictionary_name(args.file[0])
+    temp_dir = tempfile.mkdtemp()
+    try:
+        for fn in args.file:
+            write_samples(fn, args.split_string, temp_dir)
+        # os.listdir() order is arbitrary; sort so the sample order is stable.
+        samples = sorted(os.path.join(temp_dir, name) for name in os.listdir(temp_dir))
+        run_cmd = ["zstd", "--train"] + samples + ["-o", dict_file, "--maxdict=%i" % args.size]
+        # check=True is what makes a failing zstd raise CalledProcessError;
+        # without it the failure would be silently ignored.
+        subprocess.run(run_cmd, check=True)
+    except subprocess.CalledProcessError:
+        return 1
+    finally:
+        # Remove the samples whether or not dictionary training succeeded.
+        shutil.rmtree(temp_dir)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
-- 
2.30.2