From f5cb72f087c2482402826976d2e55943066a40f2 Mon Sep 17 00:00:00 2001 From: Stefano Babic Date: Fri, 16 Jul 2021 09:40:35 +0200 Subject: [PATCH] Add digest of uncompressed chunk to index The digest of the uncompressed chunk can be compared against the original data to detect whether a chunk is required, without having to convert the data to a zck file first. Signed-off-by: Stefano Babic --- include/zck.h.in | 4 ++++ src/lib/comp/comp.c | 2 ++ src/lib/dl/range.c | 2 +- src/lib/hash/hash.c | 10 ++++++++ src/lib/header.c | 8 ++++++- src/lib/index/index_common.c | 1 + src/lib/index/index_create.c | 45 +++++++++++++++++++++++++++++++----- src/lib/index/index_read.c | 8 +++++++ src/lib/zck.c | 2 ++ src/lib/zck_private.h | 7 +++++- src/zck.c | 12 ++++++++++ src/zck_read_header.c | 7 +++++- 12 files changed, 98 insertions(+), 10 deletions(-) diff --git a/include/zck.h.in b/include/zck.h.in index b847576..a053b30 100644 --- a/include/zck.h.in +++ b/include/zck.h.in @@ -26,6 +26,7 @@ typedef enum zck_ioption { ZCK_HASH_CHUNK_TYPE, /* Set chunk hash type using zck_hash */ ZCK_VAL_HEADER_HASH_TYPE, /* Set what the header hash type *should* be */ ZCK_VAL_HEADER_LENGTH, /* Set what the header length *should* be */ + ZCK_UNCOMP_HEADER, /* Header should contain uncompressed size, too */ ZCK_COMP_TYPE = 100, /* Set compression type using zck_comp */ ZCK_MANUAL_CHUNK, /* Disable auto-chunking */ ZCK_CHUNK_MIN, /* Minimum chunk size when manual chunking */ @@ -263,6 +264,9 @@ char *zck_get_chunk_digest(zckChunk *item) /* Get digest size of chunk hash type */ ssize_t zck_get_chunk_digest_size(zckCtx *zck) __attribute__ ((warn_unused_result)); +/* Get uncompressed chunk digest */ +char *zck_get_chunk_digest_uncompressed(zckChunk *item) + __attribute__ ((warn_unused_result)); /* Get chunk data */ ssize_t zck_get_chunk_data(zckChunk *idx, char *dst, size_t dst_size) __attribute__ ((warn_unused_result)); diff --git a/src/lib/comp/comp.c b/src/lib/comp/comp.c index 89b3301..dbbbefe 100644 --- a/src/lib/comp/comp.c +++ 
b/src/lib/comp/comp.c @@ -158,6 +158,8 @@ static ssize_t comp_write(zckCtx *zck, const char *src, const size_t src_size) { free(dst); return -1; } free(dst); + if(zck->has_uncompressed_source && !hash_update(zck, &(zck->work_index_hash_uncomp), src, src_size)) + return -1; /* dst is already released above, so this early exit cannot leak it */ return src_size; } diff --git a/src/lib/dl/range.c b/src/lib/dl/range.c index e102fa9..a366f9b 100644 --- a/src/lib/dl/range.c +++ b/src/lib/dl/range.c @@ -54,7 +54,7 @@ static zckRangeItem *range_insert_new(zckCtx *zck, zckRangeItem *prev, } if(add_index) if(!index_new_chunk(zck, &(info->index), idx->digest, idx->digest_size, - end-start+1, end-start+1, idx, false)) { + idx->digest_uncompressed, end-start+1, end-start+1, idx, false)) { free(new); return NULL; } diff --git a/src/lib/hash/hash.c b/src/lib/hash/hash.c index d2b0041..25768ae 100644 --- a/src/lib/hash/hash.c +++ b/src/lib/hash/hash.c @@ -517,6 +517,16 @@ char PUBLIC *zck_get_chunk_digest(zckChunk *item) { return get_digest_string(item->digest, item->digest_size); } +char PUBLIC *zck_get_chunk_digest_uncompressed(zckChunk *item) { + if(item == NULL) + return NULL; + if (!item->zck->has_uncompressed_source) { + return NULL; + } + return get_digest_string(item->digest_uncompressed, item->digest_size_uncompressed); +} + + /* Returns 1 if all chunks are valid, -1 if even one isn't and 0 if error */ int PUBLIC zck_find_valid_chunks(zckCtx *zck) { VALIDATE_READ_BOOL(zck); diff --git a/src/lib/header.c b/src/lib/header.c index 38b587b..0d276f8 100644 --- a/src/lib/header.c +++ b/src/lib/header.c @@ -44,6 +44,10 @@ static bool check_flags(zckCtx *zck, size_t flags) { zck->has_optional_elems = flags & 2; if(zck->has_optional_elems) flags -= 2; + zck->has_uncompressed_source = flags & 4; + if(zck->has_uncompressed_source) + flags -= 4; + flags = flags & (SIZE_MAX - 1); if(flags != 0) { set_fatal_error(zck, "Unknown flags(s) set"); @@ -177,13 +181,13 @@ static bool read_index(zckCtx *zck) { } char *header = NULL; - zck_log(ZCK_LOG_DEBUG, 
"Reading index"); if(zck->lead_size + zck->preface_size + zck->index_size > zck->header_size) { set_fatal_error(zck, "Read past end of header"); return false; } header = zck->header + zck->lead_size + zck->preface_size; + zck_log(ZCK_LOG_DEBUG, "Reading index at 0x%lx", (unsigned long)(zck->lead_size + zck->preface_size)); int max_length = zck->header_size - (zck->lead_size + zck->preface_size); if(!index_read(zck, header, zck->index_size, max_length)) return false; @@ -244,6 +248,8 @@ static bool preface_create(zckCtx *zck) { size_t flags = 0; if(zck->has_streams) flags &= 1; + if(zck->has_uncompressed_source) + flags |= 4; compint_from_size(header+length, flags, &length); /* Write out compression type and index size */ diff --git a/src/lib/index/index_common.c b/src/lib/index/index_common.c index 3456d26..b20b713 100644 --- a/src/lib/index/index_common.c +++ b/src/lib/index/index_common.c @@ -89,6 +89,7 @@ void clear_work_index(zckCtx *zck) { return; hash_close(&(zck->work_index_hash)); + hash_close(&(zck->work_index_hash_uncomp)); if(zck->work_index_item) index_free_item(&(zck->work_index_item)); } diff --git a/src/lib/index/index_create.c b/src/lib/index/index_create.c index 8cbc316..a035c84 100644 --- a/src/lib/index/index_create.c +++ b/src/lib/index/index_create.c @@ -37,22 +37,28 @@ static bool create_chunk(zckCtx *zck) { clear_work_index(zck); zck->work_index_item = zmalloc(sizeof(zckChunk)); - if(!hash_init(zck, &(zck->work_index_hash), &(zck->chunk_hash_type))) + if(!hash_init(zck, &(zck->work_index_hash), &(zck->chunk_hash_type)) || + (!hash_init(zck, &(zck->work_index_hash_uncomp), &(zck->chunk_hash_type)))) return false; return true; } static bool finish_chunk(zckIndex *index, zckChunk *item, char *digest, - bool valid, zckCtx *zck) { + char *digest_uncompressed, bool valid, zckCtx *zck) { VALIDATE_BOOL(zck); ALLOCD_BOOL(zck, index); ALLOCD_BOOL(zck, item); item->digest = zmalloc(index->digest_size); + item->digest_uncompressed = 
zmalloc(index->digest_size); if(digest) { memcpy(item->digest, digest, index->digest_size); item->digest_size = index->digest_size; } + if(digest_uncompressed) { + memcpy(item->digest_uncompressed, digest_uncompressed, index->digest_size); + item->digest_size_uncompressed = index->digest_size; + } item->start = index->length; item->valid = valid; item->zck = zck; @@ -65,6 +71,15 @@ static bool finish_chunk(zckIndex *index, zckChunk *item, char *digest, index->last = item; index->count += 1; index->length += item->comp_length; + + char *s = get_digest_string(digest, index->digest_size); + if (zck->has_uncompressed_source) { + char *s1 = get_digest_string(digest_uncompressed, index->digest_size); + zck_log(ZCK_LOG_DEBUG, "Index %d digest %s digest uncomp %s", index->count, s, s1); + free(s1); + } else + zck_log(ZCK_LOG_DEBUG, "Index %d digest %s", index->count, s); + free(s); return true; } @@ -88,7 +103,8 @@ bool index_create(zckCtx *zck) { if(zck->index.first) { zckChunk *tmp = zck->index.first; while(tmp) { - index_malloc += zck->index.digest_size + MAX_COMP_SIZE*2; + index_malloc += (zck->has_uncompressed_source + 1) * zck->index.digest_size + + MAX_COMP_SIZE * 2; tmp = tmp->next; } } @@ -103,6 +119,11 @@ bool index_create(zckCtx *zck) { /* Write digest */ memcpy(index+index_size, tmp->digest, zck->index.digest_size); index_size += zck->index.digest_size; + /* Write digest for uncompressed if any */ + if (zck->has_uncompressed_source) { + memcpy(index+index_size, tmp->digest_uncompressed, zck->index.digest_size); + index_size += zck->index.digest_size; + } /* Write compressed size */ compint_from_size(index+index_size, tmp->comp_length, &index_size); @@ -121,7 +142,7 @@ bool index_create(zckCtx *zck) { } bool index_new_chunk(zckCtx *zck, zckIndex *index, char *digest, - int digest_size, size_t comp_size, size_t orig_size, + int digest_size, char *digest_uncompressed, size_t comp_size, size_t orig_size, zckChunk *src, bool finished) { VALIDATE_BOOL(zck); @@ -138,7 
+159,7 @@ bool index_new_chunk(zckCtx *zck, zckIndex *index, char *digest, chk->comp_length = comp_size; chk->length = orig_size; chk->src = src; - return finish_chunk(index, chk, digest, finished, zck); + return finish_chunk(index, chk, digest, digest_uncompressed, finished, zck); } bool index_add_to_chunk(zckCtx *zck, char *data, size_t comp_size, @@ -168,6 +189,7 @@ bool index_finish_chunk(zckCtx *zck) { return false; char *digest = NULL; + char *digest_uncompressed = NULL; if(zck->work_index_item->length > 0) { /* Finalize chunk checksum */ digest = hash_finalize(zck, &(zck->work_index_hash)); @@ -177,16 +199,27 @@ bool index_finish_chunk(zckCtx *zck) { zck_hash_name_from_type(zck->index.hash_type)); return false; } + digest_uncompressed = hash_finalize(zck, &(zck->work_index_hash_uncomp)); + if(digest_uncompressed == NULL) { + set_fatal_error(zck, "Unable to calculate %s checksum for new chunk", + zck_hash_name_from_type(zck->index.hash_type)); + free(digest); + return false; + } } else { digest = zmalloc(zck->chunk_hash_type.digest_size); + digest_uncompressed = zmalloc(zck->chunk_hash_type.digest_size); } - if(!finish_chunk(&(zck->index), zck->work_index_item, digest, true, zck)) { + if(!finish_chunk(&(zck->index), zck->work_index_item, digest, digest_uncompressed, true, zck)) { free(digest); + free(digest_uncompressed); return false; } free(digest); + free(digest_uncompressed); zck->work_index_item = NULL; hash_close(&(zck->work_index_hash)); + hash_close(&(zck->work_index_hash_uncomp)); return true; } diff --git a/src/lib/index/index_read.c b/src/lib/index/index_read.c index 875b569..42b4efe 100644 --- a/src/lib/index/index_read.c +++ b/src/lib/index/index_read.c @@ -85,6 +85,14 @@ bool index_read(zckCtx *zck, char *data, size_t size, size_t max_length) { new); length += zck->index.digest_size; + /* Read uncompressed entry digest, if any */ + if (zck->has_uncompressed_source) { + /* same size for digest as compressed */ + new->digest_uncompressed = 
zmalloc(zck->index.digest_size); + memcpy(new->digest_uncompressed, data+length, zck->index.digest_size); + new->digest_size_uncompressed = zck->index.digest_size; + length += zck->index.digest_size; + } /* Read and store entry length */ size_t chunk_length = 0; if(!compint_to_size(zck, &chunk_length, data+length, &length, diff --git a/src/lib/zck.c b/src/lib/zck.c index dece24b..d563350 100644 --- a/src/lib/zck.c +++ b/src/lib/zck.c @@ -292,6 +292,8 @@ bool PUBLIC zck_set_ioption(zckCtx *zck, zck_ioption option, ssize_t value) { } zck->prep_hdr_size = value; + } else if(option == ZCK_UNCOMP_HEADER) { + zck->has_uncompressed_source = 1; /* Hash options */ } else if(option < 100) { /* Currently no hash options other than setting hash type, so bail */ diff --git a/src/lib/zck_private.h b/src/lib/zck_private.h index a21e963..49eb4c3 100644 --- a/src/lib/zck_private.h +++ b/src/lib/zck_private.h @@ -150,6 +150,8 @@ struct zckDL { struct zckChunk { char *digest; int digest_size; + char *digest_uncompressed; + int digest_size_uncompressed; int valid; size_t number; size_t start; @@ -262,9 +264,12 @@ struct zckCtx { zckIndex index; zckChunk *work_index_item; zckHash work_index_hash; + zckChunk *work_index_item_uncomp; + zckHash work_index_hash_uncomp; size_t stream; int has_streams; int has_optional_elems; + int has_uncompressed_source; char *read_buf; size_t read_buf_size; @@ -340,7 +345,7 @@ bool index_read(zckCtx *zck, char *data, size_t size, size_t max_length) bool index_create(zckCtx *zck) __attribute__ ((warn_unused_result)); bool index_new_chunk(zckCtx *zck, zckIndex *index, char *digest, int digest_size, - size_t comp_size, size_t orig_size, zckChunk *src, bool valid) + char* digest_uncompressed, size_t comp_size, size_t orig_size, zckChunk *src, bool valid) __attribute__ ((warn_unused_result)); bool index_add_to_chunk(zckCtx *zck, char *data, size_t comp_size, size_t orig_size) diff --git a/src/zck.c b/src/zck.c index 854591c..9d8f01e 100644 --- a/src/zck.c +++ 
b/src/zck.c @@ -57,6 +57,8 @@ static struct argp_option options[] = { "Set zstd compression dictionary to FILE"}, {"manual-chunk", 'm', 0, 0, "Don't do any automatic chunking (implies -s)"}, + {"uncompressed", 'u', 0, 0, + "Add extension in header for uncompressed data"}, {"version", 'V', 0, 0, "Show program version"}, { 0 } }; @@ -69,6 +71,7 @@ struct arguments { char *output; char *dict; bool exit; + bool uncompressed; }; static error_t parse_opt (int key, char *arg, struct argp_state *state) { @@ -95,6 +98,9 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state) { case 'D': arguments->dict = arg; break; + case 'u': + arguments->uncompressed = true; + break; case 'V': version(); arguments->exit = true; @@ -223,6 +229,12 @@ int main (int argc, char *argv[]) { } } + if(arguments.uncompressed) { + if(!zck_set_ioption(zck, ZCK_UNCOMP_HEADER, 1)) { + dprintf(STDERR_FILENO, "%s\n", zck_get_error(zck)); + exit(1); + } + } char *data; int in_fd = open(arguments.args[0], O_RDONLY); off_t in_size = 0; diff --git a/src/zck_read_header.c b/src/zck_read_header.c index a159ea5..21e9bee 100644 --- a/src/zck_read_header.c +++ b/src/zck_read_header.c @@ -176,9 +176,14 @@ int main (int argc, char *argv[]) { dprintf(STDERR_FILENO, "%s", zck_get_error(zck)); exit(1); } - printf("%12lu %s %12lu %12lu %12lu", + char *digest_uncompressed = zck_get_chunk_digest_uncompressed(chk); + if (!digest_uncompressed) + digest_uncompressed = ""; + + printf("%12lu %s %s %12lu %12lu %12lu", (long unsigned)zck_get_chunk_number(chk), digest, + digest_uncompressed, (long unsigned)zck_get_chunk_start(chk), (long unsigned)zck_get_chunk_comp_size(chk), (long unsigned)zck_get_chunk_size(chk)); -- 2.30.2