Add support for detached headers

author Jonathan Dieter <jdieter@gmail.com>

Sat, 5 Feb 2022 17:02:13 +0000 (17:02 +0000)

committer Jonathan Dieter <jdieter@gmail.com>

Sun, 20 Feb 2022 16:46:39 +0000 (16:46 +0000)
author Jonathan Dieter <jdieter@gmail.com>
Sat, 5 Feb 2022 17:02:13 +0000 (17:02 +0000)
committer Jonathan Dieter <jdieter@gmail.com>
Sun, 20 Feb 2022 16:46:39 +0000 (16:46 +0000)
diff --git a/include/zck.h.in b/include/zck.h.in

index 7bf320b5b790e5762368b5a3660ab430ccf61700..93f4706802f88701eeea7f1ad646585d65f74a50 100644 (file)
--- a/include/zck.h.in
+++ b/include/zck.h.in
@@ -226,6 +226,9 @@ char ZCK_PUBLIC_API *zck_get_header_digest(zckCtx *zck)
  /* Get data digest */
  char ZCK_PUBLIC_API *zck_get_data_digest(zckCtx *zck)
      ZCK_WARN_UNUSED;
+/* Get whether this context is pointing to a detached header */
+bool ZCK_PUBLIC_API zck_is_detached_header(zckCtx *zck)
+    ZCK_WARN_UNUSED;
  
  
  /*******************************************************************
diff --git a/src/lib/hash/hash.c b/src/lib/hash/hash.c

index 9278ec7f58164ca87052bba19e3ba00aac7a3f03..f0ebc6f893d668086a19a7a534acecb21235ca9a 100644 (file)
--- a/src/lib/hash/hash.c
+++ b/src/lib/hash/hash.c
@@ -93,6 +93,8 @@ static int validate_checksums(zckCtx *zck, zck_log_type bad_checksums) {
      for(zckChunk *idx = zck->index.first; idx; idx = idx->next) {
          if(idx == zck->index.first && idx->length == 0) {
              idx->valid = 1;
+            if(zck->header_only)
+                break;
              continue;
          }
  
@@ -120,18 +122,28 @@ static int validate_checksums(zckCtx *zck, zck_log_type bad_checksums) {
          idx->valid = valid_chunk;
          if(all_good && valid_chunk != 1)
              all_good = false;
+        if(zck->header_only)
+            break;
      }
      int valid_file = -1;
-    if(all_good) {
-        /* Check data checksum */
-        valid_file = validate_file(zck, bad_checksums);
-        if(!valid_file)
-            return 0;
+    if(zck->has_uncompressed_source || zck->header_only) {
+        /* If we have an uncompressed source or are a detached header,
+         * skip meaningless full data checksum, and just set valid_file
+         * if the chunks (or dictionary, if we're a header) were good */
+        if(all_good)
+            valid_file = 1;
+    } else {
+        if(all_good) {
+            /* Check data checksum */
+            valid_file = validate_file(zck, bad_checksums);
+            if(!valid_file)
+                return 0;
  
-        /* If data checksum failed, invalidate *all* chunks */
-        if(valid_file == -1)
-            for(zckChunk *idx = zck->index.first; idx; idx = idx->next)
-                idx->valid = -1;
+            /* If data checksum failed, invalidate *all* chunks */
+            if(valid_file == -1)
+                for(zckChunk *idx = zck->index.first; idx; idx = idx->next)
+                    idx->valid = -1;
+        }
      }
  
      /* Go back to beginning of data section */
@@ -473,16 +485,15 @@ int validate_header(zckCtx *zck) {
      return 1;
  }
  
-/* Returns 1 if data hash matches, -1 if it doesn't and 0 if error */
+/* Returns 1 if data hash matches, -1 if it doesn't and 0 if error
+ *
+ * For a zchunk file with both compressed and uncompressed checksums, validate
+ * each chunk checksum independently, since there is no data hash */
  int ZCK_PUBLIC_API zck_validate_data_checksum(zckCtx *zck) {
      VALIDATE_READ_BOOL(zck);
  
      if(zck->has_uncompressed_source) {
-        zck_log(
-            ZCK_LOG_DEBUG,
-            "Skipping full file validation since uncompressed source flag is set"
-        );
-        return 1;
+        return validate_checksums(zck, ZCK_LOG_WARNING);
      }
  
      if(!seek_data(zck, zck->data_offset, SEEK_SET))
diff --git a/src/lib/header.c b/src/lib/header.c

index 730e663e30242de7f75b131a7c9062a73dc5b28f..16ea3e8d783884589d677d1530ab1bcb616b7f8b 100644 (file)
--- a/src/lib/header.c
+++ b/src/lib/header.c
@@ -74,6 +74,12 @@ static bool read_optional_element(zckCtx *zck, size_t id, size_t data_size,
  }
  
  static bool read_header_from_file(zckCtx *zck) {
+    /* Verify that lead_size and header_length have been set */
+    if(zck->lead_size == 0 || zck->header_length == 0) {
+        set_error(zck, "Lead and header sizes are both 0.  Have you run zck_read_lead() yet?");
+        return false;
+    }
+
      /* Allocate header and store any extra bytes at beginning of header */
      zck->header = zrealloc(zck->header, zck->lead_size + zck->header_length);
      if (!zck->header) {
@@ -102,9 +108,16 @@ static bool read_header_from_file(zckCtx *zck) {
  
      if(!hash_init(zck, &(zck->check_full_hash), &(zck->hash_type)))
          return false;
-    if(!hash_update(zck, &(zck->check_full_hash), zck->header,
-                    zck->hdr_digest_loc))
+    /* If we're reading a detached zchunk header, first five bytes will be
+     * different, breaking the header digest, so let's make things simple
+     * by forcing the first five bytes to be static */
+    if(!hash_update(zck, &(zck->check_full_hash), "\0ZCK1", 5))
          return false;
+    /* Now hash the remaining lead */
+    if(!hash_update(zck, &(zck->check_full_hash), zck->header+5,
+                    zck->hdr_digest_loc-5))
+        return false;
+    /* And the remaining header */
      if(!hash_update(zck, &(zck->check_full_hash), header, zck->header_length))
          return false;
      int ret = validate_header(zck);
@@ -472,7 +485,9 @@ static bool read_lead(zckCtx *zck) {
          return false;
      }
  
-    if(memcmp(header, "\0ZCK1", 5) != 0) {
+    if(memcmp(header, "\0ZHR1", 5) == 0) {
+        zck->header_only = true;
+    } else if(memcmp(header, "\0ZCK1", 5) != 0) {
          free(header);
          set_error(zck, "Invalid lead, perhaps this is not a zck file?");
          return false;
@@ -652,3 +667,8 @@ ssize_t ZCK_PUBLIC_API zck_get_flags(zckCtx *zck) {
      VALIDATE_INT(zck);
      return get_flags(zck);
  }
+
+bool ZCK_PUBLIC_API zck_is_detached_header(zckCtx *zck) { /* true if zck refers to a detached header rather than a full zchunk file */
+    VALIDATE_BOOL(zck); /* macro defined elsewhere; presumably bails out on an invalid context -- TODO confirm */
+    return zck->header_only; /* set in read_lead() when the lead ID is "\0ZHR1" */
+}
diff --git a/src/lib/log.c b/src/lib/log.c

index 42119f73207074cfdfa162b3a9df736315eb0ebe..637e6780462c4e303a9a719d155541048e77b70f 100644 (file)
--- a/src/lib/log.c
+++ b/src/lib/log.c
@@ -40,7 +40,7 @@ static int log_fd = 2;
  static int log_fd = STDERR_FILENO;
  #endif
  
-static logcallback callback = NULL; 
+static logcallback callback = NULL;
  
  void ZCK_PUBLIC_API zck_set_log_level(zck_log_type ll) {
      log_level = ll;
diff --git a/src/lib/zck_private.h b/src/lib/zck_private.h

index 77a8c5fde1fee3847f43ebe1ac988503165c8176..e91cf90de551baca8f9b45fb5d5ebd035cdab018 100644 (file)
--- a/src/lib/zck_private.h
+++ b/src/lib/zck_private.h
@@ -245,6 +245,7 @@ struct zckCtx {
      size_t data_offset;
      size_t header_length;
  
+    bool header_only;
      char *header;
      size_t header_size;
      size_t hdr_digest_loc;
@@ -257,7 +258,6 @@ struct zckCtx {
      char *sig_string;
      size_t sig_size;
  
-
      char *prep_digest;
      int prep_hash_type;
      ssize_t prep_hdr_size;
diff --git a/src/unzck.c b/src/unzck.c

index fd4af6f060742475ff7379a0a76969d753aa93bc..6b992ccf35f308f993ffc3fc3e8b0ecafafbb491 100644 (file)
--- a/src/unzck.c
+++ b/src/unzck.c
@@ -53,7 +53,8 @@ static struct argp_option options[] = {
      {"verbose", 'v', 0,        0,
       "Increase verbosity (can be specified more than once for debugging)"},
      {"stdout",  'c', 0,        0, "Direct output to stdout"},
-    {"dict",   1000, 0,        0, "Only extract the dictionary"},
+    {"dict",   1000, 0,        0, "Only extract the dictionary (can't be run with --header)"},
+    {"header", 1001, 0,        0, "Only extract the header (can't be run with --dict)"},
      {"version", 'V', 0,        0, "Show program version"},
      { 0 }
  };
@@ -62,6 +63,7 @@ struct arguments {
    char *args[1];
    zck_log_type log_level;
    bool dict;
+  bool header;
    bool std_out;
    bool exit;
  };
@@ -85,8 +87,13 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state) {
              version();
              arguments->exit = true;
              break;
-        case 1000:
+        case 1000: // Header and dict can't both be set
              arguments->dict = true;
+            arguments->header = false;
+            break;
+        case 1001: // Header and dict can't both be set
+            arguments->header = true;
+            arguments->dict = false;
              break;
          case ARGP_KEY_ARG:
              if (state->arg_num >= 1) {
@@ -126,8 +133,9 @@ int main (int argc, char *argv[]) {
  
      if(!arguments.std_out) {
          if(strlen(arguments.args[0]) < 5 ||
-           strcmp(arguments.args[0] + strlen(arguments.args[0]) - 4, ".zck") != 0) {
-            LOG_ERROR("Not a *.zck file: %s\n", arguments.args[0]);
+           (strcmp(arguments.args[0] + strlen(arguments.args[0]) - 4, ".zck") != 0 &&
+            strcmp(arguments.args[0] + strlen(arguments.args[0]) - 4, ".zhr") != 0)) {
+            LOG_ERROR("Not a *.zck or *.zhr file: %s\n", arguments.args[0]);
              exit(1);
          }
      }
@@ -141,12 +149,16 @@ int main (int argc, char *argv[]) {
      char *out_name = NULL;
      if(arguments.dict)
          out_name = calloc(strlen(base_name) + 3, 1); // len .zck -> .zdict = +2
+    else if(arguments.header)
+        out_name = calloc(strlen(base_name), 1); // .zck -> zhr
      else
-        out_name = calloc(strlen(base_name) - 2, 1);
+        out_name = calloc(strlen(base_name) - 2, 1); // strip .zck
      assert(out_name);
      snprintf(out_name, strlen(base_name) - 3, "%s", base_name); //Strip off .zck
      if(arguments.dict)
          snprintf(out_name + strlen(base_name) - 4, 7, ".zdict");
+    else if(arguments.header)
+        snprintf(out_name + strlen(base_name) - 4, 5, ".zhr");
  
  #ifdef _WIN32
      int dst_fd = _fileno(stdout);
@@ -179,6 +191,9 @@ int main (int argc, char *argv[]) {
          if(dict_size < 0) {
              LOG_ERROR("%s", zck_get_error(zck));
              goto error2;
+        } else if(dict_size == 0) {
+            LOG_ERROR("%s doesn't contain a dictionary\n", arguments.args[0]);
+            goto error2;
          }
          data = calloc(dict_size, 1);
          assert(data);
@@ -208,7 +223,58 @@ int main (int argc, char *argv[]) {
          }
          good_exit = true;
          goto error2;
+    } else if(arguments.header) {
+        if(zck_is_detached_header(zck)) {
+            LOG_ERROR("%s is already a detached header\n", arguments.args[0]);
+            goto error2;
+        }
+
+        ssize_t header_size = zck_get_header_length(zck);
+        if(header_size == -1) {
+            LOG_ERROR("%s", zck_get_error(zck));
+            goto error2;
+        }
+
+        zckChunk *dict = zck_get_first_chunk(zck);
+        ssize_t dict_size = zck_get_chunk_comp_size(dict);
+        if(dict_size < 0) {
+            LOG_ERROR("%s", zck_get_error(zck));
+            goto error2;
+        }
+
+        data = calloc(BUF_SIZE, 1);
+        if(data == NULL) {
+            LOG_ERROR("Unable to allocate %i bytes\n", BUF_SIZE);
+            goto error2;
+        }
+
+        if(lseek(src_fd, 5, SEEK_SET) < 0) {
+            perror("Unable to seek to beginning of source file");
+            exit(1);
+        }
+        write(dst_fd, "\0ZHR1", 5);
+        for(ssize_t i=5; i<header_size + dict_size; i+=BUF_SIZE) {
+            ssize_t write_size = i + BUF_SIZE < header_size + dict_size ? BUF_SIZE : header_size + dict_size - i;
+            ssize_t read_size = read(src_fd, data, write_size);
+            if(read_size < write_size) {
+                LOG_ERROR("Unable to read %llu bytes from source\n", (long long unsigned) write_size);
+                goto error2;
+            }
+            if(write(dst_fd, data, write_size) != write_size) {
+                LOG_ERROR("Error writing to %s\n", out_name);
+                goto error2;
+            }
+        }
+        good_exit = true;
+        goto error2;
      }
+
+    if(zck_is_detached_header(zck)) {
+        LOG_ERROR("%s is a detached header, not a full zchunk file.  The only operation unzck\n"
+                  "can run on a detached header is --dict\n", arguments.args[0]);
+        goto error2;
+    }
+
      int ret = zck_validate_data_checksum(zck);
      if(ret < 1) {
          if(ret == -1)
@@ -220,18 +286,18 @@ int main (int argc, char *argv[]) {
      assert(data);
      size_t total = 0;
      while(true) {
-        ssize_t read = zck_read(zck, data, BUF_SIZE);
-        if(read < 0) {
+        ssize_t read_size = zck_read(zck, data, BUF_SIZE);
+        if(read_size < 0) {
              LOG_ERROR("%s", zck_get_error(zck));
              goto error2;
          }
-        if(read == 0)
+        if(read_size == 0)
              break;
-        if(write(dst_fd, data, read) != read) {
+        if(write(dst_fd, data, read_size) != read_size) {
              LOG_ERROR("Error writing to %s\n", out_name);
              goto error2;
          }
-        total += read;
+        total += read_size;
      }
      if(!zck_close(zck)) {
          LOG_ERROR("%s", zck_get_error(zck));
diff --git a/src/zck_read_header.c b/src/zck_read_header.c

index fe7bf62e620213d4f189b2f7a955430de3f54d80..091cdaa848d3cce07a3dcbdf423ebecbd1421ec6 100644 (file)
--- a/src/zck_read_header.c
+++ b/src/zck_read_header.c
@@ -159,6 +159,10 @@ int main (int argc, char *argv[]) {
              LOG_ERROR("%s", zck_get_error(zck));
              exit(1);
          }
+        if(zck_is_detached_header(zck))
+            printf("zchunk detached header\n\n");
+        else
+            printf("zchunk file\n\n");
          printf("Overall checksum type: %s\n",
                 zck_hash_name_from_type(zck_get_full_hash_type(zck)));
          printf("Header size: %llu\n", (long long unsigned) zck_get_header_length(zck));
@@ -221,6 +225,8 @@ int main (int argc, char *argv[]) {
              if(arguments.verify) {
                  if(zck_get_chunk_valid(chk) == 1)
                      printf("  +");
+                else if(zck_is_detached_header(zck) && zck_get_chunk_valid(chk) == 0)
+                    ;
                  else
                      printf("  !");
              }
diff --git a/zchunk_format.txt b/zchunk_format.txt

index d1dfe480b778a6b1efc126f0184619fbb6940b40..7634fb6bd98094437513a5068ceda0c0311cfe7a 100644 (file)
--- a/zchunk_format.txt
+++ b/zchunk_format.txt
@@ -27,6 +27,8 @@ The lead:
  
  ID
   '\0ZCK1', identifies file as zchunk version 1 file
+ OR
+ '\0ZHR1', identifies file as zchunk detached header version 1 file
  
  Checksum type
   This is an integer containing the type of checksum used to generate the header
@@ -46,7 +48,9 @@ Header size:
  
  Header checksum
   This is the checksum of everything from the beginning of the file until the end
- of the signatures, ignoring the header checksum.
+ of the signatures, ignoring the header checksum.  For detached headers,
+ libraries should use '\0ZCK1' (not '\0ZHR1') as the ID when calculating the
+ checksum, so that it matches the header checksum of the full zchunk file.
  
  
  The preface:
@@ -119,13 +123,13 @@ The index:
  +=================+==========================+==================+
  
  (Dict stream will only exist if flag 0 is set to 1)
-+======================+===============+==================+
-| Dict stream (ci) [0] | Dict checksum | Dict length (ci) |
-+======================+===============+==================+
++======================+===============+================================+
+| Dict stream (ci) [0] | Dict checksum | Uncompressed dict checksum [2] |
++======================+===============+================================+
  
-+===============================+
-| Uncompressed dict length (ci) |
-+===============================+
++==================+===============================+
+| Dict length (ci) | Uncompressed dict length (ci) |
++==================+===============================+
  
  [+=======================+================+=================================+
  [| Chunk stream (ci) [0] | Chunk checksum | Uncompressed chunk checksum [2] |
@@ -161,6 +165,11 @@ Dict checksum
   This is the checksum of the compressed dict, used to detect whether two dicts
   are identical.  If there is no dict, the checksum must be all zeros.
  
+NOTE: Uncompressed dict checksum will only exist if flag 2 is set to 1
+Uncompressed dict checksum
+ This is the checksum of the uncompressed dictionary.  It serves no practical
+ purpose, since the uncompressed source won't contain a dictionary.
+
  Dict length
   This is an integer containing the length of the dict.  If there is no dict,
   this must be a zero.
@@ -227,8 +236,11 @@ Signature size
   This is an integer containing the size of the signature.
  
  Signature
- The actual signature.  The signature MUST only apply to the header, excluding
- the header size, the header checksum, the signature count and the signatures.
+ The actual signature.  The signature MUST only apply to the header, excluding:
+  * The header size
+  * The header checksum
+  * The signature count
+  * The signatures
   The excluded data MUST be omitted when calculating the signature.
  
  Signatures are designed so that anyone can add a new signature to a file
author	Jonathan Dieter <jdieter@gmail.com>
	Sat, 5 Feb 2022 17:02:13 +0000 (17:02 +0000)
committer	Jonathan Dieter <jdieter@gmail.com>
	Sun, 20 Feb 2022 16:46:39 +0000 (16:46 +0000)
include/zck.h.in		patch \| blob \| history
src/lib/hash/hash.c		patch \| blob \| history
src/lib/header.c		patch \| blob \| history
src/lib/log.c		patch \| blob \| history
src/lib/zck_private.h		patch \| blob \| history
src/unzck.c		patch \| blob \| history
src/zck_read_header.c		patch \| blob \| history
zchunk_format.txt		patch \| blob \| history