Set new minimum and maximum chunk sizes for both automatic and manual chunking
author Jonathan Dieter <jdieter@gmail.com>
Wed, 25 Jul 2018 22:48:13 +0000 (23:48 +0100)
committer Jonathan Dieter <jdieter@gmail.com>
Wed, 25 Jul 2018 22:48:52 +0000 (23:48 +0100)

Signed-off-by: Jonathan Dieter <jdieter@gmail.com>
include/zck.h.in
src/lib/comp/comp.c
src/lib/zck.c
src/lib/zck_private.h

diff --git a/include/zck.h.in b/include/zck.h.in
index 1a75fc29baae843e0fb0c2c1de4a801b2e4570e0..557d2df38e9b37d9b935b06b032d304c4b147b39 100644
--- a/include/zck.h.in
+++ b/include/zck.h.in
@@ -27,6 +27,8 @@ typedef enum zck_ioption {
     ZCK_VAL_HEADER_LENGTH,      /* Set what the header length *should* be */
     ZCK_COMP_TYPE = 100,        /* Set compression type using zck_comp */
     ZCK_MANUAL_CHUNK,           /* Disable auto-chunking */
+    ZCK_CHUNK_MIN,              /* Minimum chunk size when manual chunking */
+    ZCK_CHUNK_MAX,              /* Maximum chunk size when manual chunking */
     ZCK_ZSTD_COMP_LEVEL = 1000  /* Set zstd compression level */
 } zck_ioption;
 
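diff --git a/src/lib/comp/comp.c b/src/lib/comp/comp.c
index 8f8bdf88e93612e0413996552311218731210f57..76824667854a74835969aa245005493d784f6678 100644
--- a/src/lib/comp/comp.c
+++ b/src/lib/comp/comp.c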
@@ -47,6 +47,14 @@ const static char *COMP_NAME[] = {
     "zstd"
 };
 
+static void update_buzhash_bits(zckCtx *zck) {
+    int s=1;
+    for(int i=0; i<zck->buzhash_match_bits; i++)
+        s *= 2;
+    s -= 1;
+    zck->buzhash_bitmask = s;
+}
+
 static int set_comp_type(zckCtx *zck, ssize_t type) {
     VALIDATE_BOOL(zck);
 
@@ -136,9 +144,6 @@ static ssize_t comp_end_dchunk(zckCtx *zck, int use_dict, size_t fd_size) {
 static ssize_t comp_write(zckCtx *zck, const char *src, const size_t src_size) {
     VALIDATE_WRITE_INT(zck);
 
-    if(!zck->comp.started && !comp_init(zck))
-        return -1;
-
     if(src_size == 0)
         return 0;
 
@@ -177,6 +182,37 @@ int comp_init(zckCtx *zck) {
             zck_comp_name_from_type(comp->type));
     if(!zck->comp.init(zck, &(zck->comp)))
         return False;
+    if(zck->mode == ZCK_MODE_WRITE) {
+        if(zck->chunk_min_size == 0) {
+            zck->chunk_min_size = CHUNK_DEFAULT_MIN;
+            zck_log(ZCK_LOG_DEBUG, "Using default minimum chunk size of %lu",
+                    zck->chunk_min_size);
+        }
+        if(zck->chunk_max_size == 0) {
+            zck->chunk_max_size = CHUNK_DEFAULT_MAX;
+            zck_log(ZCK_LOG_DEBUG, "Using default maximum chunk size of %lu",
+                    zck->chunk_max_size);
+        }
+        if(zck->manual_chunk == 0) {
+            zck_log(ZCK_LOG_DEBUG, "Using buzhash algorithm for chunking");
+            zck->buzhash_width = DEFAULT_BUZHASH_WIDTH;
+            zck->buzhash_match_bits = DEFAULT_BUZHASH_BITS;
+            update_buzhash_bits(zck);
+            zck_log(ZCK_LOG_DEBUG, "Setting average chunk size to %lu",
+                    zck->buzhash_bitmask + 1);
+            zck->chunk_auto_min = (zck->buzhash_bitmask + 1) / 4;
+            if(zck->chunk_auto_min < zck->chunk_min_size)
+                zck->chunk_auto_min = zck->chunk_min_size;
+            zck_log(ZCK_LOG_DEBUG, "Setting automatic minimum chunk size to %lu",
+                    zck->chunk_auto_min);
+            zck->chunk_auto_max = (zck->buzhash_bitmask + 1) * 4;
+            if(zck->chunk_auto_max > zck->chunk_max_size)
+                zck->chunk_auto_max = zck->chunk_max_size;
+            zck_log(ZCK_LOG_DEBUG, "Setting automatic maximum chunk size to %lu",
+                    zck->chunk_auto_max);
+        }
+    }
+
     if(zck->temp_fd) {
         if(zck->comp.dict) {
             char *dst = NULL;
@@ -273,6 +309,37 @@ int comp_ioption(zckCtx *zck, zck_ioption option, ssize_t value) {
             zck_log(ZCK_LOG_DEBUG, "Enabling automatic chunking");
             zck->manual_chunk = 0;
         }
+        return True;
+
+    /* Minimum chunk size */
+    } else if(option == ZCK_CHUNK_MIN) {
+        VALIDATE_WRITE_BOOL(zck);
+        if(value < 1) {
+            set_error(zck, "Minimum chunk size must be > 0");
+            return False;
+        }
+        if(value > zck->chunk_max_size) {
+            set_error(zck, "Minimum chunk size must be <= maximum chunk size");
+            return False;
+        }
+        zck->chunk_min_size = value;
+        zck_log(ZCK_LOG_DEBUG, "Setting minimum chunk size to %li", value);
+        return True;
+
+    /* Maximum chunk size */
+    } else if(option == ZCK_CHUNK_MAX) {
+        VALIDATE_WRITE_BOOL(zck);
+        if(value < 1) {
+            set_error(zck, "Maximum chunk size must be > 0");
+            return False;
+        }
+        if(value < zck->chunk_min_size) {
+            set_error(zck, "Maximum chunk size must be >= minimum chunk size");
+            return False;
+        }
+        zck->chunk_max_size = value;
+        zck_log(ZCK_LOG_DEBUG, "Setting maximum chunk size to %li", value);
+        return True;
 
     } else {
         if(zck && zck->comp.set_parameter)
@@ -436,7 +503,7 @@ ssize_t comp_read(zckCtx *zck, char *dst, size_t dst_size, int use_dict) {
         if(rb < 0)
             goto read_error;
         if(rb < rs) {
-            zck_log(ZCK_LOG_DEBUG, "EOF");
+            zck_log(ZCK_LOG_DDEBUG, "EOF");
             finished_rd = True;
         }
         if(!hash_update(zck, &(zck->check_full_hash), src, rb) ||
@@ -463,30 +530,70 @@ const char PUBLIC *zck_comp_name_from_type(int comp_type) {
 }
 
 ssize_t PUBLIC zck_write(zckCtx *zck, const char *src, const size_t src_size) {
-    if(zck->manual_chunk)
-        return comp_write(zck, src, src_size);
+    VALIDATE_WRITE_INT(zck);
+
+    zck_log(ZCK_LOG_DDEBUG, "Starting up");
+
+    if(src_size == 0)
+        return 0;
+
+    if(!zck->comp.started && !comp_init(zck))
+        return -1;
+
+    zck_log(ZCK_LOG_DDEBUG, "Starting up");
 
     const char *loc = src;
     size_t loc_size = src_size;
-    for(size_t i=0; i<loc_size; ) {
-        if((buzhash_update(&(zck->buzhash), loc+i, zck->buzhash_width) &
-            zck->buzhash_bitmask) == 0) {
-            if(comp_write(zck, loc, i) != i)
+    size_t loc_written = 0;
+    zck_log(ZCK_LOG_DDEBUG, "Size: %lu", zck->comp.dc_data_size + loc_size);
+
+    if(zck->manual_chunk) {
+        while(zck->comp.dc_data_size + loc_size > zck->chunk_max_size) {
+            loc_written = zck->chunk_max_size - zck->comp.dc_data_size;
+            if(comp_write(zck, loc, loc_written) != loc_written)
                 return -1;
-            zck_log(ZCK_LOG_DEBUG, "Automatically ending chunk");
+            loc_size -= loc_written;
+            loc += loc_written;
+            zck_log(ZCK_LOG_DDEBUG,
+                    "Chunk has reached maximum size, forcing a new chunk");
             if(zck_end_chunk(zck) < 0)
                 return -1;
-            loc += i;
-            loc_size -= i;
-            i = 0;
-            buzhash_reset(&(zck->buzhash));
-        } else {
-            i++;
         }
+        if(comp_write(zck, loc, loc_size) != loc_size)
+            return -1;
+        else
+            return src_size;
+    } else {
+        for(size_t i=0; i<loc_size; ) {
+            if((buzhash_update(&(zck->buzhash), loc+i, zck->buzhash_width) &
+                zck->buzhash_bitmask) == 0 ||
+               zck->comp.dc_data_size + i >= zck->chunk_auto_max) {
+                if(comp_write(zck, loc, i) != i)
+                    return -1;
+                loc += i;
+                loc_size -= i;
+                i = 0;
+                if(zck->comp.dc_data_size >= zck->chunk_max_size)
+                    zck_log(ZCK_LOG_DDEBUG,
+                            "Chunk has reached maximum size, forcing a new "
+                            "chunk");
+                else
+                    zck_log(ZCK_LOG_DDEBUG, "Automatically ending chunk");
+                if(zck->comp.dc_data_size < zck->chunk_auto_min) {
+                    zck_log(ZCK_LOG_DDEBUG,
+                            "Chunk too small, refusing to end chunk");
+                    continue;
+                }
+                if(zck_end_chunk(zck) < 0)
+                    return -1;
+            } else {
+                i++;
+            }
+        }
+        if(loc_size > 0 && comp_write(zck, loc, loc_size) != loc_size)
+            return -1;
+        return src_size;
     }
-    if(loc_size > 0 && comp_write(zck, loc, loc_size) != loc_size)
-        return -1;
-    return src_size;
 }
 
 ssize_t PUBLIC zck_end_chunk(zckCtx *zck) {
@@ -495,6 +602,11 @@ ssize_t PUBLIC zck_end_chunk(zckCtx *zck) {
     if(!zck->comp.started && !comp_init(zck))
         return -1;
 
+    if(zck->comp.dc_data_size < zck->chunk_min_size) {
+        zck_log(ZCK_LOG_DDEBUG, "Chunk too small, refusing to end chunk");
+        return zck->comp.dc_data_size;
+    }
+
     buzhash_reset(&(zck->buzhash));
     /* No point in compressing empty data */
     if(zck->comp.dc_data_size == 0)
@@ -517,7 +629,7 @@ ssize_t PUBLIC zck_end_chunk(zckCtx *zck) {
         free(dst);
         return -1;
     }
-    zck_log(ZCK_LOG_DEBUG, "Finished chunk size: %lu", data_size);
+    zck_log(ZCK_LOG_DDEBUG, "Finished chunk size: %lu", data_size);
     free(dst);
     return data_size;
 }
diff --git a/src/lib/zck.c b/src/lib/zck.c
index ff33bc679c46c876d24328f5043c1b9d20af82cc..01d770651638f9963b97d0e894daa5fb1159c853 100644
--- a/src/lib/zck.c
+++ b/src/lib/zck.c
@@ -108,14 +108,6 @@ static char *ascii_checksum_to_bin (zckCtx *zck, char *checksum) {
     return raw_checksum;
 }
 
-static void update_buzhash_bits(zckCtx *zck) {
-    int s=1;
-    for(int i=0; i<zck->buzhash_match_bits; i++)
-        s *= 2;
-    s -= 1;
-    zck->buzhash_bitmask = s;
-}
-
 int get_tmp_fd(zckCtx *zck) {
     VALIDATE_BOOL(zck);
 
@@ -335,9 +327,6 @@ zckCtx PUBLIC *zck_create() {
     }
     zck->prep_hash_type = -1;
     zck->prep_hdr_size = -1;
-    zck->buzhash_width = DEFAULT_BUZHASH_WIDTH;
-    zck->buzhash_match_bits = DEFAULT_BUZHASH_BITS;
-    update_buzhash_bits(zck);
     return zck;
 }
 
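diff --git a/src/lib/zck_private.h b/src/lib/zck_private.h
index 25cc8c3493c2420fe86da9fd7a01e8d3aa3c4a48..de569c420d3dbb93be83a35a241aa4821e128759 100644
--- a/src/lib/zck_private.h
+++ b/src/lib/zck_private.h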
@@ -15,6 +15,8 @@
 
 #define DEFAULT_BUZHASH_WIDTH 48
 #define DEFAULT_BUZHASH_BITS 15
+#define CHUNK_DEFAULT_MIN 1
+#define CHUNK_DEFAULT_MAX 10485760 // 10MB
 
 #define zmalloc(x) calloc(1, x)
 
@@ -276,6 +278,10 @@ typedef struct zckCtx {
     int buzhash_width;
     int buzhash_match_bits;
     int buzhash_bitmask;
+    int chunk_auto_min;
+    int chunk_auto_max;
+    int chunk_min_size;
+    int chunk_max_size;
     int manual_chunk;
 
     char *msg;