When exporting, use hardlinks for duplicated files
author Owen W. Taylor <otaylor@fishsoup.net>
Fri, 29 Sep 2023 16:09:04 +0000 (12:09 -0400)
committer Owen W. Taylor <otaylor@fishsoup.net>
Fri, 29 Sep 2023 17:45:29 +0000 (13:45 -0400)
For ostree_repo_export_tree_to_archive() and 'ostree export', when the
exported tree contains multiple files with the same checksum, write the
duplicates to the archive as hard links to the first occurrence.

Without this, importing a tree and then exporting it again breaks
hardlinks.

As an example of savings: this reduces the (compressed) size of the
Fedora Flatpak Runtime image from 1345MiB to 712MiB.

Resolves: #2925

src/libostree/ostree-repo-libarchive.c
tests/archive-test.sh
tests/libtest.sh
tests/test-composefs.sh
tests/test-export.sh

index d0f46883c7c141446debafdeb135b5ff60fc88b0..65a309335f55bbc62fa3d467bf1f991495422eb4 100644 (file)
@@ -943,15 +943,10 @@ ostree_repo_write_archive_to_mtree_from_fd (OstreeRepo *self, int fd, OstreeMuta
 
 #ifdef HAVE_LIBARCHIVE
 
-static gboolean
-file_to_archive_entry_common (GFile *root, OstreeRepoExportArchiveOptions *opts, GFile *path,
-                              GFileInfo *file_info, struct archive_entry *entry, GError **error)
+static char *
+file_to_pathstr (GFile *root, OstreeRepoExportArchiveOptions *opts, GFile *path)
 {
-  gboolean ret = FALSE;
   g_autofree char *pathstr = g_file_get_relative_path (root, path);
-  g_autoptr (GVariant) xattrs = NULL;
-  time_t ts = (time_t)opts->timestamp_secs;
-
   if (opts->path_prefix && opts->path_prefix[0])
     {
       g_autofree char *old_pathstr = pathstr;
@@ -964,6 +959,18 @@ file_to_archive_entry_common (GFile *root, OstreeRepoExportArchiveOptions *opts,
       pathstr = g_strdup (".");
     }
 
+  return g_steal_pointer (&pathstr);
+}
+
+static gboolean
+file_to_archive_entry_common (GFile *root, OstreeRepoExportArchiveOptions *opts, GFile *path,
+                              GFileInfo *file_info, struct archive_entry *entry, GError **error)
+{
+  gboolean ret = FALSE;
+  g_autofree char *pathstr = file_to_pathstr (root, opts, path);
+  g_autoptr (GVariant) xattrs = NULL;
+  time_t ts = (time_t)opts->timestamp_secs;
+
   archive_entry_update_pathname_utf8 (entry, pathstr);
   archive_entry_set_ctime (entry, ts, OSTREE_TIMESTAMP);
   archive_entry_set_mtime (entry, ts, OSTREE_TIMESTAMP);
@@ -1021,7 +1028,8 @@ out:
 static gboolean
 write_directory_to_libarchive_recurse (OstreeRepo *self, OstreeRepoExportArchiveOptions *opts,
                                        GFile *root, GFile *dir, struct archive *a,
-                                       GCancellable *cancellable, GError **error)
+                                       GHashTable *seen_checksums, GCancellable *cancellable,
+                                       GError **error)
 {
   gboolean ret = FALSE;
   g_autoptr (GFileInfo) dir_info = NULL;
@@ -1057,8 +1065,8 @@ write_directory_to_libarchive_recurse (OstreeRepo *self, OstreeRepoExportArchive
       /* First, handle directories recursively */
       if (g_file_info_get_file_type (file_info) == G_FILE_TYPE_DIRECTORY)
         {
-          if (!write_directory_to_libarchive_recurse (self, opts, root, path, a, cancellable,
-                                                      error))
+          if (!write_directory_to_libarchive_recurse (self, opts, root, path, a, seen_checksums,
+                                                      cancellable, error))
             goto out;
 
           /* Go to the next entry */
@@ -1086,9 +1094,27 @@ write_directory_to_libarchive_recurse (OstreeRepo *self, OstreeRepoExportArchive
             g_autoptr (GInputStream) file_in = NULL;
             g_autoptr (GFileInfo) regular_file_info = NULL;
             const char *checksum;
+            GFile *old_path;
 
             checksum = ostree_repo_file_get_checksum ((OstreeRepoFile *)path);
 
+            old_path = g_hash_table_lookup (seen_checksums, checksum);
+            if (old_path)
+              {
+                g_autofree char *old_pathstr = file_to_pathstr (root, opts, old_path);
+
+                archive_entry_set_hardlink (entry, old_pathstr);
+                if (!write_header_free_entry (a, &entry, error))
+                  goto out;
+
+                break;
+              }
+            else
+              {
+                /* The checksum is owned by path (an OstreeRepoFile) */
+                g_hash_table_insert (seen_checksums, (char *)checksum, g_object_ref (path));
+              }
+
             if (!ostree_repo_load_file (self, checksum, &file_in, &regular_file_info, NULL,
                                         cancellable, error))
               goto out;
@@ -1168,9 +1194,11 @@ ostree_repo_export_tree_to_archive (OstreeRepo *self, OstreeRepoExportArchiveOpt
 #ifdef HAVE_LIBARCHIVE
   gboolean ret = FALSE;
   struct archive *a = archive;
+  g_autoptr (GHashTable) seen_checksums
+      = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, g_object_unref);
 
   if (!write_directory_to_libarchive_recurse (self, opts, (GFile *)root, (GFile *)root, a,
-                                              cancellable, error))
+                                              seen_checksums, cancellable, error))
     goto out;
 
   ret = TRUE;
index 6b45790e387d15c3b97d059a577c230041da2518..f6bfd5fb6f29bfb7378a80bc43fcbedc5917dfad 100644 (file)
@@ -72,9 +72,9 @@ date > test-overlays/overlaid-file
 $OSTREE commit ${COMMIT_ARGS} -b test-base --base test2 --owner-uid 42 --owner-gid 42 test-overlays/
 $OSTREE ls -R test-base > ls.txt
 if can_create_whiteout_devices; then
-    assert_streq "$(wc -l < ls.txt)" 17
+    assert_streq "$(wc -l < ls.txt)" 22
 else
-    assert_streq "$(wc -l < ls.txt)" 14
+    assert_streq "$(wc -l < ls.txt)" 19
 fi
 
 assert_streq "$(grep '42.*42' ls.txt | wc -l)" 2
index fa9378270348c2ca2ea811aa2bfe0663d61ab766..d1c99eab8fc8e0e7ae65e630b63404e603fbf488 100755 (executable)
@@ -249,6 +249,13 @@ setup_test_repository () {
     mkdir baz/another/
     echo x > baz/another/y
 
+    mkdir baz/sub1
+    echo SAME_CONTENT > baz/sub1/duplicate_a
+    echo SAME_CONTENT > baz/sub1/duplicate_b
+
+    mkdir baz/sub2
+    echo SAME_CONTENT > baz/sub2/duplicate_c
+
     # if we are running inside a container we cannot test
     # the overlayfs whiteout marker passthrough
     if ! test -n "${OSTREE_NO_WHITEOUTS:-}"; then
index 5521285f49406b176b2c6ed3b2825e99ca8bd9ec..4b919734be69a534bc49ed18ceb1770f8d106260 100755 (executable)
@@ -38,7 +38,7 @@ orig_composefs_digest=$($OSTREE show --print-hex --print-metadata-key ostree.com
 $OSTREE commit ${COMMIT_ARGS} -b test-composefs2 --generate-composefs-metadata test2-co
 new_composefs_digest=$($OSTREE show --print-hex --print-metadata-key ostree.composefs.digest.v0 test-composefs2)
 assert_streq "${orig_composefs_digest}" "${new_composefs_digest}"
-assert_streq "${new_composefs_digest}" "7a53698f5aa7af7e8034a10bd2fcc195e9df46781efd967a3fc83d32a1d3eda1"
+assert_streq "${new_composefs_digest}" "be956966c70970ea23b1a8043bca58cfb0d011d490a35a7817b36d04c0210954"
 tap_ok "composefs metadata"
 
 tap_end
index e490ae404eb29f0ba00231b79ecd09e1d083df05..6b8de94c4ce068dccb15359d5ab8f0cdbe9f2f75 100755 (executable)
@@ -28,7 +28,7 @@ fi
 
 setup_test_repository "archive"
 
-echo '1..5'
+echo '1..6'
 
 $OSTREE checkout test2 test2-co
 $OSTREE commit --no-xattrs -b test2-noxattrs -s "test2 without xattrs" --tree=dir=test2-co
@@ -81,3 +81,11 @@ assert_file_empty diff.txt
 rm test2.tar diff.txt t -rf
 
 echo 'ok export import'
+
+cd ${test_tmpdir}
+${OSTREE} 'export' test2 -o test2.tar
+tar tvf test2.tar > test2.manifest
+assert_file_has_content test2.manifest 'baz/sub1/duplicate_b link to baz/sub1/duplicate_a'
+assert_file_has_content test2.manifest 'baz/sub2/duplicate_c link to baz/sub1/duplicate_a'
+
+echo 'ok export hard links'