author     David Oberhollenzer <david.oberhollenzer@sigma-star.at>  2019-08-04 16:56:08 +0200
committer  David Oberhollenzer <david.oberhollenzer@sigma-star.at>  2019-08-04 17:01:14 +0200
commit     3a340b12eb9b7ed86a47391345cb836fa662b2d9 (patch)
tree       cf645d3ccef3347f10985f415a0755a5b0de36b9
parent     bf1dd4f1ab8ef70f96704c4e2bd95968e1615b37 (diff)
Improve file unpacking order
This commit moves the file unpacking order & job scheduling into a libfstree function. The ordering is improved by making sure fragment blocks are not extracted more than once and that files with data blocks are extracted in order. This way, serial unpacking of a 2GiB Debian live image could be reduced from ~5 minutes on my test machine to ~3.5 minutes, whereas parallel unpacking stays roughly the same (~3 minutes for -j 4).

Signed-off-by: David Oberhollenzer <david.oberhollenzer@sigma-star.at>
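For context, here is a minimal sketch of how the new helper is meant to be driven, written as a simplified serial variant of the forking caller in unpack/fill_files.c below. The names unpack_serial and extract_file are hypothetical and only illustrate the calling convention; fstree_t, file_info_t and optimize_unpack_order() come from the diff itself:

#include "fstree.h"

/* Sketch only: sort and split the file list, then unpack each sublist
 * in turn. The real caller forks one child per sublist instead of
 * looping serially, and extract_file() is a stand-in callback. */
static int unpack_serial(fstree_t *fs, size_t num_jobs,
                         int (*extract_file)(const file_info_t *fi))
{
        file_info_t *sublists[num_jobs];
        file_info_t *it;
        size_t i;
        int status = 0;

        /* fills sublists[0..num_jobs-1] with roughly equally sized lists */
        optimize_unpack_order(fs, num_jobs, sublists);

        for (i = 0; i < num_jobs; ++i) {
                for (it = sublists[i]; it != NULL; it = it->next) {
                        if (extract_file(it) != 0)
                                status = -1;
                }
        }

        return status;
}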
-rw-r--r--   include/fstree.h                       9
-rw-r--r--   lib/Makemodule.am                      1
-rw-r--r--   lib/fstree/optimize_unpack_order.c   161
-rw-r--r--   unpack/fill_files.c                   78

4 files changed, 192 insertions(+), 57 deletions(-)
diff --git a/include/fstree.h b/include/fstree.h
index 608ff97..dbc76db 100644
--- a/include/fstree.h
+++ b/include/fstree.h
@@ -323,4 +323,13 @@ file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum,
 uint64_t find_equal_blocks(file_info_t *file, file_info_t *list,
                            size_t block_size);
 
+/*
+  Optimize the order of the fstree file list for unpacking, so as to avoid
+  unpacking fragment blocks more than once and to improve locality when
+  fetching data from disk. The resulting list is returned in 'out'.
+  If num_jobs is > 1, the list is split up for parallel processing.
+ */
+void optimize_unpack_order(fstree_t *fs, size_t num_jobs,
+                           file_info_t *out[num_jobs]);
+
 #endif /* FSTREE_H */
diff --git a/lib/Makemodule.am b/lib/Makemodule.am
index f49ff91..c3451f4 100644
--- a/lib/Makemodule.am
+++ b/lib/Makemodule.am
@@ -5,6 +5,7 @@ libfstree_a_SOURCES += lib/fstree/node_stat.c lib/fstree/mknode.c
 libfstree_a_SOURCES += lib/fstree/add_by_path.c lib/fstree/xattr.c
 libfstree_a_SOURCES += lib/fstree/node_from_path.c include/fstree.h
 libfstree_a_SOURCES += lib/fstree/gen_file_list.c lib/fstree/deduplicate.c
+libfstree_a_SOURCES += lib/fstree/optimize_unpack_order.c
 
 libfstree_a_CFLAGS = $(AM_CFLAGS)
 libfstree_a_CPPFLAGS = $(AM_CPPFLAGS)
diff --git a/lib/fstree/optimize_unpack_order.c b/lib/fstree/optimize_unpack_order.c
new file mode 100644
index 0000000..ad036e8
--- /dev/null
+++ b/lib/fstree/optimize_unpack_order.c
@@ -0,0 +1,161 @@
+/* SPDX-License-Identifier: GPL-3.0-or-later */
+/*
+ * optimize_unpack_order.c
+ *
+ * Copyright (C) 2019 David Oberhollenzer <goliath@infraroot.at>
+ */
+#include "config.h"
+#include "fstree.h"
+
+static int compare_files(const fstree_t *fs, const file_info_t *lhs,
+                         const file_info_t *rhs)
+{
+        /* NOOP < everything else */
+        if (lhs->input_file == NULL)
+                return rhs->input_file == NULL ? 0 : -1;
+
+        if (rhs->input_file == NULL)
+                return 1;
+
+        /* Files with fragments come first, ordered by ID.
+           In case of a tie, files without data blocks come first,
+           and the others are ordered by start block. */
+        if (lhs->flags & FILE_FLAG_HAS_FRAGMENT) {
+                if (!(rhs->flags & FILE_FLAG_HAS_FRAGMENT))
+                        return -1;
+
+                if (lhs->fragment < rhs->fragment)
+                        return -1;
+                if (lhs->fragment > rhs->fragment)
+                        return 1;
+
+                if (lhs->size < fs->block_size)
+                        return (rhs->size < fs->block_size) ? 0 : -1;
+                if (rhs->size < fs->block_size)
+                        return 1;
+                goto order_by_start;
+        }
+
+        if (rhs->flags & FILE_FLAG_HAS_FRAGMENT)
+                return 1;
+
+        /* order the rest by start block */
+order_by_start:
+        return lhs->startblock < rhs->startblock ? -1 :
+               lhs->startblock > rhs->startblock ? 1 : 0;
+}
+
+/* TODO: unify ad-hoc merge sort with the one in fstree_sort */
+static file_info_t *merge(const fstree_t *fs, file_info_t *lhs,
+                          file_info_t *rhs)
+{
+        file_info_t *it;
+        file_info_t *head = NULL;
+        file_info_t **next_ptr = &head;
+
+        while (lhs != NULL && rhs != NULL) {
+                if (compare_files(fs, lhs, rhs) <= 0) {
+                        it = lhs;
+                        lhs = lhs->next;
+                } else {
+                        it = rhs;
+                        rhs = rhs->next;
+                }
+
+                *next_ptr = it;
+                next_ptr = &it->next;
+        }
+
+        it = (lhs != NULL ? lhs : rhs);
+        *next_ptr = it;
+        return head;
+}
+
+static file_info_t *list_sort(const fstree_t *fs, file_info_t *head)
+{
+        file_info_t *it, *half, *prev;
+
+        it = half = prev = head;
+        while (it != NULL) {
+                prev = half;
+                half = half->next;
+                it = it->next;
+                if (it != NULL) {
+                        it = it->next;
+                }
+        }
+
+        // half refers to the (count/2)'th element ROUNDED UP.
+        // It will therefore be NULL only for the empty and the
+        // single element list.
+        if (half == NULL) {
+                return head;
+        }
+
+        prev->next = NULL;
+
+        return merge(fs, list_sort(fs, head), list_sort(fs, half));
+}
+
+static file_info_t *split_list(file_info_t *list, uint64_t threshold)
+{
+        file_info_t *it, *new = NULL;
+        uint64_t size = 0;
+
+        for (it = list; it != NULL; it = it->next) {
+                size += it->size - it->sparse;
+
+                if (size >= threshold) {
+                        new = it->next;
+                        it->next = NULL;
+                        break;
+                }
+        }
+
+        return new;
+}
+
+static uint64_t total_size(file_info_t *list)
+{
+        uint64_t size = 0;
+        file_info_t *it;
+
+        for (it = list; it != NULL; it = it->next)
+                size += it->size - it->sparse;
+
+        return size;
+}
+
+void optimize_unpack_order(fstree_t *fs, size_t num_jobs,
+                           file_info_t *out[num_jobs])
+{
+        file_info_t *file_list;
+        uint64_t threshold;
+        size_t i;
+
+        if (num_jobs < 1)
+                return;
+
+        for (i = 0; i < num_jobs; ++i)
+                out[i] = NULL;
+
+        file_list = list_sort(fs, fs->files);
+        while (file_list != NULL && file_list->input_file == NULL)
+                file_list = file_list->next;
+
+        fs->files = NULL;
+
+        if (num_jobs < 2) {
+                out[0] = file_list;
+                return;
+        }
+
+        threshold = total_size(file_list) / num_jobs;
+
+        for (i = 0; i < (num_jobs - 1); ++i) {
+                out[i] = file_list;
+                file_list = split_list(file_list, threshold);
+        }
+
+        out[i] = file_list;
+}
diff --git a/unpack/fill_files.c b/unpack/fill_files.c
index e104ae7..c0fc26c 100644
--- a/unpack/fill_files.c
+++ b/unpack/fill_files.c
@@ -38,64 +38,23 @@ static int fill_files(data_reader_t *data, file_info_t *list, int flags)
         return 0;
 }
 
-static file_info_t *split_list(file_info_t *list, uint64_t threashold)
-{
-        file_info_t *it, *new = NULL;
-        uint64_t size = 0;
-
-        for (it = list; it != NULL; it = it->next) {
-                if (it->input_file == NULL)
-                        continue;
-
-                size += it->size - it->sparse;
-
-                if (size >= threashold) {
-                        new = it->next;
-                        it->next = NULL;
-                        break;
-                }
-        }
-
-        return new;
-}
-
-static uint64_t total_size(file_info_t *list)
-{
-        uint64_t size = 0;
-        file_info_t *it;
-
-        for (it = list; it != NULL; it = it->next) {
-                if (it->input_file != NULL)
-                        size += it->size - it->sparse;
-        }
-
-        return size;
-}
-
 int fill_unpacked_files(fstree_t *fs, data_reader_t *data, int flags,
                         unsigned int num_jobs)
 {
-        file_info_t *sublists[num_jobs], *it;
+        file_info_t **sublists, *it;
         int exitstatus, status = 0;
-        uint64_t threshold;
         unsigned int i;
         pid_t pid;
 
-        if (num_jobs <= 1) {
-                status = fill_files(data, fs->files, flags);
+        if (num_jobs < 1)
+                num_jobs = 1;
 
-                for (it = fs->files; it != NULL; it = it->next)
-                        free(it->input_file);
-
-                return status;
-        }
-
-        threshold = total_size(fs->files) / num_jobs;
+        sublists = alloca(sizeof(sublists[0]) * num_jobs);
+        optimize_unpack_order(fs, num_jobs, sublists);
 
-        for (i = 0; i < num_jobs; ++i) {
-                sublists[i] = fs->files;
-
-                fs->files = split_list(fs->files, threshold);
+        if (num_jobs < 2) {
+                status = fill_files(data, sublists[0], flags);
+                goto out;
         }
 
         for (i = 0; i < num_jobs; ++i) {
@@ -113,26 +72,31 @@ int fill_unpacked_files(fstree_t *fs, data_reader_t *data, int flags,
                 if (pid < 0) {
                         perror("fork");
                         status = -1;
-                        num_jobs = i;
                         break;
                 }
         }
 
-        for (i = 0; i < num_jobs; ++i) {
-                do {
-                        pid = waitpid(-1, &exitstatus, 0);
+        for (;;) {
+                errno = 0;
+                pid = waitpid(-1, &exitstatus, 0);
+
+                if (pid < 0) {
+                        if (errno == EINTR)
+                                continue;
                         if (errno == ECHILD)
-                                goto out;
-                } while (pid < 0);
+                                break;
+                }
 
                 if (!WIFEXITED(exitstatus) ||
                     WEXITSTATUS(exitstatus) != EXIT_SUCCESS) {
                         status = -1;
                 }
-
+        }
+out:
+        for (i = 0; i < num_jobs; ++i) {
                 for (it = sublists[i]; it != NULL; it = it->next)
                         free(it->input_file);
         }
-out:
+
         return status;
 }
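As a rough illustration of the splitting step above (hypothetical sizes, not taken from the commit): with four non-sparse files of 6, 2, 5 and 3 MiB in sorted order and num_jobs = 2, total_size() yields 16 MiB, so threshold = 16 / 2 = 8 MiB. split_list() accumulates sizes until it reaches the threshold after the second file (6 + 2 = 8 MiB), so the first job gets the 6 and 2 MiB files and the second job gets the remaining 5 + 3 MiB.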