summaryrefslogtreecommitdiff
path: root/lib/fstree
diff options
context:
space:
mode:
authorDavid Oberhollenzer <david.oberhollenzer@sigma-star.at>2019-07-29 10:57:40 +0200
committerDavid Oberhollenzer <david.oberhollenzer@sigma-star.at>2019-07-29 10:58:42 +0200
commit864302059679c15bc223d37ed8cef87c5b4a97aa (patch)
tree419cdc8670878602c8b615e98937db5da723ca7a /lib/fstree
parentdad72be359d2670dd4993101591e1b14638dedec (diff)
Cleanup: move deduplication code from data writer to fstree
Since it is actually completely independend of libsqfs and only works on file_info_t lists, it can be safely moved over to libfstree and the data writer becomes less cluttered as a result. Signed-off-by: David Oberhollenzer <david.oberhollenzer@sigma-star.at>
Diffstat (limited to 'lib/fstree')
-rw-r--r--lib/fstree/deduplicate.c132
1 files changed, 132 insertions, 0 deletions
diff --git a/lib/fstree/deduplicate.c b/lib/fstree/deduplicate.c
new file mode 100644
index 0000000..9ae4440
--- /dev/null
+++ b/lib/fstree/deduplicate.c
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-3.0-or-later */
+#include "config.h"
+#include "fstree.h"
+
+#include <string.h>
+
+file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum,
+ size_t frag_size, file_info_t *list,
+ size_t block_size)
+{
+ file_info_t *it;
+
+ for (it = list; it != NULL; it = it->next) {
+ if (it == fi) {
+ it = NULL;
+ break;
+ }
+
+ if (!(it->flags & FILE_FLAG_HAS_FRAGMENT))
+ continue;
+
+ if (it->flags & FILE_FLAG_FRAGMENT_IS_DUPLICATE)
+ continue;
+
+ if ((it->size % block_size) != frag_size)
+ continue;
+
+ if (it->fragment_chksum == chksum)
+ break;
+ }
+
+ return it;
+}
+
+static size_t get_block_count(file_info_t *fi, size_t block_size)
+{
+ size_t count = fi->size / block_size;
+
+ if ((fi->size % block_size) && !(fi->flags & FILE_FLAG_HAS_FRAGMENT))
+ ++count;
+
+ while (count > 0 && fi->blocks[count - 1].size == 0)
+ --count;
+
+ return count;
+}
+
+static size_t find_first_match(file_info_t *file, file_info_t *cmp,
+ size_t idx, size_t cmp_blk_count)
+{
+ size_t i;
+
+ for (i = 0; i < cmp_blk_count; ++i) {
+ if (memcmp(file->blocks + idx, cmp->blocks + i,
+ sizeof(file->blocks[idx])) == 0) {
+ break;
+ }
+ }
+
+ return i;
+}
+
+uint64_t find_equal_blocks(file_info_t *file, file_info_t *list,
+ size_t block_size)
+{
+ size_t start, first_match, i, j, block_count, cmp_blk_count;
+ uint64_t location;
+ file_info_t *it;
+
+ block_count = get_block_count(file, block_size);
+ if (block_count == 0)
+ return 0;
+
+ for (start = 0; start < block_count; ++start) {
+ if (file->blocks[start].size != 0)
+ break;
+ }
+
+ for (it = list; it != NULL; it = it->next) {
+ if (it == file) {
+ it = NULL;
+ break;
+ }
+
+ if (it->flags & FILE_FLAG_BLOCKS_ARE_DUPLICATE)
+ continue;
+
+ cmp_blk_count = get_block_count(it, block_size);
+ if (cmp_blk_count == 0)
+ continue;
+
+ first_match = find_first_match(file, it, start, cmp_blk_count);
+ if (first_match == cmp_blk_count)
+ continue;
+
+ i = start;
+ j = first_match;
+
+ while (i < block_count && j < cmp_blk_count) {
+ if (file->blocks[i].size == 0) {
+ ++i;
+ continue;
+ }
+
+ if (it->blocks[j].size == 0) {
+ ++j;
+ continue;
+ }
+
+ if (memcmp(it->blocks + j, file->blocks + i,
+ sizeof(file->blocks[i])) != 0) {
+ break;
+ }
+
+ ++i;
+ ++j;
+ }
+
+ if (i == block_count)
+ break;
+ }
+
+ if (it == NULL)
+ return 0;
+
+ location = it->startblock;
+
+ for (i = 0; i < first_match; ++i)
+ location += it->blocks[i].size & ((1 << 24) - 1);
+
+ return location;
+}