From 864302059679c15bc223d37ed8cef87c5b4a97aa Mon Sep 17 00:00:00 2001 From: David Oberhollenzer Date: Mon, 29 Jul 2019 10:57:40 +0200 Subject: Cleanup: move deduplication code from data writer to fstree Since it is actually completely independend of libsqfs and only works on file_info_t lists, it can be safely moved over to libfstree and the data writer becomes less cluttered as a result. Signed-off-by: David Oberhollenzer --- lib/fstree/deduplicate.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 lib/fstree/deduplicate.c (limited to 'lib/fstree') diff --git a/lib/fstree/deduplicate.c b/lib/fstree/deduplicate.c new file mode 100644 index 0000000..9ae4440 --- /dev/null +++ b/lib/fstree/deduplicate.c @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-3.0-or-later */ +#include "config.h" +#include "fstree.h" + +#include + +file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum, + size_t frag_size, file_info_t *list, + size_t block_size) +{ + file_info_t *it; + + for (it = list; it != NULL; it = it->next) { + if (it == fi) { + it = NULL; + break; + } + + if (!(it->flags & FILE_FLAG_HAS_FRAGMENT)) + continue; + + if (it->flags & FILE_FLAG_FRAGMENT_IS_DUPLICATE) + continue; + + if ((it->size % block_size) != frag_size) + continue; + + if (it->fragment_chksum == chksum) + break; + } + + return it; +} + +static size_t get_block_count(file_info_t *fi, size_t block_size) +{ + size_t count = fi->size / block_size; + + if ((fi->size % block_size) && !(fi->flags & FILE_FLAG_HAS_FRAGMENT)) + ++count; + + while (count > 0 && fi->blocks[count - 1].size == 0) + --count; + + return count; +} + +static size_t find_first_match(file_info_t *file, file_info_t *cmp, + size_t idx, size_t cmp_blk_count) +{ + size_t i; + + for (i = 0; i < cmp_blk_count; ++i) { + if (memcmp(file->blocks + idx, cmp->blocks + i, + sizeof(file->blocks[idx])) == 0) { + break; + } + } + + return i; +} + +uint64_t find_equal_blocks(file_info_t *file, file_info_t *list, + size_t block_size) +{ + size_t start, first_match, i, j, block_count, cmp_blk_count; + uint64_t location; + file_info_t *it; + + block_count = get_block_count(file, block_size); + if (block_count == 0) + return 0; + + for (start = 0; start < block_count; ++start) { + if (file->blocks[start].size != 0) + break; + } + + for (it = list; it != NULL; it = it->next) { + if (it == file) { + it = NULL; + break; + } + + if (it->flags & FILE_FLAG_BLOCKS_ARE_DUPLICATE) + continue; + + cmp_blk_count = get_block_count(it, block_size); + if (cmp_blk_count == 0) + continue; + + first_match = find_first_match(file, it, start, cmp_blk_count); + if (first_match == cmp_blk_count) + continue; + + i = start; + j = first_match; + + while (i < block_count && j < cmp_blk_count) { + if (file->blocks[i].size == 0) { + ++i; + continue; + } + + if (it->blocks[j].size == 0) { + ++j; + continue; + } + + if (memcmp(it->blocks + j, file->blocks + i, + sizeof(file->blocks[i])) != 0) { + break; + } + + ++i; + ++j; + } + + if (i == block_count) + break; + } + + if (it == NULL) + return 0; + + location = it->startblock; + + for (i = 0; i < first_match; ++i) + location += it->blocks[i].size & ((1 << 24) - 1); + + return location; +} -- cgit v1.2.3