From 864302059679c15bc223d37ed8cef87c5b4a97aa Mon Sep 17 00:00:00 2001 From: David Oberhollenzer Date: Mon, 29 Jul 2019 10:57:40 +0200 Subject: Cleanup: move deduplication code from data writer to fstree Since it is actually completely independend of libsqfs and only works on file_info_t lists, it can be safely moved over to libfstree and the data writer becomes less cluttered as a result. Signed-off-by: David Oberhollenzer --- include/fstree.h | 21 ++++++++ lib/Makemodule.am | 2 +- lib/fstree/deduplicate.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++ lib/sqfs/data_writer.c | 127 --------------------------------------------- 4 files changed, 154 insertions(+), 128 deletions(-) create mode 100644 lib/fstree/deduplicate.c diff --git a/include/fstree.h b/include/fstree.h index 687b693..c327fc6 100644 --- a/include/fstree.h +++ b/include/fstree.h @@ -297,4 +297,25 @@ void tree_node_sort_recursive(tree_node_t *root); /* resolve a path to a tree node. Returns NULL on failure and sets errno */ tree_node_t *fstree_node_from_path(fstree_t *fs, const char *path); +/* + Walk through 'list' to find a file with a fragment that has + the same size ('frag_size') and checksum ('chksum') as 'fi'. + Processing stopps if 'fi' itself is found in the list. + + Returns NULL if no such fragment could be found. +*/ +file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum, + size_t frag_size, file_info_t *list, + size_t block_size); + +/* + Walk through 'list' to find a file that contains the same sequence of blocks + as 'file', comparing size and checksum. Processing stops if 'file' is found + in the list. + + Returns NULL if no such fragment could be found. + */ +uint64_t find_equal_blocks(file_info_t *file, file_info_t *list, + size_t block_size); + #endif /* FSTREE_H */ diff --git a/lib/Makemodule.am b/lib/Makemodule.am index 8258136..4288ef1 100644 --- a/lib/Makemodule.am +++ b/lib/Makemodule.am @@ -4,7 +4,7 @@ libfstree_a_SOURCES += lib/fstree/gen_inode_table.c lib/fstree/get_path.c libfstree_a_SOURCES += lib/fstree/node_stat.c lib/fstree/mknode.c libfstree_a_SOURCES += lib/fstree/add_by_path.c lib/fstree/xattr.c libfstree_a_SOURCES += lib/fstree/node_from_path.c include/fstree.h -libfstree_a_SOURCES += lib/fstree/gen_file_list.c +libfstree_a_SOURCES += lib/fstree/gen_file_list.c lib/fstree/deduplicate.c libfstree_a_CFLAGS = $(AM_CFLAGS) libfstree_a_CPPFLAGS = $(AM_CPPFLAGS) diff --git a/lib/fstree/deduplicate.c b/lib/fstree/deduplicate.c new file mode 100644 index 0000000..9ae4440 --- /dev/null +++ b/lib/fstree/deduplicate.c @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-3.0-or-later */ +#include "config.h" +#include "fstree.h" + +#include + +file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum, + size_t frag_size, file_info_t *list, + size_t block_size) +{ + file_info_t *it; + + for (it = list; it != NULL; it = it->next) { + if (it == fi) { + it = NULL; + break; + } + + if (!(it->flags & FILE_FLAG_HAS_FRAGMENT)) + continue; + + if (it->flags & FILE_FLAG_FRAGMENT_IS_DUPLICATE) + continue; + + if ((it->size % block_size) != frag_size) + continue; + + if (it->fragment_chksum == chksum) + break; + } + + return it; +} + +static size_t get_block_count(file_info_t *fi, size_t block_size) +{ + size_t count = fi->size / block_size; + + if ((fi->size % block_size) && !(fi->flags & FILE_FLAG_HAS_FRAGMENT)) + ++count; + + while (count > 0 && fi->blocks[count - 1].size == 0) + --count; + + return count; +} + +static size_t find_first_match(file_info_t *file, file_info_t *cmp, + size_t idx, size_t cmp_blk_count) +{ + size_t i; + + for (i = 0; i < cmp_blk_count; ++i) { + if (memcmp(file->blocks + idx, cmp->blocks + i, + sizeof(file->blocks[idx])) == 0) { + break; + } + } + + return i; +} + +uint64_t find_equal_blocks(file_info_t *file, file_info_t *list, + size_t block_size) +{ + size_t start, first_match, i, j, block_count, cmp_blk_count; + uint64_t location; + file_info_t *it; + + block_count = get_block_count(file, block_size); + if (block_count == 0) + return 0; + + for (start = 0; start < block_count; ++start) { + if (file->blocks[start].size != 0) + break; + } + + for (it = list; it != NULL; it = it->next) { + if (it == file) { + it = NULL; + break; + } + + if (it->flags & FILE_FLAG_BLOCKS_ARE_DUPLICATE) + continue; + + cmp_blk_count = get_block_count(it, block_size); + if (cmp_blk_count == 0) + continue; + + first_match = find_first_match(file, it, start, cmp_blk_count); + if (first_match == cmp_blk_count) + continue; + + i = start; + j = first_match; + + while (i < block_count && j < cmp_blk_count) { + if (file->blocks[i].size == 0) { + ++i; + continue; + } + + if (it->blocks[j].size == 0) { + ++j; + continue; + } + + if (memcmp(it->blocks + j, file->blocks + i, + sizeof(file->blocks[i])) != 0) { + break; + } + + ++i; + ++j; + } + + if (i == block_count) + break; + } + + if (it == NULL) + return 0; + + location = it->startblock; + + for (i = 0; i < first_match; ++i) + location += it->blocks[i].size & ((1 << 24) - 1); + + return location; +} diff --git a/lib/sqfs/data_writer.c b/lib/sqfs/data_writer.c index fddb9d3..4324f06 100644 --- a/lib/sqfs/data_writer.c +++ b/lib/sqfs/data_writer.c @@ -126,133 +126,6 @@ int data_writer_flush_fragments(data_writer_t *data) return 0; } -static file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum, - size_t frag_size, file_info_t *list, - size_t block_size) -{ - file_info_t *it; - - for (it = list; it != NULL; it = it->next) { - if (it == fi) { - it = NULL; - break; - } - - if (!(it->flags & FILE_FLAG_HAS_FRAGMENT)) - continue; - - if (it->flags & FILE_FLAG_FRAGMENT_IS_DUPLICATE) - continue; - - if ((it->size % block_size) != frag_size) - continue; - - if (it->fragment_chksum == chksum) - break; - } - - return it; -} - -static size_t get_block_count(file_info_t *fi, size_t block_size) -{ - size_t count = fi->size / block_size; - - if ((fi->size % block_size) && !(fi->flags & FILE_FLAG_HAS_FRAGMENT)) - ++count; - - while (count > 0 && fi->blocks[count - 1].size == 0) - --count; - - return count; -} - -static size_t find_first_match(file_info_t *file, file_info_t *cmp, - size_t idx, size_t cmp_blk_count) -{ - size_t i; - - for (i = 0; i < cmp_blk_count; ++i) { - if (memcmp(file->blocks + idx, cmp->blocks + i, - sizeof(file->blocks[idx])) == 0) { - break; - } - } - - return i; -} - -static uint64_t find_equal_blocks(file_info_t *file, file_info_t *list, - size_t block_size) -{ - size_t start, first_match, i, j, block_count, cmp_blk_count; - uint64_t location; - file_info_t *it; - - block_count = get_block_count(file, block_size); - if (block_count == 0) - return 0; - - for (start = 0; start < block_count; ++start) { - if (file->blocks[start].size != 0) - break; - } - - for (it = list; it != NULL; it = it->next) { - if (it == file) { - it = NULL; - break; - } - - if (it->flags & FILE_FLAG_BLOCKS_ARE_DUPLICATE) - continue; - - cmp_blk_count = get_block_count(it, block_size); - if (cmp_blk_count == 0) - continue; - - first_match = find_first_match(file, it, start, cmp_blk_count); - if (first_match == cmp_blk_count) - continue; - - i = start; - j = first_match; - - while (i < block_count && j < cmp_blk_count) { - if (file->blocks[i].size == 0) { - ++i; - continue; - } - - if (it->blocks[j].size == 0) { - ++j; - continue; - } - - if (memcmp(it->blocks + j, file->blocks + i, - sizeof(file->blocks[i])) != 0) { - break; - } - - ++i; - ++j; - } - - if (i == block_count) - break; - } - - if (it == NULL) - return 0; - - location = it->startblock; - - for (i = 0; i < first_match; ++i) - location += it->blocks[i].size & ((1 << 24) - 1); - - return location; -} - static int flush_data_block(data_writer_t *data, size_t size, file_info_t *fi, int flags, file_info_t *list) { -- cgit v1.2.3