aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Oberhollenzer <david.oberhollenzer@sigma-star.at>2019-07-29 10:57:40 +0200
committerDavid Oberhollenzer <david.oberhollenzer@sigma-star.at>2019-07-29 10:58:42 +0200
commit864302059679c15bc223d37ed8cef87c5b4a97aa (patch)
tree419cdc8670878602c8b615e98937db5da723ca7a
parentdad72be359d2670dd4993101591e1b14638dedec (diff)
Cleanup: move deduplication code from data writer to fstree
Since it is actually completely independend of libsqfs and only works on file_info_t lists, it can be safely moved over to libfstree and the data writer becomes less cluttered as a result. Signed-off-by: David Oberhollenzer <david.oberhollenzer@sigma-star.at>
-rw-r--r--include/fstree.h21
-rw-r--r--lib/Makemodule.am2
-rw-r--r--lib/fstree/deduplicate.c132
-rw-r--r--lib/sqfs/data_writer.c127
4 files changed, 154 insertions, 128 deletions
diff --git a/include/fstree.h b/include/fstree.h
index 687b693..c327fc6 100644
--- a/include/fstree.h
+++ b/include/fstree.h
@@ -297,4 +297,25 @@ void tree_node_sort_recursive(tree_node_t *root);
/* resolve a path to a tree node. Returns NULL on failure and sets errno */
tree_node_t *fstree_node_from_path(fstree_t *fs, const char *path);
+/*
+ Walk through 'list' to find a file with a fragment that has
+ the same size ('frag_size') and checksum ('chksum') as 'fi'.
+ Processing stopps if 'fi' itself is found in the list.
+
+ Returns NULL if no such fragment could be found.
+*/
+file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum,
+ size_t frag_size, file_info_t *list,
+ size_t block_size);
+
+/*
+ Walk through 'list' to find a file that contains the same sequence of blocks
+ as 'file', comparing size and checksum. Processing stops if 'file' is found
+ in the list.
+
+ Returns NULL if no such fragment could be found.
+ */
+uint64_t find_equal_blocks(file_info_t *file, file_info_t *list,
+ size_t block_size);
+
#endif /* FSTREE_H */
diff --git a/lib/Makemodule.am b/lib/Makemodule.am
index 8258136..4288ef1 100644
--- a/lib/Makemodule.am
+++ b/lib/Makemodule.am
@@ -4,7 +4,7 @@ libfstree_a_SOURCES += lib/fstree/gen_inode_table.c lib/fstree/get_path.c
libfstree_a_SOURCES += lib/fstree/node_stat.c lib/fstree/mknode.c
libfstree_a_SOURCES += lib/fstree/add_by_path.c lib/fstree/xattr.c
libfstree_a_SOURCES += lib/fstree/node_from_path.c include/fstree.h
-libfstree_a_SOURCES += lib/fstree/gen_file_list.c
+libfstree_a_SOURCES += lib/fstree/gen_file_list.c lib/fstree/deduplicate.c
libfstree_a_CFLAGS = $(AM_CFLAGS)
libfstree_a_CPPFLAGS = $(AM_CPPFLAGS)
diff --git a/lib/fstree/deduplicate.c b/lib/fstree/deduplicate.c
new file mode 100644
index 0000000..9ae4440
--- /dev/null
+++ b/lib/fstree/deduplicate.c
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-3.0-or-later */
+#include "config.h"
+#include "fstree.h"
+
+#include <string.h>
+
+file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum,
+ size_t frag_size, file_info_t *list,
+ size_t block_size)
+{
+ file_info_t *it;
+
+ for (it = list; it != NULL; it = it->next) {
+ if (it == fi) {
+ it = NULL;
+ break;
+ }
+
+ if (!(it->flags & FILE_FLAG_HAS_FRAGMENT))
+ continue;
+
+ if (it->flags & FILE_FLAG_FRAGMENT_IS_DUPLICATE)
+ continue;
+
+ if ((it->size % block_size) != frag_size)
+ continue;
+
+ if (it->fragment_chksum == chksum)
+ break;
+ }
+
+ return it;
+}
+
+static size_t get_block_count(file_info_t *fi, size_t block_size)
+{
+ size_t count = fi->size / block_size;
+
+ if ((fi->size % block_size) && !(fi->flags & FILE_FLAG_HAS_FRAGMENT))
+ ++count;
+
+ while (count > 0 && fi->blocks[count - 1].size == 0)
+ --count;
+
+ return count;
+}
+
+static size_t find_first_match(file_info_t *file, file_info_t *cmp,
+ size_t idx, size_t cmp_blk_count)
+{
+ size_t i;
+
+ for (i = 0; i < cmp_blk_count; ++i) {
+ if (memcmp(file->blocks + idx, cmp->blocks + i,
+ sizeof(file->blocks[idx])) == 0) {
+ break;
+ }
+ }
+
+ return i;
+}
+
+uint64_t find_equal_blocks(file_info_t *file, file_info_t *list,
+ size_t block_size)
+{
+ size_t start, first_match, i, j, block_count, cmp_blk_count;
+ uint64_t location;
+ file_info_t *it;
+
+ block_count = get_block_count(file, block_size);
+ if (block_count == 0)
+ return 0;
+
+ for (start = 0; start < block_count; ++start) {
+ if (file->blocks[start].size != 0)
+ break;
+ }
+
+ for (it = list; it != NULL; it = it->next) {
+ if (it == file) {
+ it = NULL;
+ break;
+ }
+
+ if (it->flags & FILE_FLAG_BLOCKS_ARE_DUPLICATE)
+ continue;
+
+ cmp_blk_count = get_block_count(it, block_size);
+ if (cmp_blk_count == 0)
+ continue;
+
+ first_match = find_first_match(file, it, start, cmp_blk_count);
+ if (first_match == cmp_blk_count)
+ continue;
+
+ i = start;
+ j = first_match;
+
+ while (i < block_count && j < cmp_blk_count) {
+ if (file->blocks[i].size == 0) {
+ ++i;
+ continue;
+ }
+
+ if (it->blocks[j].size == 0) {
+ ++j;
+ continue;
+ }
+
+ if (memcmp(it->blocks + j, file->blocks + i,
+ sizeof(file->blocks[i])) != 0) {
+ break;
+ }
+
+ ++i;
+ ++j;
+ }
+
+ if (i == block_count)
+ break;
+ }
+
+ if (it == NULL)
+ return 0;
+
+ location = it->startblock;
+
+ for (i = 0; i < first_match; ++i)
+ location += it->blocks[i].size & ((1 << 24) - 1);
+
+ return location;
+}
diff --git a/lib/sqfs/data_writer.c b/lib/sqfs/data_writer.c
index fddb9d3..4324f06 100644
--- a/lib/sqfs/data_writer.c
+++ b/lib/sqfs/data_writer.c
@@ -126,133 +126,6 @@ int data_writer_flush_fragments(data_writer_t *data)
return 0;
}
-static file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum,
- size_t frag_size, file_info_t *list,
- size_t block_size)
-{
- file_info_t *it;
-
- for (it = list; it != NULL; it = it->next) {
- if (it == fi) {
- it = NULL;
- break;
- }
-
- if (!(it->flags & FILE_FLAG_HAS_FRAGMENT))
- continue;
-
- if (it->flags & FILE_FLAG_FRAGMENT_IS_DUPLICATE)
- continue;
-
- if ((it->size % block_size) != frag_size)
- continue;
-
- if (it->fragment_chksum == chksum)
- break;
- }
-
- return it;
-}
-
-static size_t get_block_count(file_info_t *fi, size_t block_size)
-{
- size_t count = fi->size / block_size;
-
- if ((fi->size % block_size) && !(fi->flags & FILE_FLAG_HAS_FRAGMENT))
- ++count;
-
- while (count > 0 && fi->blocks[count - 1].size == 0)
- --count;
-
- return count;
-}
-
-static size_t find_first_match(file_info_t *file, file_info_t *cmp,
- size_t idx, size_t cmp_blk_count)
-{
- size_t i;
-
- for (i = 0; i < cmp_blk_count; ++i) {
- if (memcmp(file->blocks + idx, cmp->blocks + i,
- sizeof(file->blocks[idx])) == 0) {
- break;
- }
- }
-
- return i;
-}
-
-static uint64_t find_equal_blocks(file_info_t *file, file_info_t *list,
- size_t block_size)
-{
- size_t start, first_match, i, j, block_count, cmp_blk_count;
- uint64_t location;
- file_info_t *it;
-
- block_count = get_block_count(file, block_size);
- if (block_count == 0)
- return 0;
-
- for (start = 0; start < block_count; ++start) {
- if (file->blocks[start].size != 0)
- break;
- }
-
- for (it = list; it != NULL; it = it->next) {
- if (it == file) {
- it = NULL;
- break;
- }
-
- if (it->flags & FILE_FLAG_BLOCKS_ARE_DUPLICATE)
- continue;
-
- cmp_blk_count = get_block_count(it, block_size);
- if (cmp_blk_count == 0)
- continue;
-
- first_match = find_first_match(file, it, start, cmp_blk_count);
- if (first_match == cmp_blk_count)
- continue;
-
- i = start;
- j = first_match;
-
- while (i < block_count && j < cmp_blk_count) {
- if (file->blocks[i].size == 0) {
- ++i;
- continue;
- }
-
- if (it->blocks[j].size == 0) {
- ++j;
- continue;
- }
-
- if (memcmp(it->blocks + j, file->blocks + i,
- sizeof(file->blocks[i])) != 0) {
- break;
- }
-
- ++i;
- ++j;
- }
-
- if (i == block_count)
- break;
- }
-
- if (it == NULL)
- return 0;
-
- location = it->startblock;
-
- for (i = 0; i < first_match; ++i)
- location += it->blocks[i].size & ((1 << 24) - 1);
-
- return location;
-}
-
static int flush_data_block(data_writer_t *data, size_t size,
file_info_t *fi, int flags, file_info_t *list)
{