aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorDavid Oberhollenzer <david.oberhollenzer@sigma-star.at>2019-09-14 01:41:38 +0200
committerDavid Oberhollenzer <david.oberhollenzer@sigma-star.at>2019-09-14 04:10:45 +0200
commitf57814332a69bebc40e25e6537a3c08fc9e18f97 (patch)
tree7ff880b8eb53f4852c6f0be9436f220643219795 /lib
parentd455ff92da0249e731cff7613f42b0f7359775da (diff)
Move data deduplication from fstree code to data writer
Signed-off-by: David Oberhollenzer <david.oberhollenzer@sigma-star.at>
Diffstat (limited to 'lib')
-rw-r--r--lib/fstree/Makemodule.am2
-rw-r--r--lib/fstree/deduplicate.c133
-rw-r--r--lib/fstree/mknode.c6
-rw-r--r--lib/sqfshelper/data_reader.c14
-rw-r--r--lib/sqfshelper/data_writer.c203
-rw-r--r--lib/sqfshelper/statistics.c2
-rw-r--r--lib/sqfshelper/tree_node_from_inode.c6
-rw-r--r--lib/sqfshelper/tree_node_to_inode.c2
8 files changed, 174 insertions, 194 deletions
diff --git a/lib/fstree/Makemodule.am b/lib/fstree/Makemodule.am
index 3c74f6d..c3c56fb 100644
--- a/lib/fstree/Makemodule.am
+++ b/lib/fstree/Makemodule.am
@@ -4,7 +4,7 @@ libfstree_a_SOURCES += lib/fstree/gen_inode_table.c lib/fstree/get_path.c
libfstree_a_SOURCES += lib/fstree/node_stat.c lib/fstree/mknode.c
libfstree_a_SOURCES += lib/fstree/add_by_path.c lib/fstree/xattr.c
libfstree_a_SOURCES += lib/fstree/node_from_path.c include/fstree.h
-libfstree_a_SOURCES += lib/fstree/gen_file_list.c lib/fstree/deduplicate.c
+libfstree_a_SOURCES += lib/fstree/gen_file_list.c
libfstree_a_SOURCES += lib/fstree/optimize_unpack_order.c
libfstree_a_SOURCES += lib/fstree/canonicalize_name.c
libfstree_a_SOURCES += lib/fstree/source_date_epoch.c
diff --git a/lib/fstree/deduplicate.c b/lib/fstree/deduplicate.c
deleted file mode 100644
index 00815a6..0000000
--- a/lib/fstree/deduplicate.c
+++ /dev/null
@@ -1,133 +0,0 @@
-/* SPDX-License-Identifier: GPL-3.0-or-later */
-/*
- * deduplicate.c
- *
- * Copyright (C) 2019 David Oberhollenzer <goliath@infraroot.at>
- */
-#include "config.h"
-#include "fstree.h"
-
-#include <string.h>
-
-file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum,
- size_t frag_size, file_info_t *list,
- size_t block_size)
-{
- file_info_t *it;
-
- for (it = list; it != NULL; it = it->next) {
- if (it == fi)
- continue;
-
- if (!(it->flags & FILE_FLAG_HAS_FRAGMENT))
- continue;
-
- if (it->flags & FILE_FLAG_FRAGMENT_IS_DUPLICATE)
- continue;
-
- if ((it->size % block_size) != frag_size)
- continue;
-
- if (it->fragment_chksum == chksum)
- break;
- }
-
- return it;
-}
-
-static size_t get_block_count(file_info_t *fi, size_t block_size)
-{
- size_t count = fi->size / block_size;
-
- if ((fi->size % block_size) && !(fi->flags & FILE_FLAG_HAS_FRAGMENT))
- ++count;
-
- while (count > 0 && fi->blocks[count - 1].size == 0)
- --count;
-
- return count;
-}
-
-static size_t find_first_match(file_info_t *file, file_info_t *cmp,
- size_t idx, size_t cmp_blk_count)
-{
- size_t i;
-
- for (i = 0; i < cmp_blk_count; ++i) {
- if (memcmp(file->blocks + idx, cmp->blocks + i,
- sizeof(file->blocks[idx])) == 0) {
- break;
- }
- }
-
- return i;
-}
-
-uint64_t find_equal_blocks(file_info_t *file, file_info_t *list,
- size_t block_size)
-{
- size_t start, first_match, i, j, block_count, cmp_blk_count;
- uint64_t location;
- file_info_t *it;
-
- block_count = get_block_count(file, block_size);
- if (block_count == 0)
- return 0;
-
- for (start = 0; start < block_count; ++start) {
- if (file->blocks[start].size != 0)
- break;
- }
-
- for (it = list; it != NULL; it = it->next) {
- if (it == file)
- continue;
-
- if (it->flags & FILE_FLAG_BLOCKS_ARE_DUPLICATE)
- continue;
-
- cmp_blk_count = get_block_count(it, block_size);
- if (cmp_blk_count == 0)
- continue;
-
- first_match = find_first_match(file, it, start, cmp_blk_count);
- if (first_match == cmp_blk_count)
- continue;
-
- i = start;
- j = first_match;
-
- while (i < block_count && j < cmp_blk_count) {
- if (file->blocks[i].size == 0) {
- ++i;
- continue;
- }
-
- if (it->blocks[j].size == 0) {
- ++j;
- continue;
- }
-
- if (memcmp(it->blocks + j, file->blocks + i,
- sizeof(file->blocks[i])) != 0) {
- break;
- }
-
- ++i;
- ++j;
- }
-
- if (i == block_count)
- break;
- }
-
- if (it == NULL)
- return 0;
-
- location = it->startblock;
-
- for (i = 0; i < first_match; ++i)
- location += it->blocks[i].size & ((1 << 24) - 1);
-
- return location;
-}
diff --git a/lib/fstree/mknode.c b/lib/fstree/mknode.c
index 1c3b3a0..ace99f3 100644
--- a/lib/fstree/mknode.c
+++ b/lib/fstree/mknode.c
@@ -41,7 +41,7 @@ tree_node_t *fstree_mknode(fstree_t *fs, tree_node_t *parent, const char *name,
if ((sb->st_size % fs->block_size) != 0)
++block_count;
- if (SZ_MUL_OV(block_count, sizeof(n->data.file->blocks[0]),
+ if (SZ_MUL_OV(block_count, sizeof(n->data.file->block_size[0]),
&total)) {
goto fail_ov;
}
@@ -92,8 +92,8 @@ tree_node_t *fstree_mknode(fstree_t *fs, tree_node_t *parent, const char *name,
if (extra == NULL)
break;
- ptr = (char *)n->data.file->blocks;
- ptr += block_count * sizeof(n->data.file->blocks[0]);
+ ptr = (char *)n->data.file->block_size;
+ ptr += block_count * sizeof(n->data.file->block_size[0]);
n->data.file->input_file = ptr;
strcpy(n->data.file->input_file, extra);
break;
diff --git a/lib/sqfshelper/data_reader.c b/lib/sqfshelper/data_reader.c
index d1a9cce..6906933 100644
--- a/lib/sqfshelper/data_reader.c
+++ b/lib/sqfshelper/data_reader.c
@@ -193,7 +193,7 @@ int data_reader_dump_file(data_reader_t *data, file_info_t *fi, int outfd,
diff = filesz > data->block_size ? data->block_size : filesz;
filesz -= diff;
- if (SQFS_IS_SPARSE_BLOCK(fi->blocks[i].size)) {
+ if (SQFS_IS_SPARSE_BLOCK(fi->block_size[i])) {
if (allow_sparse) {
if (lseek(outfd, diff, SEEK_CUR) == (off_t)-1)
goto fail_sparse;
@@ -201,9 +201,9 @@ int data_reader_dump_file(data_reader_t *data, file_info_t *fi, int outfd,
}
memset(data->block, 0, diff);
} else {
- if (precache_data_block(data, off, fi->blocks[i].size))
+ if (precache_data_block(data, off, fi->block_size[i]))
return -1;
- off += SQFS_ON_DISK_BLOCK_SIZE(fi->blocks[i].size);
+ off += SQFS_ON_DISK_BLOCK_SIZE(fi->block_size[i]);
}
if (write_data("writing uncompressed block",
@@ -259,7 +259,7 @@ ssize_t data_reader_read(data_reader_t *data, file_info_t *fi,
i = 0;
while (offset > data->block_size && i < count) {
- off += SQFS_ON_DISK_BLOCK_SIZE(fi->blocks[i++].size);
+ off += SQFS_ON_DISK_BLOCK_SIZE(fi->block_size[i++]);
offset -= data->block_size;
}
@@ -269,14 +269,14 @@ ssize_t data_reader_read(data_reader_t *data, file_info_t *fi,
if (size < diff)
size = diff;
- if (SQFS_IS_SPARSE_BLOCK(fi->blocks[i].size)) {
+ if (SQFS_IS_SPARSE_BLOCK(fi->block_size[i])) {
memset(buffer, 0, diff);
} else {
- if (precache_data_block(data, off, fi->blocks[i].size))
+ if (precache_data_block(data, off, fi->block_size[i]))
return -1;
memcpy(buffer, (char *)data->block + offset, diff);
- off += SQFS_ON_DISK_BLOCK_SIZE(fi->blocks[i].size);
+ off += SQFS_ON_DISK_BLOCK_SIZE(fi->block_size[i]);
}
++i;
diff --git a/lib/sqfshelper/data_writer.c b/lib/sqfshelper/data_writer.c
index d6a935b..7d84762 100644
--- a/lib/sqfshelper/data_writer.c
+++ b/lib/sqfshelper/data_writer.c
@@ -18,6 +18,24 @@
#include <errno.h>
#include <zlib.h>
+#define MK_BLK_SIG(chksum, size) \
+ (((uint64_t)(size) << 32) | (uint64_t)(chksum))
+
+#define BLK_SIZE(sig) ((sig) >> 32)
+
+#define INIT_BLOCK_COUNT (128)
+
+typedef struct {
+ uint64_t offset;
+ uint64_t signature;
+} blk_info_t;
+
+typedef struct {
+ uint32_t index;
+ uint32_t offset;
+ uint64_t signature;
+} frag_info_t;
+
struct data_writer_t {
sqfs_block_t *frag_block;
sqfs_fragment_t *fragments;
@@ -29,9 +47,17 @@ struct data_writer_t {
sqfs_block_processor_t *proc;
sqfs_compressor_t *cmp;
- file_info_t *list;
sqfs_super_t *super;
sqfs_file_t *file;
+
+ size_t file_start;
+ size_t num_blocks;
+ size_t max_blocks;
+ blk_info_t *blocks;
+
+ size_t frag_list_num;
+ size_t frag_list_max;
+ frag_info_t *frag_list;
};
enum {
@@ -47,20 +73,63 @@ static int allign_file(data_writer_t *data)
data->devblksz);
}
+static int store_block_location(data_writer_t *data, uint64_t offset,
+ uint32_t size, uint32_t chksum)
+{
+ size_t new_sz;
+ void *new;
+
+ if (data->num_blocks == data->max_blocks) {
+ new_sz = data->max_blocks * 2;
+ new = realloc(data->blocks, sizeof(data->blocks[0]) * new_sz);
+
+ if (new == NULL) {
+ perror("growing data block checksum table");
+ return -1;
+ }
+
+ data->blocks = new;
+ data->max_blocks = new_sz;
+ }
+
+ data->blocks[data->num_blocks].offset = offset;
+ data->blocks[data->num_blocks].signature = MK_BLK_SIG(chksum, size);
+ data->num_blocks += 1;
+ return 0;
+}
+
+static size_t deduplicate_blocks(data_writer_t *data, size_t count)
+{
+ size_t i, j;
+
+ for (i = 0; i < data->file_start; ++i) {
+ for (j = 0; j < count; ++j) {
+ if (data->blocks[i + j].signature !=
+ data->blocks[data->file_start + j].signature)
+ break;
+ }
+
+ if (j == count)
+ break;
+ }
+
+ return i;
+}
+
static int block_callback(void *user, sqfs_block_t *blk)
{
file_info_t *fi = blk->user;
data_writer_t *data = user;
- uint64_t ref, offset;
+ size_t start, count;
+ uint64_t offset;
uint32_t out;
if (blk->flags & BLK_FIRST_BLOCK) {
data->start = data->file->get_size(data->file);
+ data->file_start = data->num_blocks;
if ((blk->flags & BLK_ALLIGN) && allign_file(data) != 0)
return -1;
-
- fi->startblock = data->file->get_size(data->file);
}
if (blk->size != 0) {
@@ -68,20 +137,22 @@ static int block_callback(void *user, sqfs_block_t *blk)
if (!(blk->flags & SQFS_BLK_IS_COMPRESSED))
out |= 1 << 24;
+ offset = data->file->get_size(data->file);
+
if (blk->flags & BLK_FRAGMENT_BLOCK) {
- offset = htole64(data->file->get_size(data->file));
- data->fragments[blk->index].start_offset = offset;
+ data->fragments[blk->index].start_offset = htole64(offset);
data->fragments[blk->index].pad0 = 0;
data->fragments[blk->index].size = htole32(out);
data->super->flags &= ~SQFS_FLAG_NO_FRAGMENTS;
data->super->flags |= SQFS_FLAG_ALWAYS_FRAGMENTS;
} else {
- fi->blocks[blk->index].chksum = blk->checksum;
- fi->blocks[blk->index].size = htole32(out);
- }
+ fi->block_size[blk->index] = htole32(out);
- offset = data->file->get_size(data->file);
+ if (store_block_location(data, offset, out,
+ blk->checksum))
+ return -1;
+ }
if (data->file->write_at(data->file, offset,
blk->data, blk->size)) {
@@ -93,11 +164,14 @@ static int block_callback(void *user, sqfs_block_t *blk)
if ((blk->flags & BLK_ALLIGN) && allign_file(data) != 0)
return -1;
- ref = find_equal_blocks(fi, data->list,
- data->super->block_size);
- if (ref > 0) {
- fi->startblock = ref;
+ count = data->num_blocks - data->file_start;
+ start = deduplicate_blocks(data, count);
+
+ fi->startblock = data->blocks[start].offset;
+
+ if (start + count < data->file_start) {
fi->flags |= FILE_FLAG_BLOCKS_ARE_DUPLICATE;
+ data->num_blocks = data->file_start;
if (data->file->truncate(data->file, data->start)) {
perror("truncating squashfs image after "
@@ -139,10 +213,25 @@ static int flush_fragment_block(data_writer_t *data)
return ret;
}
-static int store_fragment(data_writer_t *data, sqfs_block_t *frag)
+static int handle_fragment(data_writer_t *data, sqfs_block_t *frag)
{
file_info_t *fi = frag->user;
- size_t size;
+ size_t i, size, new_sz;
+ uint64_t signature;
+ void *new;
+
+ frag->checksum = crc32(0, frag->data, frag->size);
+ signature = MK_BLK_SIG(frag->checksum, frag->size);
+
+ for (i = 0; i < data->frag_list_num; ++i) {
+ if (data->frag_list[i].signature == signature) {
+ fi->fragment_offset = data->frag_list[i].offset;
+ fi->fragment = data->frag_list[i].index;
+ fi->flags |= FILE_FLAG_FRAGMENT_IS_DUPLICATE;
+ free(frag);
+ return 0;
+ }
+ }
if (data->frag_block != NULL) {
size = data->frag_block->size + frag->size;
@@ -162,10 +251,28 @@ static int store_fragment(data_writer_t *data, sqfs_block_t *frag)
goto fail;
}
- data->frag_block->flags = SQFS_BLK_DONT_CHECKSUM;
- data->frag_block->flags |= BLK_FRAGMENT_BLOCK;
+ data->frag_block->flags = BLK_FRAGMENT_BLOCK;
+ }
+
+ if (data->frag_list_num == data->frag_list_max) {
+ new_sz = data->frag_list_max * 2;
+ new = realloc(data->frag_list,
+ sizeof(data->frag_list[0]) * new_sz);
+
+ if (new == NULL) {
+ perror("growing fragment checksum table");
+ return -1;
+ }
+
+ data->frag_list = new;
+ data->frag_list_max = new_sz;
}
+ data->frag_list[data->frag_list_num].index = data->num_fragments;
+ data->frag_list[data->frag_list_num].offset = data->frag_block->size;
+ data->frag_list[data->frag_list_num].signature = signature;
+ data->frag_list_num += 1;
+
fi->fragment_offset = data->frag_block->size;
fi->fragment = data->num_fragments;
@@ -186,26 +293,6 @@ static bool is_zero_block(unsigned char *ptr, size_t size)
return ptr[0] == 0 && memcmp(ptr, ptr + 1, size - 1) == 0;
}
-static int handle_fragment(data_writer_t *data, sqfs_block_t *blk)
-{
- file_info_t *fi = blk->user, *ref;
-
- fi->fragment_chksum = crc32(0, blk->data, blk->size);
-
- ref = fragment_by_chksum(fi, fi->fragment_chksum, blk->size,
- data->list, data->super->block_size);
-
- if (ref != NULL) {
- fi->fragment_offset = ref->fragment_offset;
- fi->fragment = ref->fragment;
- fi->flags |= FILE_FLAG_FRAGMENT_IS_DUPLICATE;
- free(blk);
- return 0;
- }
-
- return store_fragment(data, blk);
-}
-
static int add_sentinel_block(data_writer_t *data, file_info_t *fi,
uint32_t flags)
{
@@ -259,9 +346,6 @@ int write_data_from_fd(data_writer_t *data, file_info_t *fi,
if (flags & DW_ALLIGN_DEVBLK)
blk_flags |= BLK_ALLIGN;
- fi->next = data->list;
- data->list = fi;
-
for (; file_size > 0; file_size -= diff) {
if (file_size > data->super->block_size) {
diff = data->super->block_size;
@@ -276,8 +360,7 @@ int write_data_from_fd(data_writer_t *data, file_info_t *fi,
blk->index = i++;
if (is_zero_block(blk->data, blk->size)) {
- fi->blocks[blk->index].chksum = 0;
- fi->blocks[blk->index].size = 0;
+ fi->block_size[blk->index] = 0;
free(blk);
continue;
}
@@ -411,8 +494,7 @@ int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi,
}
if (is_zero_block(blk->data, blk->size)) {
- fi->blocks[blk->index].chksum = 0;
- fi->blocks[blk->index].size = 0;
+ fi->block_size[blk->index] = 0;
free(blk);
continue;
}
@@ -461,9 +543,39 @@ data_writer_t *data_writer_create(sqfs_super_t *super, sqfs_compressor_t *cmp,
return NULL;
}
+ data->max_blocks = INIT_BLOCK_COUNT;
+ data->frag_list_max = INIT_BLOCK_COUNT;
+
+ data->blocks = alloc_array(sizeof(data->blocks[0]),
+ data->max_blocks);
+
+ if (data->blocks == NULL) {
+ perror("creating data writer");
+ free(data);
+ return NULL;
+ }
+
+ data->frag_list = alloc_array(sizeof(data->frag_list[0]),
+ data->frag_list_max);
+
+ if (data->frag_list == NULL) {
+ perror("creating data writer");
+ free(data->blocks);
+ free(data);
+ return NULL;
+ }
+
data->proc = sqfs_block_processor_create(super->block_size, cmp,
num_jobs, max_backlog, data,
block_callback);
+ if (data->proc == NULL) {
+ perror("creating data block processor");
+ free(data->frag_list);
+ free(data->blocks);
+ free(data);
+ return NULL;
+ }
+
data->cmp = cmp;
data->super = super;
data->file = file;
@@ -475,6 +587,7 @@ void data_writer_destroy(data_writer_t *data)
{
sqfs_block_processor_destroy(data->proc);
free(data->fragments);
+ free(data->blocks);
free(data);
}
diff --git a/lib/sqfshelper/statistics.c b/lib/sqfshelper/statistics.c
index 33ff7cb..0fe325b 100644
--- a/lib/sqfshelper/statistics.c
+++ b/lib/sqfshelper/statistics.c
@@ -30,7 +30,7 @@ void sqfs_print_statistics(fstree_t *fs, sqfs_super_t *super)
}
for (sparse = 0, i = 0; i < num_blocks; ++i) {
- if (fi->blocks[i].size == 0)
+ if (fi->block_size[i] == 0)
sparse += 1;
}
diff --git a/lib/sqfshelper/tree_node_from_inode.c b/lib/sqfshelper/tree_node_from_inode.c
index 68365d0..fee191b 100644
--- a/lib/sqfshelper/tree_node_from_inode.c
+++ b/lib/sqfshelper/tree_node_from_inode.c
@@ -25,7 +25,7 @@ static size_t compute_size(sqfs_inode_generic_t *inode, const char *name)
case SQFS_INODE_EXT_FILE:
size += sizeof(file_info_t);
size += inode->num_file_blocks *
- sizeof(((file_info_t *)0)->blocks[0]);
+ sizeof(((file_info_t *)0)->block_size[0]);
break;
case SQFS_INODE_SLINK:
case SQFS_INODE_EXT_SLINK:
@@ -51,10 +51,10 @@ static void copy_block_sizes(sqfs_inode_generic_t *inode, tree_node_t *out,
}
out->name += inode->num_file_blocks *
- sizeof(out->data.file->blocks[0]);
+ sizeof(out->data.file->block_size[0]);
for (i = 0; i < inode->num_file_blocks; ++i)
- out->data.file->blocks[i].size = inode->block_sizes[i];
+ out->data.file->block_size[i] = inode->block_sizes[i];
}
tree_node_t *tree_node_from_inode(sqfs_inode_generic_t *inode,
diff --git a/lib/sqfshelper/tree_node_to_inode.c b/lib/sqfshelper/tree_node_to_inode.c
index 7de8abd..cc76a8d 100644
--- a/lib/sqfshelper/tree_node_to_inode.c
+++ b/lib/sqfshelper/tree_node_to_inode.c
@@ -120,7 +120,7 @@ sqfs_inode_generic_t *tree_node_to_inode(fstree_t *fs, sqfs_id_table_t *idtbl,
for (i = 0; i < block_count; ++i) {
inode->block_sizes[i] =
- node->data.file->blocks[i].size;
+ node->data.file->block_size[i];
}
}