From f57814332a69bebc40e25e6537a3c08fc9e18f97 Mon Sep 17 00:00:00 2001 From: David Oberhollenzer Date: Sat, 14 Sep 2019 01:41:38 +0200 Subject: Move data deduplication from fstree code to data writer Signed-off-by: David Oberhollenzer --- include/fstree.h | 29 +---- lib/fstree/Makemodule.am | 2 +- lib/fstree/deduplicate.c | 133 ---------------------- lib/fstree/mknode.c | 6 +- lib/sqfshelper/data_reader.c | 14 +-- lib/sqfshelper/data_writer.c | 203 ++++++++++++++++++++++++++-------- lib/sqfshelper/statistics.c | 2 +- lib/sqfshelper/tree_node_from_inode.c | 6 +- lib/sqfshelper/tree_node_to_inode.c | 2 +- mkfs/mkfs.c | 5 +- tests/mknode_reg.c | 2 +- 11 files changed, 178 insertions(+), 226 deletions(-) delete mode 100644 lib/fstree/deduplicate.c diff --git a/include/fstree.h b/include/fstree.h index 7ee105b..8d0b952 100644 --- a/include/fstree.h +++ b/include/fstree.h @@ -99,18 +99,12 @@ struct file_info_t { /* Byte offset into the fragment block. */ uint32_t fragment_offset; - uint32_t fragment_chksum; - /* combination of FILE_FLAG_* flags */ uint32_t flags; /* Stores data about each full data block. */ - struct { - uint32_t chksum; - - /* Bit (1 << 24) is set if the block is stored uncompressed. */ - uint32_t size; - } blocks[]; + /* Bit (1 << 24) is set if the block is stored uncompressed. */ + uint32_t block_size[]; }; /* Additional meta data stored in a tree_node_t for directories */ @@ -311,25 +305,6 @@ void tree_node_sort_recursive(tree_node_t *root); /* resolve a path to a tree node. Returns NULL on failure and sets errno */ tree_node_t *fstree_node_from_path(fstree_t *fs, const char *path); -/* - Walk through 'list' to find a file with a fragment that has - the same size ('frag_size') and checksum ('chksum') as 'fi'. - - Returns NULL if no such fragment could be found. -*/ -file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum, - size_t frag_size, file_info_t *list, - size_t block_size); - -/* - Walk through 'list' to find a file that contains the same sequence of blocks - as 'file', comparing size and checksum. - - Returns NULL if no such fragment could be found. - */ -uint64_t find_equal_blocks(file_info_t *file, file_info_t *list, - size_t block_size); - /* Optimize the order of the fstree file list for unpacking as to avoid unpacking fragment blocks more than once and to improve locality when diff --git a/lib/fstree/Makemodule.am b/lib/fstree/Makemodule.am index 3c74f6d..c3c56fb 100644 --- a/lib/fstree/Makemodule.am +++ b/lib/fstree/Makemodule.am @@ -4,7 +4,7 @@ libfstree_a_SOURCES += lib/fstree/gen_inode_table.c lib/fstree/get_path.c libfstree_a_SOURCES += lib/fstree/node_stat.c lib/fstree/mknode.c libfstree_a_SOURCES += lib/fstree/add_by_path.c lib/fstree/xattr.c libfstree_a_SOURCES += lib/fstree/node_from_path.c include/fstree.h -libfstree_a_SOURCES += lib/fstree/gen_file_list.c lib/fstree/deduplicate.c +libfstree_a_SOURCES += lib/fstree/gen_file_list.c libfstree_a_SOURCES += lib/fstree/optimize_unpack_order.c libfstree_a_SOURCES += lib/fstree/canonicalize_name.c libfstree_a_SOURCES += lib/fstree/source_date_epoch.c diff --git a/lib/fstree/deduplicate.c b/lib/fstree/deduplicate.c deleted file mode 100644 index 00815a6..0000000 --- a/lib/fstree/deduplicate.c +++ /dev/null @@ -1,133 +0,0 @@ -/* SPDX-License-Identifier: GPL-3.0-or-later */ -/* - * deduplicate.c - * - * Copyright (C) 2019 David Oberhollenzer - */ -#include "config.h" -#include "fstree.h" - -#include - -file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum, - size_t frag_size, file_info_t *list, - size_t block_size) -{ - file_info_t *it; - - for (it = list; it != NULL; it = it->next) { - if (it == fi) - continue; - - if (!(it->flags & FILE_FLAG_HAS_FRAGMENT)) - continue; - - if (it->flags & FILE_FLAG_FRAGMENT_IS_DUPLICATE) - continue; - - if ((it->size % block_size) != frag_size) - continue; - - if (it->fragment_chksum == chksum) - break; - } - - return it; -} - -static size_t get_block_count(file_info_t *fi, size_t block_size) -{ - size_t count = fi->size / block_size; - - if ((fi->size % block_size) && !(fi->flags & FILE_FLAG_HAS_FRAGMENT)) - ++count; - - while (count > 0 && fi->blocks[count - 1].size == 0) - --count; - - return count; -} - -static size_t find_first_match(file_info_t *file, file_info_t *cmp, - size_t idx, size_t cmp_blk_count) -{ - size_t i; - - for (i = 0; i < cmp_blk_count; ++i) { - if (memcmp(file->blocks + idx, cmp->blocks + i, - sizeof(file->blocks[idx])) == 0) { - break; - } - } - - return i; -} - -uint64_t find_equal_blocks(file_info_t *file, file_info_t *list, - size_t block_size) -{ - size_t start, first_match, i, j, block_count, cmp_blk_count; - uint64_t location; - file_info_t *it; - - block_count = get_block_count(file, block_size); - if (block_count == 0) - return 0; - - for (start = 0; start < block_count; ++start) { - if (file->blocks[start].size != 0) - break; - } - - for (it = list; it != NULL; it = it->next) { - if (it == file) - continue; - - if (it->flags & FILE_FLAG_BLOCKS_ARE_DUPLICATE) - continue; - - cmp_blk_count = get_block_count(it, block_size); - if (cmp_blk_count == 0) - continue; - - first_match = find_first_match(file, it, start, cmp_blk_count); - if (first_match == cmp_blk_count) - continue; - - i = start; - j = first_match; - - while (i < block_count && j < cmp_blk_count) { - if (file->blocks[i].size == 0) { - ++i; - continue; - } - - if (it->blocks[j].size == 0) { - ++j; - continue; - } - - if (memcmp(it->blocks + j, file->blocks + i, - sizeof(file->blocks[i])) != 0) { - break; - } - - ++i; - ++j; - } - - if (i == block_count) - break; - } - - if (it == NULL) - return 0; - - location = it->startblock; - - for (i = 0; i < first_match; ++i) - location += it->blocks[i].size & ((1 << 24) - 1); - - return location; -} diff --git a/lib/fstree/mknode.c b/lib/fstree/mknode.c index 1c3b3a0..ace99f3 100644 --- a/lib/fstree/mknode.c +++ b/lib/fstree/mknode.c @@ -41,7 +41,7 @@ tree_node_t *fstree_mknode(fstree_t *fs, tree_node_t *parent, const char *name, if ((sb->st_size % fs->block_size) != 0) ++block_count; - if (SZ_MUL_OV(block_count, sizeof(n->data.file->blocks[0]), + if (SZ_MUL_OV(block_count, sizeof(n->data.file->block_size[0]), &total)) { goto fail_ov; } @@ -92,8 +92,8 @@ tree_node_t *fstree_mknode(fstree_t *fs, tree_node_t *parent, const char *name, if (extra == NULL) break; - ptr = (char *)n->data.file->blocks; - ptr += block_count * sizeof(n->data.file->blocks[0]); + ptr = (char *)n->data.file->block_size; + ptr += block_count * sizeof(n->data.file->block_size[0]); n->data.file->input_file = ptr; strcpy(n->data.file->input_file, extra); break; diff --git a/lib/sqfshelper/data_reader.c b/lib/sqfshelper/data_reader.c index d1a9cce..6906933 100644 --- a/lib/sqfshelper/data_reader.c +++ b/lib/sqfshelper/data_reader.c @@ -193,7 +193,7 @@ int data_reader_dump_file(data_reader_t *data, file_info_t *fi, int outfd, diff = filesz > data->block_size ? data->block_size : filesz; filesz -= diff; - if (SQFS_IS_SPARSE_BLOCK(fi->blocks[i].size)) { + if (SQFS_IS_SPARSE_BLOCK(fi->block_size[i])) { if (allow_sparse) { if (lseek(outfd, diff, SEEK_CUR) == (off_t)-1) goto fail_sparse; @@ -201,9 +201,9 @@ int data_reader_dump_file(data_reader_t *data, file_info_t *fi, int outfd, } memset(data->block, 0, diff); } else { - if (precache_data_block(data, off, fi->blocks[i].size)) + if (precache_data_block(data, off, fi->block_size[i])) return -1; - off += SQFS_ON_DISK_BLOCK_SIZE(fi->blocks[i].size); + off += SQFS_ON_DISK_BLOCK_SIZE(fi->block_size[i]); } if (write_data("writing uncompressed block", @@ -259,7 +259,7 @@ ssize_t data_reader_read(data_reader_t *data, file_info_t *fi, i = 0; while (offset > data->block_size && i < count) { - off += SQFS_ON_DISK_BLOCK_SIZE(fi->blocks[i++].size); + off += SQFS_ON_DISK_BLOCK_SIZE(fi->block_size[i++]); offset -= data->block_size; } @@ -269,14 +269,14 @@ ssize_t data_reader_read(data_reader_t *data, file_info_t *fi, if (size < diff) size = diff; - if (SQFS_IS_SPARSE_BLOCK(fi->blocks[i].size)) { + if (SQFS_IS_SPARSE_BLOCK(fi->block_size[i])) { memset(buffer, 0, diff); } else { - if (precache_data_block(data, off, fi->blocks[i].size)) + if (precache_data_block(data, off, fi->block_size[i])) return -1; memcpy(buffer, (char *)data->block + offset, diff); - off += SQFS_ON_DISK_BLOCK_SIZE(fi->blocks[i].size); + off += SQFS_ON_DISK_BLOCK_SIZE(fi->block_size[i]); } ++i; diff --git a/lib/sqfshelper/data_writer.c b/lib/sqfshelper/data_writer.c index d6a935b..7d84762 100644 --- a/lib/sqfshelper/data_writer.c +++ b/lib/sqfshelper/data_writer.c @@ -18,6 +18,24 @@ #include #include +#define MK_BLK_SIG(chksum, size) \ + (((uint64_t)(size) << 32) | (uint64_t)(chksum)) + +#define BLK_SIZE(sig) ((sig) >> 32) + +#define INIT_BLOCK_COUNT (128) + +typedef struct { + uint64_t offset; + uint64_t signature; +} blk_info_t; + +typedef struct { + uint32_t index; + uint32_t offset; + uint64_t signature; +} frag_info_t; + struct data_writer_t { sqfs_block_t *frag_block; sqfs_fragment_t *fragments; @@ -29,9 +47,17 @@ struct data_writer_t { sqfs_block_processor_t *proc; sqfs_compressor_t *cmp; - file_info_t *list; sqfs_super_t *super; sqfs_file_t *file; + + size_t file_start; + size_t num_blocks; + size_t max_blocks; + blk_info_t *blocks; + + size_t frag_list_num; + size_t frag_list_max; + frag_info_t *frag_list; }; enum { @@ -47,20 +73,63 @@ static int allign_file(data_writer_t *data) data->devblksz); } +static int store_block_location(data_writer_t *data, uint64_t offset, + uint32_t size, uint32_t chksum) +{ + size_t new_sz; + void *new; + + if (data->num_blocks == data->max_blocks) { + new_sz = data->max_blocks * 2; + new = realloc(data->blocks, sizeof(data->blocks[0]) * new_sz); + + if (new == NULL) { + perror("growing data block checksum table"); + return -1; + } + + data->blocks = new; + data->max_blocks = new_sz; + } + + data->blocks[data->num_blocks].offset = offset; + data->blocks[data->num_blocks].signature = MK_BLK_SIG(chksum, size); + data->num_blocks += 1; + return 0; +} + +static size_t deduplicate_blocks(data_writer_t *data, size_t count) +{ + size_t i, j; + + for (i = 0; i < data->file_start; ++i) { + for (j = 0; j < count; ++j) { + if (data->blocks[i + j].signature != + data->blocks[data->file_start + j].signature) + break; + } + + if (j == count) + break; + } + + return i; +} + static int block_callback(void *user, sqfs_block_t *blk) { file_info_t *fi = blk->user; data_writer_t *data = user; - uint64_t ref, offset; + size_t start, count; + uint64_t offset; uint32_t out; if (blk->flags & BLK_FIRST_BLOCK) { data->start = data->file->get_size(data->file); + data->file_start = data->num_blocks; if ((blk->flags & BLK_ALLIGN) && allign_file(data) != 0) return -1; - - fi->startblock = data->file->get_size(data->file); } if (blk->size != 0) { @@ -68,20 +137,22 @@ static int block_callback(void *user, sqfs_block_t *blk) if (!(blk->flags & SQFS_BLK_IS_COMPRESSED)) out |= 1 << 24; + offset = data->file->get_size(data->file); + if (blk->flags & BLK_FRAGMENT_BLOCK) { - offset = htole64(data->file->get_size(data->file)); - data->fragments[blk->index].start_offset = offset; + data->fragments[blk->index].start_offset = htole64(offset); data->fragments[blk->index].pad0 = 0; data->fragments[blk->index].size = htole32(out); data->super->flags &= ~SQFS_FLAG_NO_FRAGMENTS; data->super->flags |= SQFS_FLAG_ALWAYS_FRAGMENTS; } else { - fi->blocks[blk->index].chksum = blk->checksum; - fi->blocks[blk->index].size = htole32(out); - } + fi->block_size[blk->index] = htole32(out); - offset = data->file->get_size(data->file); + if (store_block_location(data, offset, out, + blk->checksum)) + return -1; + } if (data->file->write_at(data->file, offset, blk->data, blk->size)) { @@ -93,11 +164,14 @@ static int block_callback(void *user, sqfs_block_t *blk) if ((blk->flags & BLK_ALLIGN) && allign_file(data) != 0) return -1; - ref = find_equal_blocks(fi, data->list, - data->super->block_size); - if (ref > 0) { - fi->startblock = ref; + count = data->num_blocks - data->file_start; + start = deduplicate_blocks(data, count); + + fi->startblock = data->blocks[start].offset; + + if (start + count < data->file_start) { fi->flags |= FILE_FLAG_BLOCKS_ARE_DUPLICATE; + data->num_blocks = data->file_start; if (data->file->truncate(data->file, data->start)) { perror("truncating squashfs image after " @@ -139,10 +213,25 @@ static int flush_fragment_block(data_writer_t *data) return ret; } -static int store_fragment(data_writer_t *data, sqfs_block_t *frag) +static int handle_fragment(data_writer_t *data, sqfs_block_t *frag) { file_info_t *fi = frag->user; - size_t size; + size_t i, size, new_sz; + uint64_t signature; + void *new; + + frag->checksum = crc32(0, frag->data, frag->size); + signature = MK_BLK_SIG(frag->checksum, frag->size); + + for (i = 0; i < data->frag_list_num; ++i) { + if (data->frag_list[i].signature == signature) { + fi->fragment_offset = data->frag_list[i].offset; + fi->fragment = data->frag_list[i].index; + fi->flags |= FILE_FLAG_FRAGMENT_IS_DUPLICATE; + free(frag); + return 0; + } + } if (data->frag_block != NULL) { size = data->frag_block->size + frag->size; @@ -162,10 +251,28 @@ static int store_fragment(data_writer_t *data, sqfs_block_t *frag) goto fail; } - data->frag_block->flags = SQFS_BLK_DONT_CHECKSUM; - data->frag_block->flags |= BLK_FRAGMENT_BLOCK; + data->frag_block->flags = BLK_FRAGMENT_BLOCK; + } + + if (data->frag_list_num == data->frag_list_max) { + new_sz = data->frag_list_max * 2; + new = realloc(data->frag_list, + sizeof(data->frag_list[0]) * new_sz); + + if (new == NULL) { + perror("growing fragment checksum table"); + return -1; + } + + data->frag_list = new; + data->frag_list_max = new_sz; } + data->frag_list[data->frag_list_num].index = data->num_fragments; + data->frag_list[data->frag_list_num].offset = data->frag_block->size; + data->frag_list[data->frag_list_num].signature = signature; + data->frag_list_num += 1; + fi->fragment_offset = data->frag_block->size; fi->fragment = data->num_fragments; @@ -186,26 +293,6 @@ static bool is_zero_block(unsigned char *ptr, size_t size) return ptr[0] == 0 && memcmp(ptr, ptr + 1, size - 1) == 0; } -static int handle_fragment(data_writer_t *data, sqfs_block_t *blk) -{ - file_info_t *fi = blk->user, *ref; - - fi->fragment_chksum = crc32(0, blk->data, blk->size); - - ref = fragment_by_chksum(fi, fi->fragment_chksum, blk->size, - data->list, data->super->block_size); - - if (ref != NULL) { - fi->fragment_offset = ref->fragment_offset; - fi->fragment = ref->fragment; - fi->flags |= FILE_FLAG_FRAGMENT_IS_DUPLICATE; - free(blk); - return 0; - } - - return store_fragment(data, blk); -} - static int add_sentinel_block(data_writer_t *data, file_info_t *fi, uint32_t flags) { @@ -259,9 +346,6 @@ int write_data_from_fd(data_writer_t *data, file_info_t *fi, if (flags & DW_ALLIGN_DEVBLK) blk_flags |= BLK_ALLIGN; - fi->next = data->list; - data->list = fi; - for (; file_size > 0; file_size -= diff) { if (file_size > data->super->block_size) { diff = data->super->block_size; @@ -276,8 +360,7 @@ int write_data_from_fd(data_writer_t *data, file_info_t *fi, blk->index = i++; if (is_zero_block(blk->data, blk->size)) { - fi->blocks[blk->index].chksum = 0; - fi->blocks[blk->index].size = 0; + fi->block_size[blk->index] = 0; free(blk); continue; } @@ -411,8 +494,7 @@ int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi, } if (is_zero_block(blk->data, blk->size)) { - fi->blocks[blk->index].chksum = 0; - fi->blocks[blk->index].size = 0; + fi->block_size[blk->index] = 0; free(blk); continue; } @@ -461,9 +543,39 @@ data_writer_t *data_writer_create(sqfs_super_t *super, sqfs_compressor_t *cmp, return NULL; } + data->max_blocks = INIT_BLOCK_COUNT; + data->frag_list_max = INIT_BLOCK_COUNT; + + data->blocks = alloc_array(sizeof(data->blocks[0]), + data->max_blocks); + + if (data->blocks == NULL) { + perror("creating data writer"); + free(data); + return NULL; + } + + data->frag_list = alloc_array(sizeof(data->frag_list[0]), + data->frag_list_max); + + if (data->frag_list == NULL) { + perror("creating data writer"); + free(data->blocks); + free(data); + return NULL; + } + data->proc = sqfs_block_processor_create(super->block_size, cmp, num_jobs, max_backlog, data, block_callback); + if (data->proc == NULL) { + perror("creating data block processor"); + free(data->frag_list); + free(data->blocks); + free(data); + return NULL; + } + data->cmp = cmp; data->super = super; data->file = file; @@ -475,6 +587,7 @@ void data_writer_destroy(data_writer_t *data) { sqfs_block_processor_destroy(data->proc); free(data->fragments); + free(data->blocks); free(data); } diff --git a/lib/sqfshelper/statistics.c b/lib/sqfshelper/statistics.c index 33ff7cb..0fe325b 100644 --- a/lib/sqfshelper/statistics.c +++ b/lib/sqfshelper/statistics.c @@ -30,7 +30,7 @@ void sqfs_print_statistics(fstree_t *fs, sqfs_super_t *super) } for (sparse = 0, i = 0; i < num_blocks; ++i) { - if (fi->blocks[i].size == 0) + if (fi->block_size[i] == 0) sparse += 1; } diff --git a/lib/sqfshelper/tree_node_from_inode.c b/lib/sqfshelper/tree_node_from_inode.c index 68365d0..fee191b 100644 --- a/lib/sqfshelper/tree_node_from_inode.c +++ b/lib/sqfshelper/tree_node_from_inode.c @@ -25,7 +25,7 @@ static size_t compute_size(sqfs_inode_generic_t *inode, const char *name) case SQFS_INODE_EXT_FILE: size += sizeof(file_info_t); size += inode->num_file_blocks * - sizeof(((file_info_t *)0)->blocks[0]); + sizeof(((file_info_t *)0)->block_size[0]); break; case SQFS_INODE_SLINK: case SQFS_INODE_EXT_SLINK: @@ -51,10 +51,10 @@ static void copy_block_sizes(sqfs_inode_generic_t *inode, tree_node_t *out, } out->name += inode->num_file_blocks * - sizeof(out->data.file->blocks[0]); + sizeof(out->data.file->block_size[0]); for (i = 0; i < inode->num_file_blocks; ++i) - out->data.file->blocks[i].size = inode->block_sizes[i]; + out->data.file->block_size[i] = inode->block_sizes[i]; } tree_node_t *tree_node_from_inode(sqfs_inode_generic_t *inode, diff --git a/lib/sqfshelper/tree_node_to_inode.c b/lib/sqfshelper/tree_node_to_inode.c index 7de8abd..cc76a8d 100644 --- a/lib/sqfshelper/tree_node_to_inode.c +++ b/lib/sqfshelper/tree_node_to_inode.c @@ -120,7 +120,7 @@ sqfs_inode_generic_t *tree_node_to_inode(fstree_t *fs, sqfs_id_table_t *idtbl, for (i = 0; i < block_count; ++i) { inode->block_sizes[i] = - node->data.file->blocks[i].size; + node->data.file->block_size[i]; } } diff --git a/mkfs/mkfs.c b/mkfs/mkfs.c index fda9831..0c4e9a5 100644 --- a/mkfs/mkfs.c +++ b/mkfs/mkfs.c @@ -36,10 +36,7 @@ static int pack_files(data_writer_t *data, fstree_t *fs, options_t *opt) if (set_working_dir(opt)) return -1; - while (fs->files != NULL) { - fi = fs->files; - fs->files = fi->next; - + for (fi = fs->files; fi != NULL; fi = fi->next) { if (!opt->quiet) printf("packing %s\n", fi->input_file); diff --git a/tests/mknode_reg.c b/tests/mknode_reg.c index 7d8e934..65de0b6 100644 --- a/tests/mknode_reg.c +++ b/tests/mknode_reg.c @@ -36,7 +36,7 @@ int main(void) assert((char *)node->name >= (char *)node->payload); assert((char *)node->data.file >= (char *)node->payload); assert(node->data.file->input_file > (char *)(node->data.file + 1) + - sizeof(node->data.file->blocks[0]) * 4); + sizeof(node->data.file->block_size[0]) * 4); assert(node->name >= node->data.file->input_file + 6); assert(strcmp(node->name, "filename") == 0); assert(strcmp(node->data.file->input_file, "input") == 0); -- cgit v1.2.3