From cce36f459ddb5698fd1a40061c466996482146eb Mon Sep 17 00:00:00 2001 From: David Oberhollenzer Date: Fri, 26 Jul 2019 23:07:41 +0200 Subject: Implement fragment deduplication in data writer The strategy is simple: - The data writer functions that write data/fragment blocks get access to the list of files. - When writing a fragment, we look for an already written file that has a fragment with the same size and checksum. - If we find one, we throw away the fragment and reuse the existing one. Signed-off-by: David Oberhollenzer --- include/data_writer.h | 13 +++++++++++-- lib/sqfs/data_writer.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- mkfs/mkfs.c | 7 ++++--- tar/tar2sqfs.c | 15 ++++++++++----- 4 files changed, 70 insertions(+), 15 deletions(-) diff --git a/include/data_writer.h b/include/data_writer.h index 7107868..f5ad572 100644 --- a/include/data_writer.h +++ b/include/data_writer.h @@ -66,10 +66,14 @@ int data_writer_flush_fragments(data_writer_t *data); The flags argument is a combination of DW_* flags. + If 'list' is not NULL, it is used for fragment and data block deduplication. + It is assumed that the list is processed in order and scanning stops as soon + as the current file info 'fi' is encountered in the list. + Returns 0 on success, prints errors to stderr. */ int write_data_from_fd(data_writer_t *data, file_info_t *fi, int infd, - int flags); + int flags, file_info_t *list); /* Does the same as write_data_from_fd but the input file is the condensed @@ -78,9 +82,14 @@ int write_data_from_fd(data_writer_t *data, file_info_t *fi, int infd, The flags argument is a combination of DW_* flags. + If 'list' is not NULL, it is used for fragment and data block deduplication. + It is assumed that the list is processed in order and scanning stops as soon + as the current file info 'fi' is encountered in the list. + Returns 0 on success, prints errors to stderr. 
*/ int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi, - int infd, sparse_map_t *map, int flags); + int infd, sparse_map_t *map, int flags, + file_info_t *list); #endif /* DATA_WRITER_H */ diff --git a/lib/sqfs/data_writer.c b/lib/sqfs/data_writer.c index 288bdd7..c350526 100644 --- a/lib/sqfs/data_writer.c +++ b/lib/sqfs/data_writer.c @@ -125,10 +125,39 @@ int data_writer_flush_fragments(data_writer_t *data) return 0; } +static file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum, + size_t frag_size, file_info_t *list, + size_t block_size) +{ + file_info_t *it; + + for (it = list; it != NULL; it = it->next) { + if (it == fi) { + it = NULL; + break; + } + + if (it->fragment == 0xFFFFFFFF) + continue; + + if (it->fragment_offset == 0xFFFFFFFF) + continue; + + if ((it->size % block_size) != frag_size) + continue; + + if (it->fragment_chksum == chksum) + break; + } + + return it; +} + static int flush_data_block(data_writer_t *data, size_t size, - file_info_t *fi, int flags) + file_info_t *fi, int flags, file_info_t *list) { uint32_t out, chksum; + file_info_t *ref; if (is_zero_block(data->block, size)) { fi->blocks[data->block_idx].size = 0; @@ -141,6 +170,16 @@ static int flush_data_block(data_writer_t *data, size_t size, chksum = update_crc32(0, data->block, size); if (size < data->super->block_size && !(flags & DW_DONT_FRAGMENT)) { + ref = fragment_by_chksum(fi, chksum, size, list, + data->super->block_size); + + if (ref != NULL) { + fi->fragment_chksum = ref->fragment_chksum; + fi->fragment_offset = ref->fragment_offset; + fi->fragment = ref->fragment; + return 0; + } + if (data->frag_offset + size > data->super->block_size) { if (data_writer_flush_fragments(data)) return -1; @@ -185,7 +224,7 @@ static int end_file(data_writer_t *data, int flags) } int write_data_from_fd(data_writer_t *data, file_info_t *fi, - int infd, int flags) + int infd, int flags, file_info_t *list) { uint64_t count; size_t diff; @@ -200,7 +239,7 @@ int 
write_data_from_fd(data_writer_t *data, file_info_t *fi, if (read_data(fi->input_file, infd, data->block, diff)) return -1; - if (flush_data_block(data, diff, fi, flags)) + if (flush_data_block(data, diff, fi, flags, list)) return -1; } @@ -208,7 +247,8 @@ int write_data_from_fd(data_writer_t *data, file_info_t *fi, } int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi, - int infd, sparse_map_t *map, int flags) + int infd, sparse_map_t *map, int flags, + file_info_t *list) { size_t start, count, diff; sparse_map_t *m; @@ -260,7 +300,7 @@ int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi, map = map->next; } - if (flush_data_block(data, diff, fi, flags)) + if (flush_data_block(data, diff, fi, flags, list)) return -1; } diff --git a/mkfs/mkfs.c b/mkfs/mkfs.c index a739322..b5c514b 100644 --- a/mkfs/mkfs.c +++ b/mkfs/mkfs.c @@ -1,7 +1,8 @@ /* SPDX-License-Identifier: GPL-3.0-or-later */ #include "mkfs.h" -static int process_file(data_writer_t *data, file_info_t *fi, bool quiet) +static int process_file(data_writer_t *data, file_info_t *fi, bool quiet, + file_info_t *list) { int ret, infd; @@ -14,7 +15,7 @@ static int process_file(data_writer_t *data, file_info_t *fi, bool quiet) return -1; } - ret = write_data_from_fd(data, fi, infd, 0); + ret = write_data_from_fd(data, fi, infd, 0, list); close(infd); return ret; @@ -50,7 +51,7 @@ static int pack_files(data_writer_t *data, fstree_t *fs, options_t *opt) return -1; for (fi = fs->files; fi != NULL; fi = fi->next) { - if (process_file(data, fi, opt->quiet)) + if (process_file(data, fi, opt->quiet, fs->files)) return -1; } diff --git a/tar/tar2sqfs.c b/tar/tar2sqfs.c index 625cec6..836df21 100644 --- a/tar/tar2sqfs.c +++ b/tar/tar2sqfs.c @@ -194,20 +194,20 @@ fail_arg: } static int write_file(tar_header_decoded_t *hdr, file_info_t *fi, - data_writer_t *data) + data_writer_t *data, file_info_t *list) { int ret; if (hdr->sparse != NULL) { ret = write_data_from_fd_condensed(data, fi, 
STDIN_FILENO, - hdr->sparse, 0); + hdr->sparse, 0, list); if (ret) return -1; return skip_padding(STDIN_FILENO, hdr->record_size); } - if (write_data_from_fd(data, fi, STDIN_FILENO, 0)) + if (write_data_from_fd(data, fi, STDIN_FILENO, 0, list)) return -1; return skip_padding(STDIN_FILENO, fi->size); @@ -259,8 +259,13 @@ static int create_node_and_repack_data(tar_header_decoded_t *hdr, fstree_t *fs, return -1; } - if (S_ISREG(hdr->sb.st_mode)) - return write_file(hdr, node->data.file, data); + if (S_ISREG(hdr->sb.st_mode)) { + if (write_file(hdr, node->data.file, data, fs->files)) + return -1; + + node->data.file->next = fs->files; + fs->files = node->data.file; + } return 0; fail_errno: -- cgit v1.2.3