From ca61276fe9676670afe657c863257c01274c111a Mon Sep 17 00:00:00 2001 From: David Oberhollenzer Date: Sun, 6 Dec 2020 00:35:01 +0100 Subject: libsqfs: implement exact matching in the default block writer. Instead of comparing (compresed, disk-size, checksum) tuples to find block matches, do an exact, byte-for-byte comparison of the data stored on disk to avoid the possibility of a spurious colision. Since this is the desired behaviour, make it the default, optionally overrideable through a flag. Signed-off-by: David Oberhollenzer --- include/sqfs/block_writer.h | 29 +++++++++++++++- lib/sqfs/block_writer.c | 82 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 104 insertions(+), 7 deletions(-) diff --git a/include/sqfs/block_writer.h b/include/sqfs/block_writer.h index 6352f4d..6376d2e 100644 --- a/include/sqfs/block_writer.h +++ b/include/sqfs/block_writer.h @@ -90,6 +90,32 @@ struct sqfs_block_writer_t { sqfs_u64 (*get_block_count)(const sqfs_block_writer_t *wr); }; +/** + * @enum SQFS_BLOCK_WRITER_FLAGS + * + * @brief Flags that can be passed to @ref sqfs_block_writer_create + */ +typedef enum { + /** + * @brief If set, only compare checksums when deduplicating blocks. + * + * Since squashfs-tools-ng version 1.1, the default for the block + * writer is to compare checksum & size for blocks during deduplication + * and then read the potential match back from disk and do a byte for + * byte comparison to make absolutely sure they match. + * + * If this flag is set, the hash & size check is treated as being + * sufficient for block deduplication, which does increase performance, + * but risks data loss or corruption if a hash collision occours. + */ + SQFS_BLOCK_WRITER_HASH_COMPARE_ONLY = 0x01, + + /** + * @brief A combination of all valid flags. + */ + SQFS_BLOCK_WRITER_ALL_FLAGS = 0x01 +} SQFS_BLOCK_WRITER_FLAGS; + #ifdef __cplusplus extern "C" { #endif @@ -102,7 +128,8 @@ extern "C" { * @param file A pointer to a file object that data should be appended to. * @param devblksz The underlying device block size if output data * should be aligned. - * @param flags Currently must be set to 0 or creation will fail. + * @param flags A combination of @ref SQFS_BLOCK_WRITER_FLAGS values. If an + * unknown flag is set, creation will fail. * * @return A pointer to a new block writer on success, NULL on failure. */ diff --git a/lib/sqfs/block_writer.c b/lib/sqfs/block_writer.c index 07780d7..17d045f 100644 --- a/lib/sqfs/block_writer.c +++ b/lib/sqfs/block_writer.c @@ -14,11 +14,15 @@ #include "util.h" #include +#include #define MK_BLK_HASH(chksum, size) \ (((sqfs_u64)(size) << 32) | (sqfs_u64)(chksum)) +#define SIZE_FROM_HASH(hash) ((hash >> 32) & ((1 << 24) - 1)) + #define INIT_BLOCK_COUNT (128) +#define SCRATCH_SIZE (8192) typedef struct { sqfs_u64 offset; @@ -39,6 +43,10 @@ typedef struct { sqfs_u64 start; size_t file_start; + + sqfs_u32 flags; + + sqfs_u8 scratch[]; } block_writer_default_t; static int store_block_location(block_writer_default_t *wr, sqfs_u64 offset, @@ -64,9 +72,42 @@ static int store_block_location(block_writer_default_t *wr, sqfs_u64 offset, return 0; } -static size_t deduplicate_blocks(block_writer_default_t *wr, size_t count) +static int compare_blocks(block_writer_default_t *wr, sqfs_u64 loc_a, + sqfs_u64 loc_b, size_t size) +{ + sqfs_u8 *ptr_a = wr->scratch, *ptr_b = ptr_a + SCRATCH_SIZE / 2; + size_t diff; + int ret; + + while (size > 0) { + diff = SCRATCH_SIZE / 2; + diff = diff > size ? size : diff; + + ret = wr->file->read_at(wr->file, loc_a, ptr_a, diff); + if (ret != 0) + return ret; + + ret = wr->file->read_at(wr->file, loc_b, ptr_b, diff); + if (ret != 0) + return ret; + + if (memcmp(ptr_a, ptr_b, diff) != 0) + return 1; + + size -= diff; + loc_a += diff; + loc_b += diff; + } + + return 0; +} + +static int deduplicate_blocks(block_writer_default_t *wr, size_t count, + size_t *out) { - size_t i, j; + sqfs_u64 loc_a, loc_b; + size_t i, j, sz; + int ret; for (i = 0; i < wr->file_start; ++i) { for (j = 0; j < count; ++j) { @@ -78,11 +119,31 @@ static size_t deduplicate_blocks(block_writer_default_t *wr, size_t count) break; } + if (j != count) + continue; + + if (wr->flags & SQFS_BLOCK_WRITER_HASH_COMPARE_ONLY) + break; + + for (j = 0; j < count; ++j) { + sz = SIZE_FROM_HASH(wr->blocks[i + j].hash); + + loc_a = wr->blocks[i + j].offset; + loc_b = wr->blocks[wr->file_start + j].offset; + + ret = compare_blocks(wr, loc_a, loc_b, sz); + if (ret < 0) + return ret; + if (ret > 0) + break; + } + if (j == count) break; } - return i; + *out = i; + return 0; } static int align_file(block_writer_default_t *wr) @@ -168,7 +229,10 @@ static int write_data_block(sqfs_block_writer_t *base, void *user, if (count == 0) { *location = 0; } else if (!(flags & SQFS_BLK_DONT_DEDUPLICATE)) { - start = deduplicate_blocks(wr, count); + err = deduplicate_blocks(wr, count, &start); + if (err) + return err; + offset = wr->blocks[start].offset; *location = offset; @@ -204,16 +268,22 @@ sqfs_block_writer_t *sqfs_block_writer_create(sqfs_file_t *file, { block_writer_default_t *wr; - if (flags != 0) + if (flags & ~SQFS_BLOCK_WRITER_ALL_FLAGS) return NULL; - wr = calloc(1, sizeof(*wr)); + if (flags & SQFS_BLOCK_WRITER_HASH_COMPARE_ONLY) { + wr = calloc(1, sizeof(*wr)); + } else { + wr = alloc_flex(sizeof(*wr), 1, SCRATCH_SIZE); + } + if (wr == NULL) return NULL; ((sqfs_block_writer_t *)wr)->write_data_block = write_data_block; ((sqfs_block_writer_t *)wr)->get_block_count = get_block_count; ((sqfs_object_t *)wr)->destroy = block_writer_destroy; + wr->flags = flags; wr->file = file; wr->devblksz = devblksz; wr->max_blocks = INIT_BLOCK_COUNT; -- cgit v1.2.3