aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorDavid Oberhollenzer <david.oberhollenzer@sigma-star.at>2019-07-27 00:19:13 +0200
committerDavid Oberhollenzer <david.oberhollenzer@sigma-star.at>2019-07-28 16:33:57 +0200
commit256c2458a4fa298c876d8e4a4450cb9a0834b877 (patch)
treeb8e619b55d0bd497010effce5a475b960d5bb845 /lib
parentcce36f459ddb5698fd1a40061c466996482146eb (diff)
Implement data block deduplication
Implement data block deduplication

The strategy is as follows:

- At the beginning of every file, remember the current position.
- Once a file is done, scan the list of existing files for the following:
  - Look for an existing file that has a block with the same size and
    checksum as the first non-sparse block of the current file.
  - After that, every block in the current file has to match in size and
    checksum the ones in the file that we found, from that point onward.
  - Sparse blocks in either file are skipped.
- If we found a match, we update the current file to point to the first
  matching block and rewind the squashfs image to remove the newly
  written data.

This strategy should in theory be able to find an existing file where the
on-disk data *contains* the on-disk data of the current file.

Signed-off-by: David Oberhollenzer <david.oberhollenzer@sigma-star.at>
Diffstat (limited to 'lib')
-rw-r--r--lib/sqfs/data_writer.c151
1 files changed, 145 insertions, 6 deletions
diff --git a/lib/sqfs/data_writer.c b/lib/sqfs/data_writer.c
index c350526..496d636 100644
--- a/lib/sqfs/data_writer.c
+++ b/lib/sqfs/data_writer.c
@@ -7,6 +7,7 @@
#include <stdlib.h>
#include <string.h>
+#include <unistd.h>
#include <stdio.h>
#include <errno.h>
@@ -153,6 +154,114 @@ static file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum,
return it;
}
+/*
+ * Return the number of data blocks actually stored on disk for `fi`,
+ * given the filesystem block size.
+ *
+ * The partial tail end of the file counts as an extra block only when it
+ * is stored as a regular block, i.e. not packed into a fragment
+ * (0xFFFFFFFF in fragment index/offset presumably means "no fragment" —
+ * NOTE(review): confirm against the file_info_t definition).
+ * Trailing zero-size (sparse) blocks are trimmed from the count.
+ */
+static size_t get_block_count(file_info_t *fi, size_t block_size)
+{
+ size_t count = fi->size / block_size;
+
+ /* partial tail stored as a regular block rather than a fragment */
+ if ((fi->size % block_size != 0) &&
+ (fi->fragment == 0xFFFFFFFF ||
+ fi->fragment_offset == 0xFFFFFFFF)) {
+ ++count;
+ }
+
+ /* drop trailing sparse blocks; they occupy no space on disk */
+ while (count > 0 && fi->blocks[count - 1].size == 0)
+ --count;
+
+ return count;
+}
+
+/*
+ * Scan the first `cmp_blk_count` block records of `cmp` for one that is
+ * byte-identical (same size and checksum record) to block `idx` of
+ * `file`.
+ *
+ * Returns the index of the first matching block in `cmp`, or
+ * `cmp_blk_count` if no block matches — callers test for that sentinel.
+ */
+static size_t find_first_match(file_info_t *file, file_info_t *cmp,
+ size_t idx, size_t cmp_blk_count)
+{
+ size_t i;
+
+ for (i = 0; i < cmp_blk_count; ++i) {
+ if (memcmp(file->blocks + idx, cmp->blocks + i,
+ sizeof(file->blocks[idx])) == 0) {
+ break;
+ }
+ }
+
+ return i;
+}
+
+/*
+ * Try to find an already-written file in `list` whose on-disk block run
+ * contains the block data of `file`.
+ *
+ * A candidate matches if, starting from some block of the candidate, every
+ * remaining non-sparse block of `file` matches in size and checksum the
+ * candidate's blocks in order (sparse blocks on either side are skipped).
+ *
+ * Returns the on-disk location where the duplicate data starts, or 0 if no
+ * match was found (0 presumably cannot be a valid data location because the
+ * superblock sits at the start of the image — NOTE(review): confirm).
+ */
+static uint64_t find_equal_blocks(file_info_t *file, file_info_t *list,
+ size_t block_size)
+{
+ size_t start, first_match, i, j, block_count, cmp_blk_count;
+ uint64_t location;
+ file_info_t *it;
+
+ block_count = get_block_count(file, block_size);
+ if (block_count == 0)
+ return 0;
+
+ /* skip leading sparse blocks; dedup anchors on first real block */
+ for (start = 0; start < block_count; ++start) {
+ if (file->blocks[start].size != 0)
+ break;
+ }
+
+ location = 0;
+
+ for (it = list; it != NULL; it = it->next) {
+ /* reached ourselves: no earlier file matched */
+ if (it == file) {
+ it = NULL;
+ break;
+ }
+
+ /* XXX: list assumed to be processed in order; this prevents us
+ from re-checking files we know are duplicates */
+ if (it->startblock < location)
+ continue;
+
+ location = it->startblock;
+
+ cmp_blk_count = get_block_count(it, block_size);
+ if (cmp_blk_count == 0)
+ continue;
+
+ /* anchor: find where file's first real block occurs in `it` */
+ first_match = find_first_match(file, it, start, cmp_blk_count);
+ if (first_match == cmp_blk_count)
+ continue;
+
+ i = start;
+ j = first_match;
+
+ /* walk both block lists in lockstep, skipping sparse blocks */
+ while (i < block_count && j < cmp_blk_count) {
+ if (file->blocks[i].size == 0) {
+ ++i;
+ continue;
+ }
+
+ if (it->blocks[j].size == 0) {
+ ++j;
+ continue;
+ }
+
+ if (memcmp(it->blocks + j, file->blocks + i,
+ sizeof(file->blocks[i])) != 0) {
+ break;
+ }
+
+ ++i;
+ ++j;
+ }
+
+ /* all of file's blocks were consumed -> full containment */
+ if (i == block_count)
+ break;
+ }
+
+ if (it == NULL)
+ return 0;
+
+ /* compute on-disk offset of the first matching block inside `it`;
+ the low 24 bits of a block record hold its on-disk size, upper
+ bits are flags — NOTE(review): confirm against squashfs format */
+ location = it->startblock;
+
+ for (i = 0; i < first_match; ++i)
+ location += it->blocks[i].size & ((1 << 24) - 1);
+
+ return location;
+}
+
static int flush_data_block(data_writer_t *data, size_t size,
file_info_t *fi, int flags, file_info_t *list)
{
@@ -204,8 +313,13 @@ static int flush_data_block(data_writer_t *data, size_t size,
return 0;
}
-static int begin_file(data_writer_t *data, file_info_t *fi, int flags)
+static int begin_file(data_writer_t *data, file_info_t *fi,
+ int flags, off_t *start)
{
+ *start = lseek(data->outfd, 0, SEEK_CUR);
+ if (*start == (off_t)-1)
+ goto fail_seek;
+
if ((flags & DW_ALLIGN_DEVBLK) && allign_file(data) != 0)
return -1;
@@ -213,14 +327,37 @@ static int begin_file(data_writer_t *data, file_info_t *fi, int flags)
fi->sparse = 0;
data->block_idx = 0;
return 0;
+fail_seek:
+ perror("querying current position on squashfs image");
+ return -1;
}
-static int end_file(data_writer_t *data, int flags)
+/*
+ * Finish writing a file's data blocks, then attempt deduplication: if an
+ * earlier file in `list` already contains this file's block data, point
+ * `fi->startblock` at the existing copy, seek back to `start` (the image
+ * position recorded by begin_file) and truncate away the freshly written
+ * blocks. Returns 0 on success, -1 on seek/truncate failure.
+ */
+static int end_file(data_writer_t *data, file_info_t *fi,
+ off_t start, int flags, file_info_t *list)
{
+ uint64_t ref;
+
if ((flags & DW_ALLIGN_DEVBLK) && allign_file(data) != 0)
return -1;
+ /* ref > 0 means an existing identical block run was found */
+ ref = find_equal_blocks(fi, list, data->super->block_size);
+
+ if (ref > 0) {
+ fi->startblock = ref;
+
+ /* rewind and discard the duplicate data just written */
+ if (lseek(data->outfd, start, SEEK_SET) == (off_t)-1)
+ goto fail_seek;
+
+ if (ftruncate(data->outfd, start))
+ goto fail_truncate;
+ }
return 0;
+fail_seek:
+ perror("seeking on squashfs image after file deduplication");
+ return -1;
+fail_truncate:
+ perror("truncating squashfs image after file deduplication");
+ return -1;
}
int write_data_from_fd(data_writer_t *data, file_info_t *fi,
@@ -228,8 +365,9 @@ int write_data_from_fd(data_writer_t *data, file_info_t *fi,
{
uint64_t count;
size_t diff;
+ off_t start;
- if (begin_file(data, fi, flags))
+ if (begin_file(data, fi, flags, &start))
return -1;
for (count = fi->size; count != 0; count -= diff) {
@@ -243,7 +381,7 @@ int write_data_from_fd(data_writer_t *data, file_info_t *fi,
return -1;
}
- return end_file(data, flags);
+ return end_file(data, fi, start, flags, list);
}
int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi,
@@ -253,8 +391,9 @@ int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi,
size_t start, count, diff;
sparse_map_t *m;
uint64_t offset;
+ off_t location;
- if (begin_file(data, fi, flags))
+ if (begin_file(data, fi, flags, &location))
return -1;
if (map != NULL) {
@@ -304,7 +443,7 @@ int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi,
return -1;
}
- return end_file(data, flags);
+ return end_file(data, fi, location, flags, list);
fail_map_size:
fprintf(stderr, "%s: sparse file map spans beyond file size\n",
fi->input_file);