aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Oberhollenzer <david.oberhollenzer@sigma-star.at>2019-07-26 23:07:41 +0200
committerDavid Oberhollenzer <david.oberhollenzer@sigma-star.at>2019-07-28 16:33:57 +0200
commitcce36f459ddb5698fd1a40061c466996482146eb (patch)
tree05037e8f8d18c850a2f8afeac427e3dcada717dc
parente480a7459377817fbab7029a7dad47c969b8dd97 (diff)
Implement fragment deduplication in data writer
The strategy is simple: - The data writer functions that write data/fragment blocks get access to the list of files. - When writing a fragment, we look for an already written file that has a fragment with the same size and checksum. - If we find one, we throw away the fragment and reuse the existing one. Signed-off-by: David Oberhollenzer <david.oberhollenzer@sigma-star.at>
-rw-r--r--include/data_writer.h13
-rw-r--r--lib/sqfs/data_writer.c50
-rw-r--r--mkfs/mkfs.c7
-rw-r--r--tar/tar2sqfs.c15
4 files changed, 70 insertions, 15 deletions
diff --git a/include/data_writer.h b/include/data_writer.h
index 7107868..f5ad572 100644
--- a/include/data_writer.h
+++ b/include/data_writer.h
@@ -66,10 +66,14 @@ int data_writer_flush_fragments(data_writer_t *data);
The flags argument is a combination of DW_* flags.
+ If 'list' is not NULL, it is used for fragment and data block deduplication.
+ It is assumed that the list is processed in order and scanning stops as soon
+ as the current file info 'fi' is encountered in the list.
+
Returns 0 on success, prints errors to stderr.
*/
int write_data_from_fd(data_writer_t *data, file_info_t *fi, int infd,
- int flags);
+ int flags, file_info_t *list);
/*
Does the same as write_data_from_fd but the input file is the condensed
@@ -78,9 +82,14 @@ int write_data_from_fd(data_writer_t *data, file_info_t *fi, int infd,
The flags argument is a combination of DW_* flags.
+ If 'list' is not NULL, it is used for fragment and data block deduplication.
+ It is assumed that the list is processed in order and scanning stops as soon
+ as the current file info 'fi' is encountered in the list.
+
Returns 0 on success, prints errors to stderr.
*/
int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi,
- int infd, sparse_map_t *map, int flags);
+ int infd, sparse_map_t *map, int flags,
+ file_info_t *list);
#endif /* DATA_WRITER_H */
diff --git a/lib/sqfs/data_writer.c b/lib/sqfs/data_writer.c
index 288bdd7..c350526 100644
--- a/lib/sqfs/data_writer.c
+++ b/lib/sqfs/data_writer.c
@@ -125,10 +125,39 @@ int data_writer_flush_fragments(data_writer_t *data)
return 0;
}
+static file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum,
+ size_t frag_size, file_info_t *list,
+ size_t block_size)
+{ /* Search 'list' for an entry ahead of 'fi' whose fragment matches frag_size and chksum; returns that entry, or NULL if there is none. */
+ file_info_t *it;
+
+ for (it = list; it != NULL; it = it->next) { /* walk the list from its head */
+ if (it == fi) { /* reached the current file: stop scanning */
+ it = NULL; /* report "no match" to the caller */
+ break;
+ }
+
+ if (it->fragment == 0xFFFFFFFF) /* presumably 0xFFFFFFFF means "no fragment" - TODO confirm */
+ continue;
+
+ if (it->fragment_offset == 0xFFFFFFFF) /* presumably "no fragment offset assigned" - TODO confirm */
+ continue;
+
+ if ((it->size % block_size) != frag_size) /* bytes left over after whole blocks must equal frag_size */
+ continue;
+
+ if (it->fragment_chksum == chksum) /* NOTE(review): only size and checksum are compared here, not the data itself */
+ break;
+ }
+
+ return it; /* matching entry, or NULL when the list ran out or 'fi' was reached */
+}
+
static int flush_data_block(data_writer_t *data, size_t size,
- file_info_t *fi, int flags)
+ file_info_t *fi, int flags, file_info_t *list)
{
uint32_t out, chksum;
+ file_info_t *ref;
if (is_zero_block(data->block, size)) {
fi->blocks[data->block_idx].size = 0;
@@ -141,6 +170,16 @@ static int flush_data_block(data_writer_t *data, size_t size,
chksum = update_crc32(0, data->block, size);
if (size < data->super->block_size && !(flags & DW_DONT_FRAGMENT)) {
+ ref = fragment_by_chksum(fi, chksum, size, list,
+ data->super->block_size);
+
+ if (ref != NULL) {
+ fi->fragment_chksum = ref->fragment_chksum;
+ fi->fragment_offset = ref->fragment_offset;
+ fi->fragment = ref->fragment;
+ return 0;
+ }
+
if (data->frag_offset + size > data->super->block_size) {
if (data_writer_flush_fragments(data))
return -1;
@@ -185,7 +224,7 @@ static int end_file(data_writer_t *data, int flags)
}
int write_data_from_fd(data_writer_t *data, file_info_t *fi,
- int infd, int flags)
+ int infd, int flags, file_info_t *list)
{
uint64_t count;
size_t diff;
@@ -200,7 +239,7 @@ int write_data_from_fd(data_writer_t *data, file_info_t *fi,
if (read_data(fi->input_file, infd, data->block, diff))
return -1;
- if (flush_data_block(data, diff, fi, flags))
+ if (flush_data_block(data, diff, fi, flags, list))
return -1;
}
@@ -208,7 +247,8 @@ int write_data_from_fd(data_writer_t *data, file_info_t *fi,
}
int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi,
- int infd, sparse_map_t *map, int flags)
+ int infd, sparse_map_t *map, int flags,
+ file_info_t *list)
{
size_t start, count, diff;
sparse_map_t *m;
@@ -260,7 +300,7 @@ int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi,
map = map->next;
}
- if (flush_data_block(data, diff, fi, flags))
+ if (flush_data_block(data, diff, fi, flags, list))
return -1;
}
diff --git a/mkfs/mkfs.c b/mkfs/mkfs.c
index a739322..b5c514b 100644
--- a/mkfs/mkfs.c
+++ b/mkfs/mkfs.c
@@ -1,7 +1,8 @@
/* SPDX-License-Identifier: GPL-3.0-or-later */
#include "mkfs.h"
-static int process_file(data_writer_t *data, file_info_t *fi, bool quiet)
+static int process_file(data_writer_t *data, file_info_t *fi, bool quiet,
+ file_info_t *list)
{
int ret, infd;
@@ -14,7 +15,7 @@ static int process_file(data_writer_t *data, file_info_t *fi, bool quiet)
return -1;
}
- ret = write_data_from_fd(data, fi, infd, 0);
+ ret = write_data_from_fd(data, fi, infd, 0, list);
close(infd);
return ret;
@@ -50,7 +51,7 @@ static int pack_files(data_writer_t *data, fstree_t *fs, options_t *opt)
return -1;
for (fi = fs->files; fi != NULL; fi = fi->next) {
- if (process_file(data, fi, opt->quiet))
+ if (process_file(data, fi, opt->quiet, fs->files))
return -1;
}
diff --git a/tar/tar2sqfs.c b/tar/tar2sqfs.c
index 625cec6..836df21 100644
--- a/tar/tar2sqfs.c
+++ b/tar/tar2sqfs.c
@@ -194,20 +194,20 @@ fail_arg:
}
static int write_file(tar_header_decoded_t *hdr, file_info_t *fi,
- data_writer_t *data)
+ data_writer_t *data, file_info_t *list)
{
int ret;
if (hdr->sparse != NULL) {
ret = write_data_from_fd_condensed(data, fi, STDIN_FILENO,
- hdr->sparse, 0);
+ hdr->sparse, 0, list);
if (ret)
return -1;
return skip_padding(STDIN_FILENO, hdr->record_size);
}
- if (write_data_from_fd(data, fi, STDIN_FILENO, 0))
+ if (write_data_from_fd(data, fi, STDIN_FILENO, 0, list))
return -1;
return skip_padding(STDIN_FILENO, fi->size);
@@ -259,8 +259,13 @@ static int create_node_and_repack_data(tar_header_decoded_t *hdr, fstree_t *fs,
return -1;
}
- if (S_ISREG(hdr->sb.st_mode))
- return write_file(hdr, node->data.file, data);
+ if (S_ISREG(hdr->sb.st_mode)) {
+ if (write_file(hdr, node->data.file, data, fs->files))
+ return -1;
+
+ node->data.file->next = fs->files;
+ fs->files = node->data.file;
+ }
return 0;
fail_errno: