From 256c2458a4fa298c876d8e4a4450cb9a0834b877 Mon Sep 17 00:00:00 2001
From: David Oberhollenzer <david.oberhollenzer@sigma-star.at>
Date: Sat, 27 Jul 2019 00:19:13 +0200
Subject: Implement data block deduplication

The strategy is as follows:
 - At the beginning of every file, remember the current position
 - Once a file is done scan the list of existing files for the following:
   - Look for an existing file that has a block with the same size and
     checksum as the first non-sparse block of the current file
   - After that, every block in the current file has to match in size and
     checksum the ones in the file that we found, from that point onward
   - sparse blocks in either file are skipped
 - If we found a match, we update the current file to point to the first
   matching block and rewind the squashfs image to remove the newly written
   data

This strategy should in theory be able to find an existing file where the
on-disk data *contains* the on-disk data of the current file.

Signed-off-by: David Oberhollenzer <david.oberhollenzer@sigma-star.at>
---
 lib/sqfs/data_writer.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 145 insertions(+), 6 deletions(-)

diff --git a/lib/sqfs/data_writer.c b/lib/sqfs/data_writer.c
index c350526..496d636 100644
--- a/lib/sqfs/data_writer.c
+++ b/lib/sqfs/data_writer.c
@@ -7,6 +7,7 @@
 
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h>
 #include <stdio.h>
 #include <errno.h>
 
@@ -153,6 +154,114 @@ static file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum,
 	return it;
 }
 
+static size_t get_block_count(file_info_t *fi, size_t block_size)
+{
+	size_t count = fi->size / block_size;
+
+	if ((fi->size % block_size != 0) &&
+	    (fi->fragment == 0xFFFFFFFF ||
+	     fi->fragment_offset == 0xFFFFFFFF)) {
+		++count;
+	}
+
+	while (count > 0 && fi->blocks[count - 1].size == 0)
+		--count;
+
+	return count;
+}
+
+static size_t find_first_match(file_info_t *file, file_info_t *cmp,
+			       size_t idx, size_t cmp_blk_count)
+{
+	size_t i;
+
+	for (i = 0; i < cmp_blk_count; ++i) {
+		if (memcmp(file->blocks + idx, cmp->blocks + i,
+			   sizeof(file->blocks[idx])) == 0) {
+			break;
+		}
+	}
+
+	return i;
+}
+
+static uint64_t find_equal_blocks(file_info_t *file, file_info_t *list,
+				  size_t block_size)
+{
+	size_t start, first_match, i, j, block_count, cmp_blk_count;
+	uint64_t location;
+	file_info_t *it;
+
+	block_count = get_block_count(file, block_size);
+	if (block_count == 0)
+		return 0;
+
+	for (start = 0; start < block_count; ++start) {
+		if (file->blocks[start].size != 0)
+			break;
+	}
+
+	location = 0;
+
+	for (it = list; it != NULL; it = it->next) {
+		if (it == file) {
+			it = NULL;
+			break;
+		}
+
+		/* XXX: list assumed to be processed in order, prevent us from
+		   re-checking files we know are duplicates */
+		if (it->startblock < location)
+			continue;
+
+		location = it->startblock;
+
+		cmp_blk_count = get_block_count(it, block_size);
+		if (cmp_blk_count == 0)
+			continue;
+
+		first_match = find_first_match(file, it, start, cmp_blk_count);
+		if (first_match == cmp_blk_count)
+			continue;
+
+		i = start;
+		j = first_match;
+
+		while (i < block_count && j < cmp_blk_count) {
+			if (file->blocks[i].size == 0) {
+				++i;
+				continue;
+			}
+
+			if (it->blocks[j].size == 0) {
+				++j;
+				continue;
+			}
+
+			if (memcmp(it->blocks + j, file->blocks + i,
+				   sizeof(file->blocks[i])) != 0) {
+				break;
+			}
+
+			++i;
+			++j;
+		}
+
+		if (i == block_count)
+			break;
+	}
+
+	if (it == NULL)
+		return 0;
+
+	location = it->startblock;
+
+	for (i = 0; i < first_match; ++i)
+		location += it->blocks[i].size & ((1 << 24) - 1);
+
+	return location;
+}
+
 static int flush_data_block(data_writer_t *data, size_t size,
 			    file_info_t *fi, int flags, file_info_t *list)
 {
@@ -204,8 +313,13 @@ static int flush_data_block(data_writer_t *data, size_t size,
 	return 0;
 }
 
-static int begin_file(data_writer_t *data, file_info_t *fi, int flags)
+static int begin_file(data_writer_t *data, file_info_t *fi,
+		      int flags, off_t *start)
 {
+	*start = lseek(data->outfd, 0, SEEK_CUR);
+	if (*start == (off_t)-1)
+		goto fail_seek;
+
 	if ((flags & DW_ALLIGN_DEVBLK) && allign_file(data) != 0)
 		return -1;
 
@@ -213,14 +327,37 @@ static int begin_file(data_writer_t *data, file_info_t *fi, int flags)
 	fi->sparse = 0;
 	data->block_idx = 0;
 	return 0;
+fail_seek:
+	perror("querying current position on squashfs image");
+	return -1;
 }
 
-static int end_file(data_writer_t *data, int flags)
+static int end_file(data_writer_t *data, file_info_t *fi,
+		    off_t start, int flags, file_info_t *list)
 {
+	uint64_t ref;
+
 	if ((flags & DW_ALLIGN_DEVBLK) && allign_file(data) != 0)
 		return -1;
 
+	ref = find_equal_blocks(fi, list, data->super->block_size);
+
+	if (ref > 0) {
+		fi->startblock = ref;
+
+		if (lseek(data->outfd, start, SEEK_SET) == (off_t)-1)
+			goto fail_seek;
+
+		if (ftruncate(data->outfd, start))
+			goto fail_truncate;
+	}
 	return 0;
+fail_seek:
+	perror("seeking on squashfs image after file deduplication");
+	return -1;
+fail_truncate:
+	perror("truncating squashfs image after file deduplication");
+	return -1;
 }
 
 int write_data_from_fd(data_writer_t *data, file_info_t *fi,
@@ -228,8 +365,9 @@ int write_data_from_fd(data_writer_t *data, file_info_t *fi,
 {
 	uint64_t count;
 	size_t diff;
+	off_t start;
 
-	if (begin_file(data, fi, flags))
+	if (begin_file(data, fi, flags, &start))
 		return -1;
 
 	for (count = fi->size; count != 0; count -= diff) {
@@ -243,7 +381,7 @@ int write_data_from_fd(data_writer_t *data, file_info_t *fi,
 			return -1;
 	}
 
-	return end_file(data, flags);
+	return end_file(data, fi, start, flags, list);
 }
 
 int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi,
@@ -253,8 +391,9 @@ int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi,
 	size_t start, count, diff;
 	sparse_map_t *m;
 	uint64_t offset;
+	off_t location;
 
-	if (begin_file(data, fi, flags))
+	if (begin_file(data, fi, flags, &location))
 		return -1;
 
 	if (map != NULL) {
@@ -304,7 +443,7 @@ int write_data_from_fd_condensed(data_writer_t *data, file_info_t *fi,
 			return -1;
 	}
 
-	return end_file(data, flags);
+	return end_file(data, fi, location, flags, list);
 fail_map_size:
 	fprintf(stderr, "%s: sparse file map spans beyond file size\n",
 		fi->input_file);
-- 
cgit v1.2.3