diff options
| author | David Oberhollenzer <david.oberhollenzer@sigma-star.at> | 2019-07-29 10:57:40 +0200 | 
|---|---|---|
| committer | David Oberhollenzer <david.oberhollenzer@sigma-star.at> | 2019-07-29 10:58:42 +0200 | 
| commit | 864302059679c15bc223d37ed8cef87c5b4a97aa (patch) | |
| tree | 419cdc8670878602c8b615e98937db5da723ca7a | |
| parent | dad72be359d2670dd4993101591e1b14638dedec (diff) | |
Cleanup: move deduplication code from data writer to fstree
Since it is actually completely independend of libsqfs and only works
on file_info_t lists, it can be safely moved over to libfstree and
the data writer becomes less cluttered as a result.
Signed-off-by: David Oberhollenzer <david.oberhollenzer@sigma-star.at>
| -rw-r--r-- | include/fstree.h | 21 | ||||
| -rw-r--r-- | lib/Makemodule.am | 2 | ||||
| -rw-r--r-- | lib/fstree/deduplicate.c | 132 | ||||
| -rw-r--r-- | lib/sqfs/data_writer.c | 127 | 
4 files changed, 154 insertions, 128 deletions
diff --git a/include/fstree.h b/include/fstree.h index 687b693..c327fc6 100644 --- a/include/fstree.h +++ b/include/fstree.h @@ -297,4 +297,25 @@ void tree_node_sort_recursive(tree_node_t *root);  /* resolve a path to a tree node. Returns NULL on failure and sets errno */  tree_node_t *fstree_node_from_path(fstree_t *fs, const char *path); +/* +  Walk through 'list' to find a file with a fragment that has +  the same size ('frag_size') and checksum ('chksum') as 'fi'. +  Processing stopps if 'fi' itself is found in the list. + +  Returns NULL if no such fragment could be found. +*/ +file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum, +				size_t frag_size, file_info_t *list, +				size_t block_size); + +/* +  Walk through 'list' to find a file that contains the same sequence of blocks +  as 'file', comparing size and checksum. Processing stops if 'file' is found +  in the list. + +  Returns NULL if no such fragment could be found. + */ +uint64_t find_equal_blocks(file_info_t *file, file_info_t *list, +			   size_t block_size); +  #endif /* FSTREE_H */ diff --git a/lib/Makemodule.am b/lib/Makemodule.am index 8258136..4288ef1 100644 --- a/lib/Makemodule.am +++ b/lib/Makemodule.am @@ -4,7 +4,7 @@ libfstree_a_SOURCES += lib/fstree/gen_inode_table.c lib/fstree/get_path.c  libfstree_a_SOURCES += lib/fstree/node_stat.c lib/fstree/mknode.c  libfstree_a_SOURCES += lib/fstree/add_by_path.c lib/fstree/xattr.c  libfstree_a_SOURCES += lib/fstree/node_from_path.c include/fstree.h -libfstree_a_SOURCES += lib/fstree/gen_file_list.c +libfstree_a_SOURCES += lib/fstree/gen_file_list.c lib/fstree/deduplicate.c  libfstree_a_CFLAGS = $(AM_CFLAGS)  libfstree_a_CPPFLAGS = $(AM_CPPFLAGS) diff --git a/lib/fstree/deduplicate.c b/lib/fstree/deduplicate.c new file mode 100644 index 0000000..9ae4440 --- /dev/null +++ b/lib/fstree/deduplicate.c @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-3.0-or-later */ +#include "config.h" +#include "fstree.h" + +#include <string.h> + +file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum, +				size_t frag_size, file_info_t *list, +				size_t block_size) +{ +	file_info_t *it; + +	for (it = list; it != NULL; it = it->next) { +		if (it == fi) { +			it = NULL; +			break; +		} + +		if (!(it->flags & FILE_FLAG_HAS_FRAGMENT)) +			continue; + +		if (it->flags & FILE_FLAG_FRAGMENT_IS_DUPLICATE) +			continue; + +		if ((it->size % block_size) != frag_size) +			continue; + +		if (it->fragment_chksum == chksum) +			break; +	} + +	return it; +} + +static size_t get_block_count(file_info_t *fi, size_t block_size) +{ +	size_t count = fi->size / block_size; + +	if ((fi->size % block_size) && !(fi->flags & FILE_FLAG_HAS_FRAGMENT)) +		++count; + +	while (count > 0 && fi->blocks[count - 1].size == 0) +		--count; + +	return count; +} + +static size_t find_first_match(file_info_t *file, file_info_t *cmp, +			       size_t idx, size_t cmp_blk_count) +{ +	size_t i; + +	for (i = 0; i < cmp_blk_count; ++i) { +		if (memcmp(file->blocks + idx, cmp->blocks + i, +			   sizeof(file->blocks[idx])) == 0) { +			break; +		} +	} + +	return i; +} + +uint64_t find_equal_blocks(file_info_t *file, file_info_t *list, +			   size_t block_size) +{ +	size_t start, first_match, i, j, block_count, cmp_blk_count; +	uint64_t location; +	file_info_t *it; + +	block_count = get_block_count(file, block_size); +	if (block_count == 0) +		return 0; + +	for (start = 0; start < block_count; ++start) { +		if (file->blocks[start].size != 0) +			break; +	} + +	for (it = list; it != NULL; it = it->next) { +		if (it == file) { +			it = NULL; +			break; +		} + +		if (it->flags & FILE_FLAG_BLOCKS_ARE_DUPLICATE) +			continue; + +		cmp_blk_count = get_block_count(it, block_size); +		if (cmp_blk_count == 0) +			continue; + +		first_match = find_first_match(file, it, start, cmp_blk_count); +		if (first_match == cmp_blk_count) +			continue; + +		i = start; +		j = first_match; + +		while (i < block_count && j < cmp_blk_count) { +			if (file->blocks[i].size == 0) { +				++i; +				continue; +			} + +			if (it->blocks[j].size == 0) { +				++j; +				continue; +			} + +			if (memcmp(it->blocks + j, file->blocks + i, +				   sizeof(file->blocks[i])) != 0) { +				break; +			} + +			++i; +			++j; +		} + +		if (i == block_count) +			break; +	} + +	if (it == NULL) +		return 0; + +	location = it->startblock; + +	for (i = 0; i < first_match; ++i) +		location += it->blocks[i].size & ((1 << 24) - 1); + +	return location; +} diff --git a/lib/sqfs/data_writer.c b/lib/sqfs/data_writer.c index fddb9d3..4324f06 100644 --- a/lib/sqfs/data_writer.c +++ b/lib/sqfs/data_writer.c @@ -126,133 +126,6 @@ int data_writer_flush_fragments(data_writer_t *data)  	return 0;  } -static file_info_t *fragment_by_chksum(file_info_t *fi, uint32_t chksum, -				       size_t frag_size, file_info_t *list, -				       size_t block_size) -{ -	file_info_t *it; - -	for (it = list; it != NULL; it = it->next) { -		if (it == fi) { -			it = NULL; -			break; -		} - -		if (!(it->flags & FILE_FLAG_HAS_FRAGMENT)) -			continue; - -		if (it->flags & FILE_FLAG_FRAGMENT_IS_DUPLICATE) -			continue; - -		if ((it->size % block_size) != frag_size) -			continue; - -		if (it->fragment_chksum == chksum) -			break; -	} - -	return it; -} - -static size_t get_block_count(file_info_t *fi, size_t block_size) -{ -	size_t count = fi->size / block_size; - -	if ((fi->size % block_size) && !(fi->flags & FILE_FLAG_HAS_FRAGMENT)) -		++count; - -	while (count > 0 && fi->blocks[count - 1].size == 0) -		--count; - -	return count; -} - -static size_t find_first_match(file_info_t *file, file_info_t *cmp, -			       size_t idx, size_t cmp_blk_count) -{ -	size_t i; - -	for (i = 0; i < cmp_blk_count; ++i) { -		if (memcmp(file->blocks + idx, cmp->blocks + i, -			   sizeof(file->blocks[idx])) == 0) { -			break; -		} -	} - -	return i; -} - -static uint64_t find_equal_blocks(file_info_t *file, file_info_t *list, -				  size_t block_size) -{ -	size_t start, first_match, i, j, block_count, cmp_blk_count; -	uint64_t location; -	file_info_t *it; - -	block_count = get_block_count(file, block_size); -	if (block_count == 0) -		return 0; - -	for (start = 0; start < block_count; ++start) { -		if (file->blocks[start].size != 0) -			break; -	} - -	for (it = list; it != NULL; it = it->next) { -		if (it == file) { -			it = NULL; -			break; -		} - -		if (it->flags & FILE_FLAG_BLOCKS_ARE_DUPLICATE) -			continue; - -		cmp_blk_count = get_block_count(it, block_size); -		if (cmp_blk_count == 0) -			continue; - -		first_match = find_first_match(file, it, start, cmp_blk_count); -		if (first_match == cmp_blk_count) -			continue; - -		i = start; -		j = first_match; - -		while (i < block_count && j < cmp_blk_count) { -			if (file->blocks[i].size == 0) { -				++i; -				continue; -			} - -			if (it->blocks[j].size == 0) { -				++j; -				continue; -			} - -			if (memcmp(it->blocks + j, file->blocks + i, -				   sizeof(file->blocks[i])) != 0) { -				break; -			} - -			++i; -			++j; -		} - -		if (i == block_count) -			break; -	} - -	if (it == NULL) -		return 0; - -	location = it->startblock; - -	for (i = 0; i < first_match; ++i) -		location += it->blocks[i].size & ((1 << 24) - 1); - -	return location; -} -  static int flush_data_block(data_writer_t *data, size_t size,  			    file_info_t *fi, int flags, file_info_t *list)  {  | 
