From e5c4f61a622d84c5346d0fec1ed788a16f4d4a01 Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 07:05:30 +1000 Subject: [PATCH 01/17] print RmFile size (size==192) --- lib/session.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/session.c b/lib/session.c index 45bf41e5..edc3b5d7 100644 --- a/lib/session.c +++ b/lib/session.c @@ -76,6 +76,7 @@ bool rm_session_check_kernel_version(int need_major, int need_minor) { } void rm_session_init(RmSession *session, RmCfg *cfg) { + rm_log_warning("Sizeof(RmFile): %lu\n", sizeof(RmFile)); memset(session, 0, sizeof(RmSession)); session->timer = g_timer_new(); From fdf71059056b2d467ba621523411fea238775366 Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 07:01:46 +1000 Subject: [PATCH 02/17] file: retire RmFileState enum in favour of bit flag; sizeof(RmFile)==184 --- lib/file.h | 19 +++---------------- lib/shredder.c | 9 +++------ 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/lib/file.h b/lib/file.h index 671e61fd..aec168a8 100644 --- a/lib/file.h +++ b/lib/file.h @@ -29,18 +29,6 @@ #include "cfg.h" -typedef enum RmFileState { - /* File still processing - */ - RM_FILE_STATE_NORMAL, - - /* File can be ignored, has a unique hash, gets read failure - * or is elsewhise not noteworthy. - */ - RM_FILE_STATE_IGNORE, - -} RmFileState; - /* types of lint */ typedef enum RmLintType { RM_LINT_TYPE_UNKNOWN = 0, @@ -183,6 +171,9 @@ typedef struct RmFile { /* Set to true if was read from [json] cache as an original */ bool cached_original : 1; + /* File hashing failed (probably read error or user interrupt) */ + bool hashing_failed : 1; + /* The pre-matched file cluster that this file belongs to (or NULL) */ GQueue *cluster; @@ -207,10 +198,6 @@ typedef struct RmFile { */ RmOff hash_offset; - /* Flag for when we do intermediate steps within a hash increment because the file is - * fragmented */ - RmFileState status; - /* digest of this file updated on every hash iteration. Use a pointer so we can share * with RmShredGroup */ diff --git a/lib/shredder.c b/lib/shredder.c index 797e1b30..f5dd3db9 100644 --- a/lib/shredder.c +++ b/lib/shredder.c @@ -520,7 +520,6 @@ static gint32 rm_shred_get_read_size(RmFile *file, RmShredTag *tag) { MIN(group->next_offset, group->hash_offset + SHRED_PARANOID_BYTES); } - file->status = RM_FILE_STATE_NORMAL; result = (group->next_offset - file->hash_offset); return result; @@ -994,10 +993,8 @@ static RmFile *rm_shred_sift(RmFile *file) { g_list_remove(current_group->in_progress_digests, file->digest); } - if(file->status == RM_FILE_STATE_IGNORE) { - /* reading/hashing failed somewhere */ + if(file->hashing_failed) { rm_shred_discard_file(file, NULL); - } else { g_assert(file->digest); @@ -1651,7 +1648,7 @@ static gint rm_shred_process_file(RmFile *file, RmSession *session) { RmShredTag *tag = session->shredder; if(rm_session_was_aborted() || session->equal_exit_code == EXIT_FAILURE) { - file->status = RM_FILE_STATE_IGNORE; + file->hashing_failed = TRUE; rm_shred_sift(file); return 1; } @@ -1676,7 +1673,7 @@ static gint rm_shred_process_file(RmFile *file, RmSession *session) { if(!rm_hasher_task_hash(task, file_path, file->hash_offset, bytes_to_read, file->is_symlink, &bytes_read)) { /* rm_hasher_start_increment failed somewhere */ - file->status = RM_FILE_STATE_IGNORE; + file->hashing_failed = TRUE; shredder_waiting = FALSE; } From fc7f406b9d2598d1c0e5f65401c48c82180d70fe Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 07:51:10 +1000 Subject: [PATCH 03/17] file: re-order fields within struct -> sizeof(RmFile)==168 --- lib/file.h | 182 +++++++++++++++++++++++++++++------------------------ 1 file changed, 99 insertions(+), 83 deletions(-) diff --git a/lib/file.h b/lib/file.h index aec168a8..7c750484 100644 --- a/lib/file.h +++ b/lib/file.h @@ -95,18 +95,100 @@ struct RmDirectory; * RmFile structure; used by pretty much all rmlint modules. */ typedef struct RmFile { + + /*----- 64-bit types ----- */ + /* file path lookup ID (if using swap table) * */ RmOff path_id; - /* file path as node of folder n-ary tree - * */ - RmNode *node; + /* The index of the path this file belongs to. */ + RmOff path_index; + + /* Filesize in bytes; this may be less than actual_file_size, + * since -q / -Q may limit this number. + */ + RmOff file_size; + + /* Filesize of a file when it was traversed by rmlint. + */ + RmOff actual_file_size; + + /* How many bytes were already read. + * (lower or equal file_size) + */ + RmOff hash_offset; + + /* Those are never used at the same time. + * disk_offset is used during computation, + * twin_count during output. + */ + union { + /* Count of twins of this file. + * (i.e. length of group of this file) + */ + gint64 twin_count; + + /* Disk fiemap / physical offset at start of file (tests mapping subsequent + * file fragments did not deliver any significant additionl benefit) */ + RmOff disk_offset; + }; /* File modification date/time * */ gdouble mtime; + /* Number of children this file has. + * Only filled if type is RM_LINT_TYPE_PART_OF_DIRECTORY. + * */ + size_t n_children; + + + /*----- pointer types ----- */ + + /* The pre-matched file cluster that this file belongs to (or NULL) */ + GQueue *cluster; + + /* pointer to hardlinks collection (or NULL); one list shared between hardlink twin + * set */ + GQueue *hardlinks; + + /* digest of this file updated on every hash iteration. Use a pointer so we can share + * with RmShredGroup + */ + RmDigest *digest; + + /* digest of this file read from file extended attributes (previously written by + * rmlint) + */ + const char *ext_cksum; + + /* multi-disk scheduler disk link */ + struct _RmMDSDevice *disk; + + /* file path as node of folder n-ary tree + * */ + RmNode *node; + + /* Link to the RmShredGroup that the file currently belongs to */ + struct RmShredGroup *shred_group; + + /* Required for rm_file_equal and for RM_DEFINE_PATH */ + const struct RmSession *session; + + struct RmSignal *signal; + + /* Parent directory. + * Only filled if type is RM_LINT_TYPE_PART_OF_DIRECTORY. + */ + struct RmDirectory *parent_dir; + + /*----- 32-bit types ----- */ + + guint ref_count; + + /*----- 16-bit types ----- */ + /* Depth of the file, relative to the command-line path it was found under. */ gint16 depth; @@ -121,7 +203,20 @@ typedef struct RmFile { * */ gint16 outer_link_count; - struct _RmMDSDevice *disk; + + /* What kind of lint this file is. + */ + RmLintType lint_type; + + /* Caching bitmasks to ensure each file is only matched once + * for every GRegex combination. + * See also preprocess.c for more explanation. + * */ + RmPatternBitmask pattern_bitmask_path; + RmPatternBitmask pattern_bitmask_basename; + + + /*----- bitfield types ----- */ /* True if the file is a symlink * shredder needs to know this, since the metadata might be about the @@ -174,85 +269,6 @@ typedef struct RmFile { /* File hashing failed (probably read error or user interrupt) */ bool hashing_failed : 1; - /* The pre-matched file cluster that this file belongs to (or NULL) */ - GQueue *cluster; - - /* pointer to hardlinks collection (or NULL); one list shared between hardlink twin - * set */ - GQueue *hardlinks; - - /* The index of the path this file belongs to. */ - RmOff path_index; - - /* Filesize in bytes; this may be less than actual_file_size, - * since -q / -Q may limit this number. - */ - RmOff file_size; - - /* Filesize of a file when it was traversed by rmlint. - */ - RmOff actual_file_size; - - /* How many bytes were already read. - * (lower or equal file_size) - */ - RmOff hash_offset; - - /* digest of this file updated on every hash iteration. Use a pointer so we can share - * with RmShredGroup - */ - RmDigest *digest; - - /* digest of this file read from file extended attributes (previously written by - * rmlint) - */ - const char *ext_cksum; - - /* Those are never used at the same time. - * disk_offset is used during computation, - * twin_count during output. - */ - union { - /* Count of twins of this file. - * (i.e. length of group of this file) - */ - gint64 twin_count; - - /* Disk fiemap / physical offset at start of file (tests mapping subsequent - * file fragments did not deliver any significant additionl benefit) */ - RmOff disk_offset; - }; - - /* What kind of lint this file is. - */ - RmLintType lint_type; - - /* Link to the RmShredGroup that the file currently belongs to */ - struct RmShredGroup *shred_group; - - /* Required for rm_file_equal and for RM_DEFINE_PATH */ - const struct RmSession *session; - - struct RmSignal *signal; - - /* Caching bitmasks to ensure each file is only matched once - * for every GRegex combination. - * See also preprocess.c for more explanation. - * */ - RmPatternBitmask pattern_bitmask_path; - RmPatternBitmask pattern_bitmask_basename; - - /* Parent directory. - * Only filled if type is RM_LINT_TYPE_PART_OF_DIRECTORY. - */ - struct RmDirectory *parent_dir; - - /* Number of children this file has. - * Only filled if type is RM_LINT_TYPE_PART_OF_DIRECTORY. - * */ - size_t n_children; - - guint ref_count; } RmFile; /* Defines a path variable containing the file's path */ From a786f113172e52cb5773a00e44cd17cefcc4a5e9 Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 07:53:49 +1000 Subject: [PATCH 04/17] file: remove redundant path_id -> sizeof(RmFile)==160 --- lib/file.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/file.h b/lib/file.h index 7c750484..7122ad36 100644 --- a/lib/file.h +++ b/lib/file.h @@ -98,10 +98,6 @@ typedef struct RmFile { /*----- 64-bit types ----- */ - /* file path lookup ID (if using swap table) - * */ - RmOff path_id; - /* The index of the path this file belongs to. */ RmOff path_index; From 6d3514f57f24aa18cbe3f6fdebc62e7f9b587585 Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 07:55:28 +1000 Subject: [PATCH 05/17] file: only use 32 bits for n_children; sizeof(RmFile)==160 --- lib/file.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/file.h b/lib/file.h index 7122ad36..b28d51d6 100644 --- a/lib/file.h +++ b/lib/file.h @@ -134,11 +134,6 @@ typedef struct RmFile { * */ gdouble mtime; - /* Number of children this file has. - * Only filled if type is RM_LINT_TYPE_PART_OF_DIRECTORY. - * */ - size_t n_children; - /*----- pointer types ----- */ @@ -183,6 +178,12 @@ typedef struct RmFile { guint ref_count; + /* Number of children this file has. + * Only filled if type is RM_LINT_TYPE_PART_OF_DIRECTORY. + * */ + guint32 n_children; + + /*----- 16-bit types ----- */ /* Depth of the file, relative to the command-line path it was found under. From 50add6ab476218b22e9fc4804ccacb8fe1c3d131 Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 07:58:06 +1000 Subject: [PATCH 06/17] file: use 16 bits for path_index -> sizeof(RmFile)==152 --- lib/file.h | 6 +++--- lib/shredder.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/file.h b/lib/file.h index b28d51d6..33859791 100644 --- a/lib/file.h +++ b/lib/file.h @@ -98,9 +98,6 @@ typedef struct RmFile { /*----- 64-bit types ----- */ - /* The index of the path this file belongs to. */ - RmOff path_index; - /* Filesize in bytes; this may be less than actual_file_size, * since -q / -Q may limit this number. */ @@ -186,6 +183,9 @@ typedef struct RmFile { /*----- 16-bit types ----- */ + /* The index of the path this file belongs to. */ + guint16 path_index; + /* Depth of the file, relative to the command-line path it was found under. */ gint16 depth; diff --git a/lib/shredder.c b/lib/shredder.c index f5dd3db9..40a47ab8 100644 --- a/lib/shredder.c +++ b/lib/shredder.c @@ -1102,7 +1102,7 @@ static void rm_shred_file_preprocess(RmFile *file, RmShredGroup **group) { /* add reference for this file to the MDS scheduler, and get pointer to its device */ file->disk = rm_mds_device_get( session->mds, file_path, - (cfg->fake_pathindex_as_disk) ? file->path_index + 1 : rm_file_dev(file)); + (cfg->fake_pathindex_as_disk) ? (dev_t)file->path_index + 1 : rm_file_dev(file)); rm_mds_device_ref(file->disk, 1); rm_shred_adjust_counters(shredder, 1, (gint64)file->file_size - file->hash_offset); From e1057c92938caa465591fb00353e63c6a61fcc5e Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 08:00:11 +1000 Subject: [PATCH 07/17] file: only need 4 bits (just!) for RmLintType -> sizeof(RmFile)==144 --- lib/file.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/file.h b/lib/file.h index 33859791..7298d141 100644 --- a/lib/file.h +++ b/lib/file.h @@ -201,10 +201,6 @@ typedef struct RmFile { gint16 outer_link_count; - /* What kind of lint this file is. - */ - RmLintType lint_type; - /* Caching bitmasks to ensure each file is only matched once * for every GRegex combination. * See also preprocess.c for more explanation. @@ -215,6 +211,11 @@ typedef struct RmFile { /*----- bitfield types ----- */ + /* What kind of lint this file is. + */ + RmLintType lint_type : 4; + + /* True if the file is a symlink * shredder needs to know this, since the metadata might be about the * symlink file itself, while open() returns the pointed file. From d8762fa08ae273ae72a3acb0f9440b129e79fb9b Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 08:03:25 +1000 Subject: [PATCH 08/17] file: only use 16 bits for ref_count -> sizeof(RmFile)==144 breaks threadsafe-ness but should be ok since file unrefs are all single-threaded --- lib/file.c | 4 ++-- lib/file.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/file.c b/lib/file.c index 93fd6ffd..e790cc65 100644 --- a/lib/file.c +++ b/lib/file.c @@ -138,7 +138,7 @@ static gint rm_file_unref_impl(RmFile *file, gboolean unref_hardlinks) { if(!file) { return 0; } - if(!g_atomic_int_dec_and_test(&file->ref_count)) { + if(--file->ref_count != 0) { // somebody still loves me! return 0; } @@ -175,7 +175,7 @@ gint rm_file_unref_full(RmFile *file) { } RmFile *rm_file_ref(RmFile *file) { - g_atomic_int_inc (&file->ref_count); + file->ref_count++; return file; } diff --git a/lib/file.h b/lib/file.h index 7298d141..a4adee71 100644 --- a/lib/file.h +++ b/lib/file.h @@ -173,8 +173,6 @@ typedef struct RmFile { /*----- 32-bit types ----- */ - guint ref_count; - /* Number of children this file has. * Only filled if type is RM_LINT_TYPE_PART_OF_DIRECTORY. * */ @@ -200,6 +198,8 @@ typedef struct RmFile { * */ gint16 outer_link_count; + guint16 ref_count; + /* Caching bitmasks to ensure each file is only matched once * for every GRegex combination. @@ -314,7 +314,7 @@ RmFile *rm_file_new(struct RmSession *session, const char *path, RmStat *statp, /** * @brief Decrease reference count to file; * free resources if this is the last reference. - * @note threadsafe + * @note NOT threadsafe * @note nullable * @retval 1 if the file freed else 0 */ From a575d6ef2646c1fac1abaa6df79efecaa88e02d9 Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 08:23:25 +1000 Subject: [PATCH 09/17] file: don't cache RmMDSDevice *disk reference -> sizeof(RmFile)==136 --- lib/file.h | 9 ++++++--- lib/shredder.c | 31 ++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/lib/file.h b/lib/file.h index a4adee71..92a715e0 100644 --- a/lib/file.h +++ b/lib/file.h @@ -151,9 +151,6 @@ typedef struct RmFile { */ const char *ext_cksum; - /* multi-disk scheduler disk link */ - struct _RmMDSDevice *disk; - /* file path as node of folder n-ary tree * */ RmNode *node; @@ -267,6 +264,12 @@ typedef struct RmFile { /* File hashing failed (probably read error or user interrupt) */ bool hashing_failed : 1; + /* is on a spinning disk medium */ + bool is_on_rotational_disk : 1; + + /* true if mds disk needs unref */ + bool has_disk_ref : 1; + } RmFile; /* Defines a path variable containing the file's path */ diff --git a/lib/shredder.c b/lib/shredder.c index 40a47ab8..afd6cba0 100644 --- a/lib/shredder.c +++ b/lib/shredder.c @@ -705,6 +705,13 @@ static void rm_shred_write_group_to_xattr(const RmSession *session, GQueue *grou } } +static RmMDSDevice *rm_shred_disk(RmFile *file, const RmSession *session) { + dev_t dev = (session->cfg->fake_pathindex_as_disk) ? + (dev_t)file->path_index + 1 : + rm_file_dev(file); + return rm_mds_device_get(session->mds, NULL, dev); +} + /* Unlink RmFile from Shredder */ static void rm_shred_discard_file(RmFile *file, _UNUSED gpointer user_data) { @@ -712,9 +719,9 @@ static void rm_shred_discard_file(RmFile *file, _UNUSED gpointer user_data) { RmShredTag *tag = session->shredder; /* update device counters (unless this file was a bundled hardlink) */ - if(file->disk) { - rm_mds_device_ref(file->disk, -1); - file->disk = NULL; + if(file->has_disk_ref) { + rm_mds_device_ref(rm_shred_disk(file, session), -1); + file->has_disk_ref = FALSE; rm_shred_adjust_counters(tag, -1, -(gint64)(file->file_size - file->hash_offset)); } @@ -736,7 +743,8 @@ static void rm_shred_push_queue(RmFile *file) { file->disk_offset = rm_file_inode(file); } } - rm_mds_push_task(file->disk, rm_file_dev(file), file->disk_offset, NULL, file); + rm_mds_push_task(rm_shred_disk(file, file->session), rm_file_dev(file), + file->disk_offset, NULL, file); } ////////////////////////////////// @@ -1099,11 +1107,11 @@ static void rm_shred_file_preprocess(RmFile *file, RmShredGroup **group) { RM_DEFINE_PATH(file); - /* add reference for this file to the MDS scheduler, and get pointer to its device */ - file->disk = rm_mds_device_get( - session->mds, file_path, - (cfg->fake_pathindex_as_disk) ? (dev_t)file->path_index + 1 : rm_file_dev(file)); - rm_mds_device_ref(file->disk, 1); + /* add reference for this file to the MDS scheduler */ + RmMDSDevice *device = rm_shred_disk(file, session); + file->is_on_rotational_disk = rm_mds_device_is_rotational(device); + rm_mds_device_ref(device, 1); + file->has_disk_ref = TRUE; rm_shred_adjust_counters(shredder, 1, (gint64)file->file_size - file->hash_offset); @@ -1665,7 +1673,7 @@ static gint rm_shred_process_file(RmFile *file, RmSession *session) { gboolean shredder_waiting = (file->shred_group->next_offset != file->file_size) && (cfg->shred_always_wait || - (!cfg->shred_never_wait && rm_mds_device_is_rotational(file->disk) && + (!cfg->shred_never_wait && file->is_on_rotational_disk && bytes_to_read < SHRED_TOO_MANY_BYTES_TO_WAIT)); gsize bytes_read = 0; @@ -1718,7 +1726,8 @@ static gint rm_shred_process_file(RmFile *file, RmSession *session) { } if(file) { /* file was not handled by rm_shred_sift so we need to add it back to the queue */ - rm_mds_push_task(file->disk, rm_file_dev(file), file->disk_offset, NULL, file); + rm_mds_push_task(rm_shred_disk(file, session), rm_file_dev(file), + file->disk_offset, NULL, file); } return result; } From 1fc48aba83159e9d0692614659c4d5c46565996a Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 08:27:51 +1000 Subject: [PATCH 10/17] file: remove redundant fadvise_requested flag -> sizeof(RmFile)==136 --- lib/file.h | 3 --- lib/shredder.c | 5 ----- 2 files changed, 8 deletions(-) diff --git a/lib/file.h b/lib/file.h index 92a715e0..2e929651 100644 --- a/lib/file.h +++ b/lib/file.h @@ -249,9 +249,6 @@ typedef struct RmFile { */ bool free_digest : 1; - /* If true, the file will be request to be pre-cached on the next read */ - bool fadvise_requested : 1; - /* Set to true if rm_shred_process_file() for hash increment */ bool shredder_waiting : 1; diff --git a/lib/shredder.c b/lib/shredder.c index afd6cba0..d4aebdc5 100644 --- a/lib/shredder.c +++ b/lib/shredder.c @@ -498,10 +498,6 @@ static gint32 rm_shred_get_read_size(RmFile *file, RmShredTag *tag) { g_assert(tag); RmOff balanced_bytes = tag->page_size * SHRED_BALANCED_PAGES; RmOff target_bytes = balanced_bytes * group->offset_factor; - if(group->next_offset == 2) { - file->fadvise_requested = 1; - } - /* round to even number of pages, round up to MIN_READ_PAGES */ RmOff target_pages = MAX(target_bytes / tag->page_size, 1); target_bytes = target_pages * tag->page_size; @@ -509,7 +505,6 @@ static gint32 rm_shred_get_read_size(RmFile *file, RmShredTag *tag) { /* test if cost-effective to read the whole file */ if(group->hash_offset + target_bytes + (balanced_bytes) >= group->file_size) { group->next_offset = group->file_size; - file->fadvise_requested = 1; } else { group->next_offset = group->hash_offset + target_bytes; } From 3a08bcd366b3279e61c4c5b32591b760b8ebcde7 Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 08:28:15 +1000 Subject: [PATCH 11/17] file: remove redundant free_digest flag -> sizeof(RmFile)==136 --- lib/file.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lib/file.h b/lib/file.h index 2e929651..60912f65 100644 --- a/lib/file.h +++ b/lib/file.h @@ -244,11 +244,6 @@ typedef struct RmFile { */ bool is_hidden : 1; - /* If false rm_file_destroy will not destroy the digest. This is useful - * for sharing the digest of duplicates in a group. - */ - bool free_digest : 1; - /* Set to true if rm_shred_process_file() for hash increment */ bool shredder_waiting : 1; From a0c329bc60565c5208c82c7eac9d281482b2a597 Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 09:00:12 +1000 Subject: [PATCH 12/17] rank: size ranking should use actual_file_size, not clamped size --- lib/rank.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rank.c b/lib/rank.c index a7a35e73..1dba299f 100644 --- a/lib/rank.c +++ b/lib/rank.c @@ -246,7 +246,7 @@ int rm_rank_orig_criteria(const RmFile *a, const RmFile *b, const RmSession *ses * duplicates by splitting whereever rm_rank_group(a, b) != 0 */ gint rm_rank_group(const RmFile *file_a, const RmFile *file_b) { - RETURN_IF_NONZERO(SIGN_DIFF(file_a->file_size, file_b->file_size)); + RETURN_IF_NONZERO(SIGN_DIFF(file_a->actual_file_size, file_b->actual_file_size)); RmCfg *cfg = file_a->session->cfg; From eb243ec2179a04543898137c760468d7951b62ae Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 09:00:44 +1000 Subject: [PATCH 13/17] shredder: test for zero-sized files should use actual file size --- lib/shredder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/shredder.c b/lib/shredder.c index d4aebdc5..65485f3c 100644 --- a/lib/shredder.c +++ b/lib/shredder.c @@ -1090,7 +1090,7 @@ static void rm_shred_file_preprocess(RmFile *file, RmShredGroup **group) { g_assert(file->lint_type == RM_LINT_TYPE_DUPE_CANDIDATE); /* Create an empty checksum for empty files */ - if(file->file_size == 0) { + if(file->actual_file_size == 0) { file->digest = rm_digest_new(cfg->checksum_type, 0); } From a9ac0f41b43808248cb3311dff447209d5a82adf Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Wed, 26 May 2021 09:02:15 +1000 Subject: [PATCH 14/17] file: don't cache [clamped] file_size -> sizeof(RmFile)==128 --- lib/file.c | 70 +++++++++++++++++++++++++------------------------ lib/file.h | 12 ++++----- lib/shredder.c | 11 ++++---- lib/treemerge.c | 3 +-- 4 files changed, 49 insertions(+), 47 deletions(-) diff --git a/lib/file.c b/lib/file.c index e790cc65..8c3098ed 100644 --- a/lib/file.c +++ b/lib/file.c @@ -28,53 +28,57 @@ #include "session.h" -RmFile *rm_file_new(struct RmSession *session, const char *path, RmStat *statp, - RmLintType type, bool is_ppath, unsigned path_index, short depth, - RmNode *node) { - RmCfg *cfg = session->cfg; - RmOff actual_file_size = statp->st_size; - RmOff start_seek = 0; - - /* Allow an actual file size of 0 for empty files */ - if(actual_file_size != 0) { - if(cfg->use_absolute_start_offset) { - start_seek = cfg->skip_start_offset; - if(cfg->skip_start_offset >= actual_file_size) { - return NULL; - } - } else { - start_seek = cfg->skip_start_factor * actual_file_size; - if((int)(actual_file_size * cfg->skip_end_factor) == 0) { - return NULL; - } - - if(start_seek >= actual_file_size) { - return NULL; - } - } +static RmOff rm_file_start_seek(RmFile *file) { + RmCfg *cfg = file->session->cfg; + + if(cfg->use_absolute_start_offset) { + return cfg->skip_start_offset; + } else { + return cfg->skip_start_factor * file->actual_file_size; } +} + +static RmOff rm_file_end_seek(RmFile *file) { + RmCfg *cfg = file->session->cfg; + RmOff end_seek = file->actual_file_size; - RmOff file_size; if(cfg->use_absolute_end_offset) { - file_size = CLAMP(actual_file_size, 1, cfg->skip_end_offset); + return end_seek - MIN(cfg->skip_end_offset, end_seek); } else { - file_size = actual_file_size * cfg->skip_end_factor; + return end_seek * cfg->skip_end_factor; } +} + +RmOff rm_file_clamped_size(RmFile *file) { + RmOff start_seek = rm_file_start_seek(file); + RmOff end_seek = rm_file_end_seek(file); + return end_seek - MIN(start_seek, end_seek); +} + + + +RmFile *rm_file_new(struct RmSession *session, const char *path, RmStat *statp, + RmLintType type, bool is_ppath, unsigned path_index, short depth, + RmNode *node) { + RmCfg *cfg = session->cfg; + RmFile *self = g_slice_new0(RmFile); + self->session = session; + self->actual_file_size = statp->st_size; if(type == RM_LINT_TYPE_DUPE_CANDIDATE || type == RM_LINT_TYPE_PART_OF_DIRECTORY) { /* Check if the actual slice the file will be > 0; we don't want empty files in * shredder */ - if((file_size - start_seek) == 0 && actual_file_size != 0) { + if(self->actual_file_size != 0 && rm_file_clamped_size(self) == 0) { + g_slice_free(RmFile, self); return NULL; } } else { // report other types as zero-size - actual_file_size = 0; + // TODO: review this, doesn't seem sensible + self->actual_file_size = 0; } - RmFile *self = g_slice_new0(RmFile); - self->session = session; if(!node) { node = rm_trie_insert(&cfg->file_trie, path, statp->st_dev, statp->st_ino); @@ -82,14 +86,12 @@ RmFile *rm_file_new(struct RmSession *session, const char *path, RmStat *statp, self->node = node; self->depth = depth; - self->file_size = file_size; - self->actual_file_size = actual_file_size; self->n_children = 0; self->mtime = rm_sys_stat_mtime_float(statp); self->is_new = (self->mtime >= cfg->min_mtime); - self->hash_offset = start_seek; + self->hash_offset = rm_file_start_seek(self); self->lint_type = type; self->is_prefd = is_ppath; diff --git a/lib/file.h b/lib/file.h index 60912f65..351cd66e 100644 --- a/lib/file.h +++ b/lib/file.h @@ -98,12 +98,7 @@ typedef struct RmFile { /*----- 64-bit types ----- */ - /* Filesize in bytes; this may be less than actual_file_size, - * since -q / -Q may limit this number. - */ - RmOff file_size; - - /* Filesize of a file when it was traversed by rmlint. + /* Filesize of a file according to stat when it was traversed by rmlint. */ RmOff actual_file_size; @@ -406,4 +401,9 @@ static inline dev_t rm_file_parent_dev(const RmFile *file) { return rm_node_get_dev(file->node->parent); } +/** + * @brief file size after clamping start and end offsets. + */ +RmOff rm_file_clamped_size(RmFile *file); + #endif /* end of include guard */ diff --git a/lib/shredder.c b/lib/shredder.c index 65485f3c..0edeed1b 100644 --- a/lib/shredder.c +++ b/lib/shredder.c @@ -471,7 +471,7 @@ static RmShredGroup *rm_shred_group_new(RmFile *file) { } self->held_files = g_queue_new(); - self->file_size = file->file_size; + self->file_size = rm_file_clamped_size(file); self->hash_offset = file->hash_offset; self->session = file->session; @@ -717,7 +717,8 @@ static void rm_shred_discard_file(RmFile *file, _UNUSED gpointer user_data) { if(file->has_disk_ref) { rm_mds_device_ref(rm_shred_disk(file, session), -1); file->has_disk_ref = FALSE; - rm_shred_adjust_counters(tag, -1, -(gint64)(file->file_size - file->hash_offset)); + rm_shred_adjust_counters(tag, -1, + -(gint64)(rm_file_clamped_size(file) - file->hash_offset)); } /* toss the file (and any embedded hardlinks)*/ @@ -1108,7 +1109,7 @@ static void rm_shred_file_preprocess(RmFile *file, RmShredGroup **group) { rm_mds_device_ref(device, 1); file->has_disk_ref = TRUE; - rm_shred_adjust_counters(shredder, 1, (gint64)file->file_size - file->hash_offset); + rm_shred_adjust_counters(shredder, 1, rm_file_clamped_size(file) - file->hash_offset); rm_shred_group_push_file(*group, file, true); } @@ -1666,7 +1667,7 @@ static gint rm_shred_process_file(RmFile *file, RmSession *session) { RmOff bytes_to_read = rm_shred_get_read_size(file, tag); gboolean shredder_waiting = - (file->shred_group->next_offset != file->file_size) && + (file->shred_group->next_offset != rm_file_clamped_size(file)) && (cfg->shred_always_wait || (!cfg->shred_never_wait && file->is_on_rotational_disk && bytes_to_read < SHRED_TOO_MANY_BYTES_TO_WAIT)); @@ -1686,7 +1687,7 @@ static gint rm_shred_process_file(RmFile *file, RmSession *session) { /* Update totals for file, device and session*/ file->hash_offset += bytes_to_read; if(file->is_symlink) { - rm_shred_adjust_counters(tag, 0, -(gint64)file->file_size); + rm_shred_adjust_counters(tag, 0, -rm_file_clamped_size(file)); } else { rm_shred_adjust_counters(tag, 0, -(gint64)bytes_to_read); } diff --git a/lib/treemerge.c b/lib/treemerge.c index ed7de6be..f1e20592 100644 --- a/lib/treemerge.c +++ b/lib/treemerge.c @@ -378,8 +378,7 @@ static void rm_directory_to_file(RmTreeMerger *merger, const RmDirectory *self, file->depth = rm_util_path_depth(self->dirname); /* Recursively calculate the file size */ - file->file_size = rm_tm_calc_file_size(self); - file->actual_file_size = file->file_size; + file->actual_file_size = rm_tm_calc_file_size(self); file->is_prefd = (self->prefd_files >= self->dupe_count); file->parent_dir = (RmDirectory *)self; file->n_children = self->dupe_count; From c4e2d12d555db0f04b820de2ad717d2fe03952ef Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Fri, 28 May 2021 16:50:47 +1000 Subject: [PATCH 15/17] Revert "file: only use 16 bits for ref_count -> sizeof(RmFile)==144" This reverts commit d8762fa08ae273ae72a3acb0f9440b129e79fb9b. --- lib/file.c | 4 ++-- lib/file.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/file.c b/lib/file.c index 8c3098ed..bc0a974d 100644 --- a/lib/file.c +++ b/lib/file.c @@ -140,7 +140,7 @@ static gint rm_file_unref_impl(RmFile *file, gboolean unref_hardlinks) { if(!file) { return 0; } - if(--file->ref_count != 0) { + if(!g_atomic_int_dec_and_test(&file->ref_count)) { // somebody still loves me! return 0; } @@ -177,7 +177,7 @@ gint rm_file_unref_full(RmFile *file) { } RmFile *rm_file_ref(RmFile *file) { - file->ref_count++; + g_atomic_int_inc (&file->ref_count); return file; } diff --git a/lib/file.h b/lib/file.h index 351cd66e..07bc6177 100644 --- a/lib/file.h +++ b/lib/file.h @@ -165,6 +165,8 @@ typedef struct RmFile { /*----- 32-bit types ----- */ + guint ref_count; + /* Number of children this file has. * Only filled if type is RM_LINT_TYPE_PART_OF_DIRECTORY. * */ @@ -190,8 +192,6 @@ typedef struct RmFile { * */ gint16 outer_link_count; - guint16 ref_count; - /* Caching bitmasks to ensure each file is only matched once * for every GRegex combination. @@ -304,7 +304,7 @@ RmFile *rm_file_new(struct RmSession *session, const char *path, RmStat *statp, /** * @brief Decrease reference count to file; * free resources if this is the last reference. - * @note NOT threadsafe + * @note threadsafe * @note nullable * @retval 1 if the file freed else 0 */ From e43972ca7812a12d5376ee8f8e929a30cdaa2887 Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Fri, 28 May 2021 16:51:11 +1000 Subject: [PATCH 16/17] Revert "print RmFile size (size==192)" This reverts commit e5c4f61a622d84c5346d0fec1ed788a16f4d4a01. --- lib/session.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/session.c b/lib/session.c index edc3b5d7..45bf41e5 100644 --- a/lib/session.c +++ b/lib/session.c @@ -76,7 +76,6 @@ bool rm_session_check_kernel_version(int need_major, int need_minor) { } void rm_session_init(RmSession *session, RmCfg *cfg) { - rm_log_warning("Sizeof(RmFile): %lu\n", sizeof(RmFile)); memset(session, 0, sizeof(RmSession)); session->timer = g_timer_new(); From 0032f5e769d02a7a088be5bee17c630153a72c0f Mon Sep 17 00:00:00 2001 From: SeeSpotRun Date: Fri, 28 May 2021 17:49:07 +1000 Subject: [PATCH 17/17] file: fix clamping --- lib/file.c | 8 ++++---- lib/file.h | 5 +++++ lib/shredder.c | 8 ++++---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/lib/file.c b/lib/file.c index bc0a974d..86d8a7eb 100644 --- a/lib/file.c +++ b/lib/file.c @@ -38,14 +38,14 @@ static RmOff rm_file_start_seek(RmFile *file) { } } -static RmOff rm_file_end_seek(RmFile *file) { +RmOff rm_file_end_seek(RmFile *file) { RmCfg *cfg = file->session->cfg; - RmOff end_seek = file->actual_file_size; + RmOff file_size = file->actual_file_size; if(cfg->use_absolute_end_offset) { - return end_seek - MIN(cfg->skip_end_offset, end_seek); + return MIN(cfg->skip_end_offset, file_size); } else { - return end_seek * cfg->skip_end_factor; + return MIN(file_size, file_size * cfg->skip_end_factor); } } diff --git a/lib/file.h b/lib/file.h index 07bc6177..2c9d9273 100644 --- a/lib/file.h +++ b/lib/file.h @@ -406,4 +406,9 @@ static inline dev_t rm_file_parent_dev(const RmFile *file) { */ RmOff rm_file_clamped_size(RmFile *file); +/** + * @brief file end position after clamping end offset. + */ +RmOff rm_file_end_seek(RmFile *file); + #endif /* end of include guard */ diff --git a/lib/shredder.c b/lib/shredder.c index 0edeed1b..e773fedd 100644 --- a/lib/shredder.c +++ b/lib/shredder.c @@ -471,7 +471,7 @@ static RmShredGroup *rm_shred_group_new(RmFile *file) { } self->held_files = g_queue_new(); - self->file_size = rm_file_clamped_size(file); + self->file_size = rm_file_end_seek(file); self->hash_offset = file->hash_offset; self->session = file->session; @@ -718,7 +718,7 @@ static void rm_shred_discard_file(RmFile *file, _UNUSED gpointer user_data) { rm_mds_device_ref(rm_shred_disk(file, session), -1); file->has_disk_ref = FALSE; rm_shred_adjust_counters(tag, -1, - -(gint64)(rm_file_clamped_size(file) - file->hash_offset)); + -(gint64)(rm_file_end_seek(file) - file->hash_offset)); } /* toss the file (and any embedded hardlinks)*/ @@ -1109,7 +1109,7 @@ static void rm_shred_file_preprocess(RmFile *file, RmShredGroup **group) { rm_mds_device_ref(device, 1); file->has_disk_ref = TRUE; - rm_shred_adjust_counters(shredder, 1, rm_file_clamped_size(file) - file->hash_offset); + rm_shred_adjust_counters(shredder, 1, rm_file_end_seek(file) - file->hash_offset); rm_shred_group_push_file(*group, file, true); } @@ -1667,7 +1667,7 @@ static gint rm_shred_process_file(RmFile *file, RmSession *session) { RmOff bytes_to_read = rm_shred_get_read_size(file, tag); gboolean shredder_waiting = - (file->shred_group->next_offset != rm_file_clamped_size(file)) && + (file->shred_group->next_offset != rm_file_end_seek(file)) && (cfg->shred_always_wait || (!cfg->shred_never_wait && file->is_on_rotational_disk && bytes_to_read < SHRED_TOO_MANY_BYTES_TO_WAIT));