Skip to content

Commit

Permalink
Merge pull request #511 from SeeSpotRun/packing
Browse files Browse the repository at this point in the history
Packing
  • Loading branch information
SeeSpotRun authored May 28, 2021
2 parents 1a142fc + 0032f5e commit da0577b
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 170 deletions.
70 changes: 36 additions & 34 deletions lib/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,68 +28,70 @@

#include "session.h"

RmFile *rm_file_new(struct RmSession *session, const char *path, RmStat *statp,
RmLintType type, bool is_ppath, unsigned path_index, short depth,
RmNode *node) {
RmCfg *cfg = session->cfg;
RmOff actual_file_size = statp->st_size;
RmOff start_seek = 0;

/* Allow an actual file size of 0 for empty files */
if(actual_file_size != 0) {
if(cfg->use_absolute_start_offset) {
start_seek = cfg->skip_start_offset;
if(cfg->skip_start_offset >= actual_file_size) {
return NULL;
}
} else {
start_seek = cfg->skip_start_factor * actual_file_size;
if((int)(actual_file_size * cfg->skip_end_factor) == 0) {
return NULL;
}

if(start_seek >= actual_file_size) {
return NULL;
}
}
/**
 * @brief file start position after applying the configured start offset.
 *
 * Returns cfg->skip_start_offset when cfg->use_absolute_start_offset is set,
 * otherwise cfg->skip_start_factor scaled by the file's actual_file_size.
 * The result is not clamped to the file size here.
 */
static RmOff rm_file_start_seek(RmFile *file) {
    RmCfg *settings = file->session->cfg;

    if(!settings->use_absolute_start_offset) {
        /* relative mode: start offset is a proportion of the stat'ed size */
        return settings->skip_start_factor * file->actual_file_size;
    }

    /* absolute mode: start offset is taken verbatim from the config */
    return settings->skip_start_offset;
}

/**
 * @brief file end position after clamping end offset.
 *
 * @param file  file whose actual_file_size and session config are consulted.
 * @return in absolute mode (cfg->use_absolute_end_offset), the smaller of
 *         cfg->skip_end_offset and the file's actual size; otherwise the
 *         smaller of the actual size and the actual size scaled by
 *         cfg->skip_end_factor.
 */
RmOff rm_file_end_seek(RmFile *file) {
    RmCfg *cfg = file->session->cfg;
    RmOff file_size = file->actual_file_size;

    if(cfg->use_absolute_end_offset) {
        /* never report an end position beyond the real end of the file */
        return MIN(cfg->skip_end_offset, file_size);
    } else {
        /* skip_end_factor is presumably a fraction in [0, 1] -- TODO confirm;
         * the MIN keeps the result within the file for factors above 1 */
        return MIN(file_size, file_size * cfg->skip_end_factor);
    }
}

/**
 * @brief number of bytes between the clamped start and end positions.
 *
 * Yields 0 when the start position lies at or beyond the end position.
 */
RmOff rm_file_clamped_size(RmFile *file) {
    RmOff first_byte = rm_file_start_seek(file);
    RmOff last_byte = rm_file_end_seek(file);

    if(first_byte >= last_byte) {
        /* start offset swallows the whole readable range */
        return 0;
    }
    return last_byte - first_byte;
}



RmFile *rm_file_new(struct RmSession *session, const char *path, RmStat *statp,
RmLintType type, bool is_ppath, unsigned path_index, short depth,
RmNode *node) {
RmCfg *cfg = session->cfg;
RmFile *self = g_slice_new0(RmFile);
self->session = session;
self->actual_file_size = statp->st_size;

if(type == RM_LINT_TYPE_DUPE_CANDIDATE || type == RM_LINT_TYPE_PART_OF_DIRECTORY) {
/* Check if the actual slice the file will be > 0; we don't want empty files in
* shredder */
if((file_size - start_seek) == 0 && actual_file_size != 0) {
if(self->actual_file_size != 0 && rm_file_clamped_size(self) == 0) {
g_slice_free(RmFile, self);
return NULL;
}
}
else {
// report other types as zero-size
actual_file_size = 0;
// TODO: review this, doesn't seem sensible
self->actual_file_size = 0;
}

RmFile *self = g_slice_new0(RmFile);
self->session = session;

if(!node) {
node = rm_trie_insert(&cfg->file_trie, path, statp->st_dev, statp->st_ino);
}
self->node = node;

self->depth = depth;
self->file_size = file_size;
self->actual_file_size = actual_file_size;
self->n_children = 0;

self->mtime = rm_sys_stat_mtime_float(statp);
self->is_new = (self->mtime >= cfg->min_mtime);

self->hash_offset = start_seek;
self->hash_offset = rm_file_start_seek(self);

self->lint_type = type;
self->is_prefd = is_ppath;
Expand Down
211 changes: 106 additions & 105 deletions lib/file.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,6 @@

#include "cfg.h"

/* Processing state of an RmFile. */
typedef enum RmFileState {
/* File is still being processed.
*/
RM_FILE_STATE_NORMAL,

/* File can be ignored: it has a unique hash, hit a read failure,
* or is otherwise not noteworthy.
*/
RM_FILE_STATE_IGNORE,

} RmFileState;

/* types of lint */
typedef enum RmLintType {
RM_LINT_TYPE_UNKNOWN = 0,
Expand Down Expand Up @@ -107,17 +95,88 @@ struct RmDirectory;
* RmFile structure; used by pretty much all rmlint modules.
*/
typedef struct RmFile {
/* file path lookup ID (if using swap table)

/*----- 64-bit types ----- */

/* Filesize of a file according to stat when it was traversed by rmlint.
*/
RmOff actual_file_size;

/* How many bytes were already read.
* (lower or equal file_size)
*/
RmOff hash_offset;

/* Those are never used at the same time.
* disk_offset is used during computation,
* twin_count during output.
*/
union {
/* Count of twins of this file.
* (i.e. length of group of this file)
*/
gint64 twin_count;

/* Disk fiemap / physical offset at start of file (tests mapping subsequent
* file fragments did not deliver any significant additional benefit) */
RmOff disk_offset;
};

/* File modification date/time
* */
RmOff path_id;
gdouble mtime;


/*----- pointer types ----- */

/* The pre-matched file cluster that this file belongs to (or NULL) */
GQueue *cluster;

/* pointer to hardlinks collection (or NULL); one list shared between hardlink twin
* set */
GQueue *hardlinks;

/* digest of this file updated on every hash iteration. Use a pointer so we can share
* with RmShredGroup
*/
RmDigest *digest;

/* digest of this file read from file extended attributes (previously written by
* rmlint)
*/
const char *ext_cksum;

/* file path as node of folder n-ary tree
* */
RmNode *node;

/* File modification date/time
/* Link to the RmShredGroup that the file currently belongs to */
struct RmShredGroup *shred_group;

/* Required for rm_file_equal and for RM_DEFINE_PATH */
const struct RmSession *session;

struct RmSignal *signal;

/* Parent directory.
* Only filled if type is RM_LINT_TYPE_PART_OF_DIRECTORY.
*/
struct RmDirectory *parent_dir;

/*----- 32-bit types ----- */

guint ref_count;

/* Number of children this file has.
* Only filled if type is RM_LINT_TYPE_PART_OF_DIRECTORY.
* */
gdouble mtime;
guint32 n_children;


/*----- 16-bit types ----- */

/* The index of the path this file belongs to. */
guint16 path_index;

/* Depth of the file, relative to the command-line path it was found under.
*/
Expand All @@ -133,7 +192,21 @@ typedef struct RmFile {
* */
gint16 outer_link_count;

struct _RmMDSDevice *disk;

/* Caching bitmasks to ensure each file is only matched once
* for every GRegex combination.
* See also preprocess.c for more explanation.
* */
RmPatternBitmask pattern_bitmask_path;
RmPatternBitmask pattern_bitmask_basename;


/*----- bitfield types ----- */

/* What kind of lint this file is.
*/
RmLintType lint_type : 4;


/* True if the file is a symlink
* shredder needs to know this, since the metadata might be about the
Expand Down Expand Up @@ -166,14 +239,6 @@ typedef struct RmFile {
*/
bool is_hidden : 1;

/* If false rm_file_destroy will not destroy the digest. This is useful
* for sharing the digest of duplicates in a group.
*/
bool free_digest : 1;

/* If true, the file will be requested to be pre-cached on the next read */
bool fadvise_requested : 1;

/* Set to true if rm_shred_process_file() for hash increment */
bool shredder_waiting : 1;

Expand All @@ -183,89 +248,15 @@ typedef struct RmFile {
/* Set to true if was read from [json] cache as an original */
bool cached_original : 1;

/* The pre-matched file cluster that this file belongs to (or NULL) */
GQueue *cluster;

/* pointer to hardlinks collection (or NULL); one list shared between hardlink twin
* set */
GQueue *hardlinks;

/* The index of the path this file belongs to. */
RmOff path_index;

/* Filesize in bytes; this may be less than actual_file_size,
* since -q / -Q may limit this number.
*/
RmOff file_size;

/* Filesize of a file when it was traversed by rmlint.
*/
RmOff actual_file_size;

/* How many bytes were already read.
* (lower or equal file_size)
*/
RmOff hash_offset;
/* File hashing failed (probably read error or user interrupt) */
bool hashing_failed : 1;

/* Flag for when we do intermediate steps within a hash increment because the file is
* fragmented */
RmFileState status;
/* is on a spinning disk medium */
bool is_on_rotational_disk : 1;

/* digest of this file updated on every hash iteration. Use a pointer so we can share
* with RmShredGroup
*/
RmDigest *digest;
/* true if mds disk needs unref */
bool has_disk_ref : 1;

/* digest of this file read from file extended attributes (previously written by
* rmlint)
*/
const char *ext_cksum;

/* Those are never used at the same time.
* disk_offset is used during computation,
* twin_count during output.
*/
union {
/* Count of twins of this file.
* (i.e. length of group of this file)
*/
gint64 twin_count;

/* Disk fiemap / physical offset at start of file (tests mapping subsequent
* file fragments did not deliver any significant additional benefit) */
RmOff disk_offset;
};

/* What kind of lint this file is.
*/
RmLintType lint_type;

/* Link to the RmShredGroup that the file currently belongs to */
struct RmShredGroup *shred_group;

/* Required for rm_file_equal and for RM_DEFINE_PATH */
const struct RmSession *session;

struct RmSignal *signal;

/* Caching bitmasks to ensure each file is only matched once
* for every GRegex combination.
* See also preprocess.c for more explanation.
* */
RmPatternBitmask pattern_bitmask_path;
RmPatternBitmask pattern_bitmask_basename;

/* Parent directory.
* Only filled if type is RM_LINT_TYPE_PART_OF_DIRECTORY.
*/
struct RmDirectory *parent_dir;

/* Number of children this file has.
* Only filled if type is RM_LINT_TYPE_PART_OF_DIRECTORY.
* */
size_t n_children;

guint ref_count;
} RmFile;

/* Defines a path variable containing the file's path */
Expand Down Expand Up @@ -410,4 +401,14 @@ static inline dev_t rm_file_parent_dev(const RmFile *file) {
return rm_node_get_dev(file->node->parent);
}

/**
 * @brief file size after clamping start and end offsets.
 *
 * @return end position minus start position; 0 when the start position
 *         lies at or beyond the end position.
 */
RmOff rm_file_clamped_size(RmFile *file);

/**
 * @brief file end position after clamping end offset.
 *
 * @return the configured end limit (absolute skip_end_offset, or the actual
 *         file size scaled by skip_end_factor), capped at the file's actual
 *         size.
 */
RmOff rm_file_end_seek(RmFile *file);

#endif /* end of include guard */
2 changes: 1 addition & 1 deletion lib/rank.c
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ int rm_rank_orig_criteria(const RmFile *a, const RmFile *b, const RmSession *ses
* duplicates by splitting wherever rm_rank_group(a, b) != 0 */
gint rm_rank_group(const RmFile *file_a, const RmFile *file_b) {

RETURN_IF_NONZERO(SIGN_DIFF(file_a->file_size, file_b->file_size));
RETURN_IF_NONZERO(SIGN_DIFF(file_a->actual_file_size, file_b->actual_file_size));

RmCfg *cfg = file_a->session->cfg;

Expand Down
Loading

0 comments on commit da0577b

Please sign in to comment.