Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Maximum and minimum file size #534

Open
wants to merge 6 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Ack.pm
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,10 @@ File inclusion/exclusion:
filetype.
--type=noX Exclude X files.
See "ack --help-types" for supported filetypes.
--max-file-size=NUM, --max-size=NUM
Excludes files larger than this size (in bytes)
--min-file-size=NUM, --min-size=NUM
Excludes files smaller than this size (in bytes)

File type specification:
--type-set TYPE:FILTER:FILTERARGS
Expand Down
5 changes: 5 additions & 0 deletions ConfigLoader.pm
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use App::Ack ();
use App::Ack::ConfigDefault ();
use App::Ack::ConfigFinder ();
use App::Ack::Filter;
use App::Ack::Filter::Size;
use App::Ack::Filter::Default;
use Carp 1.04 ();
use Getopt::Long 2.35 ();
Expand Down Expand Up @@ -332,6 +333,10 @@ EOT
=> \$opt->{L},
'm|max-count=i' => \$opt->{m},
'match=s' => \$opt->{regex},
'max-size|max-file-size=s'
=> sub { $opt->{max_file_size} = App::Ack::Filter::Size::_parse_size ($_[1]) },
'min-size|min-file-size=s'
=> sub { $opt->{min_file_size} = App::Ack::Filter::Size::_parse_size ($_[1]) },
'n|no-recurse' => \$opt->{n},
o => sub { $opt->{output} = '$&' },
'output=s' => \$opt->{output},
Expand Down
3 changes: 3 additions & 0 deletions MANIFEST
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Match.pm
MatchGroup.pm
Resource.pm
Resources.pm
Size.pm

record-options
squash
Expand Down Expand Up @@ -69,6 +70,7 @@ t/ack-print0.t
t/ack-removed-options.t
t/ack-show-types.t
t/ack-s.t
t/ack-size.pl
t/ack-type-del.t
t/ack-type.t
t/ack-v.t
Expand Down Expand Up @@ -141,6 +143,7 @@ t/lib/Match.t
t/lib/MatchGroup.t
t/lib/Resource.t
t/lib/Resources.t
t/lib/Size.t

t/home/.ackrc
t/swamp/#emacs-workfile.pl#
Expand Down
3 changes: 2 additions & 1 deletion Makefile.PL
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ my %parms = (
'ConfigLoader.pm' => '$(INST_LIBDIR)/App/Ack/ConfigLoader.pm',
'Filter.pm' => '$(INST_LIBDIR)/App/Ack/Filter.pm',
'Extension.pm' => '$(INST_LIBDIR)/App/Ack/Filter/Extension.pm',
'Size.pm' => '$(INST_LIBDIR)/App/Ack/Filter/Size.pm',
'FirstLineMatch.pm' => '$(INST_LIBDIR)/App/Ack/Filter/FirstLineMatch.pm',
'Is.pm' => '$(INST_LIBDIR)/App/Ack/Filter/Is.pm',
'Match.pm' => '$(INST_LIBDIR)/App/Ack/Filter/Match.pm',
Expand Down Expand Up @@ -95,7 +96,7 @@ ALL_PM = \
Ack.pm \
Resource.pm Resources.pm Basic.pm \
ConfigDefault.pm ConfigFinder.pm ConfigLoader.pm \
Filter.pm Extension.pm FirstLineMatch.pm Is.pm Match.pm Default.pm Inverse.pm Collection.pm IsGroup.pm ExtensionGroup.pm MatchGroup.pm IsPath.pm IsPathGroup.pm
Filter.pm Extension.pm FirstLineMatch.pm Is.pm Match.pm Default.pm Inverse.pm Collection.pm IsGroup.pm ExtensionGroup.pm MatchGroup.pm IsPath.pm IsPathGroup.pm Size.pm

TEST_VERBOSE=0
TEST_FILES=t/*.t t/lib/*.t
Expand Down
68 changes: 68 additions & 0 deletions Size.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package App::Ack::Filter::Size;

use strict;
use warnings;
use base 'App::Ack::Filter';

use Carp 'croak';

use App::Ack::Filter ();

sub _parse_size {
my $s = $_[0] || return 0;

if ( $s =~ m/^\s*(\d+(?:\.\d+)?)(?:\s*([KMGT]?)B?)?\s*$/i ) {
my $n = $1;
if ($2) {
my $u = lc $2;
$n *= 1024 while $u =~ tr/tgmk/gmk/d;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is pretty clever, but it might be too clever; maybe we should just have a hash of suffixes => number of bytes?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And are we going to distinguish between MB and MiB, GB and GiB?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I keep being surprised at how decimal things are nowadays. Looks like things like ls -l defaults to 1K=1000 so I guess yes, we should.

}
return int $n;
}
else {
Carp::croak('Invalid size');
}
}

sub new {
my ( $class, $min, $max ) = @_;
return bless {
min => $min,
max => $max,
}, $class;
}

sub filter {
my ( $self, $resource ) = @_;

my $min = $self->{'min'} || 0;
my $max = $self->{'max'};

my $file = $resource->name;

return 1 if $file eq '-';

my $size = (-s $file) || 0; # paranoid?
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is going to incur an extra stat() call for every file we check; maybe we should store the size in the $resource.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, drat, looks like a commit went missing from the PR, which was to remove ine 43 (which is redundant) and use -s _


return 0 if $max and $size > $max;
return $size >= $min;
}

sub inspect {
my ( $self ) = @_;

my $min = $self->{'min'} || 0;
my $max = $self->{'max'} || '*';

return ref($self) . " - $min..$max";
}

sub to_string {
shift->inspect;
}

BEGIN {
App::Ack::Filter->register_filter(size => __PACKAGE__);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know if it makes sense to register this filter; registered filters are used for --type-add and friends. Do we want users to be able to say --type-add=big-file:size:0,1000000?

If we're not going to use this as a registered filter, I don't know if it makes sense to implement this feature using a filter class; users using the feature are going to incur extra overhead from a method call, which really adds up in large codebases.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, I was sticking to the form of the existing code without really understanding the workings, and beginning to come to the same conclusion. In practice, I don't particularly see size-based types being defined. I'm happy to just stick the logic straight into the ack script itself.

}

1;
26 changes: 26 additions & 0 deletions ack
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use App::Ack::Resource::Basic ();
use App::Ack::Filter ();
use App::Ack::Filter::Default;
use App::Ack::Filter::Extension;
use App::Ack::Filter::Size;
use App::Ack::Filter::FirstLineMatch;
use App::Ack::Filter::Inverse;
use App::Ack::Filter::Is;
Expand Down Expand Up @@ -146,6 +147,14 @@ sub _compile_file_filter {
}
}

# For the usual case where the user has not set this, it is faster if we can
# we can reduce to a single boolean test before we even make the method call
# if both of min and max are 0, don't test, accept all files

my $size_filter = ( $opt->{min_file_size} || $opt->{max_file_size} )
? App::Ack::Filter::Size->new($opt->{min_file_size}, $opt->{max_file_size})
: 0;

my %is_member_of_starting_set = map { (get_file_id($_) => 1) } @{$start};

my @ignore_dir_filter = @{$opt->{idirs} || []};
Expand Down Expand Up @@ -227,6 +236,8 @@ sub _compile_file_filter {
return 0;
}

return 0 if $size_filter && ! $size_filter->filter($resource);

my $match_found = $direct_filters->filter($resource);

# Don't bother invoking inverse filters unless we consider the current resource a match
Expand Down Expand Up @@ -1410,6 +1421,21 @@ Print this manual page.

No descending into subdirectories.

=item B<--max-file-size=I<NUM>>, B<--max-size=I<NUM>>

The maximum size of files C<ack> is willing to search.

This is useful for when you know you have a handful of extremely large files
which you do not need to search, but whose distinguishing feature is their size.

If not set, or set to 0, then there is no maximum.

=item B<--min-file-size=I<NUM>>, B<--min-size=I<NUM>>

The minimum size of files C<ack> is willing to search.

If not set, or set to 0, then there is no maximum.

=item B<-o>

Show only the part of each line matching PATTERN (turns off text
Expand Down
4 changes: 4 additions & 0 deletions t/Util.pm
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,10 @@ sub get_options {
'--man',
'--match',
'--max-count',
'--max-file-size',
'--max-size',
'--min-file-size',
'--min-size',
'--no-filename',
'--no-recurse',
'--nobreak',
Expand Down
40 changes: 40 additions & 0 deletions t/ack-size.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!perl -T

use strict;
use warnings;

use Test::More tests => 12;
use lib 't';
use Util;

prep_environment();

my ( $stdout, $stderr );
my $help_types_output;

# sanity check
( $stdout, $stderr ) = run_ack_with_stderr('--perl', '-f', 't/swamp');
is( scalar(@{$stdout}), 11, 'Found initial 11 files' );
is_empty_array( $stderr, 'Nothing in stderr' );

( $stdout, $stderr ) = run_ack_with_stderr('--perl', '--max-file-size=0', '-f', 't/swamp');
is( scalar(@{$stdout}), 11, 'Found initial 11 files (max of 0 has no effect)' );
is_empty_array( $stderr, 'Nothing in stderr' );

( $stdout, $stderr ) = run_ack_with_stderr('--perl', '--max-file-size=100', '-f', 't/swamp');
is( scalar(@{$stdout}), 3, 'Found 3 files <= 100 bytes large' );
is_empty_array( $stderr, 'Nothing in stderr' );

( $stdout, $stderr ) = run_ack_with_stderr('--perl', '--max-file-size=101', '-f', 't/swamp');
is( scalar(@{$stdout}), 3, 'Found 8 files >= 101 bytes large' );
is_empty_array( $stderr, 'Nothing in stderr' );

( $stdout, $stderr ) = run_ack_with_stderr('--perl', '--min-file-size=101', '--max-file-size=150', '-f', 't/swamp');
is( scalar(@{$stdout}), 1, 'Found 1 file where 101 <= size <= 150' );
is_empty_array( $stderr, 'Nothing in stderr' );

( $stdout, $stderr ) = run_ack_with_stderr('--perl', '--max-file-size=100', '--min-file-size=101', '-f', 't/swamp');
is( scalar(@{$stdout}), 0, 'Found no files when max and min conflict' );
is_empty_array( $stderr, 'Nothing in stderr' );

# done testing
28 changes: 28 additions & 0 deletions t/config-loader.t
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,34 @@ test_loader(
'--before-context should set before_context'
);

test_loader(
argv => ['--max-size=1500'],
expected_opts => { %defaults, max_file_size => 1500 },
expected_targets => [],
'--max-size should set max_file_size'
);

test_loader(
argv => ['--max-file-size=1500'],
expected_opts => { %defaults, max_file_size => 1500 },
expected_targets => [],
'--max-file-size should set max_file_size'
);

test_loader(
argv => ['--min-size=1500'],
expected_opts => { %defaults, min_file_size => 1500 },
expected_targets => [],
'--min-size should set min_file_size'
);

test_loader(
argv => ['--min-file-size=1500'],
expected_opts => { %defaults, min_file_size => 1500 },
expected_targets => [],
'--min-file-size should set min_file_size'
);

# XXX These tests should all be replicated to work off of the ack command line
# tools instead of its internal APIs!
do {
Expand Down
12 changes: 12 additions & 0 deletions t/lib/Size.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!perl -T

use strict;
use warnings;

use Test::More tests => 1;

use App::Ack::Filter::Size;

pass( 'App::Ack::Filter::Size loaded with nothing else loaded first' );

done_testing();