Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Glob #161

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open

Glob #161

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions hdfs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .client import Client, InsecureClient, TokenClient
from .config import Config, NullHandler
from .util import HdfsError
from .glob import glob, iglob
import logging as lg


Expand Down
117 changes: 117 additions & 0 deletions hdfs/glob.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import fnmatch
import re
import posixpath


def glob(client, hdfs_path):
    """Collect every HDFS path matching a pathname pattern into a list.

    This is the eager counterpart of :func:`iglob`: the iterator is consumed
    completely and its results are returned in the order they were produced.

    The pattern may contain simple shell-style wildcards a la
    fnmatch. However, unlike fnmatch, filenames starting with a
    dot are special cases that are not matched by '*' and '?'
    patterns.

    :param client: Instance of :class:`Client`.
    :param hdfs_path: HDFS path. May contain special characters like '*', '?'
      and '['.

    Sample usages:

    .. code-block:: python

      glob(client, './foo/bar/*')
      glob(client, './foo/bar/file[0-9].txt')
      glob(client, './foo/bar/file?.txt')

    """
    return [match for match in iglob(client, hdfs_path)]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

glob provides little value over iglob: it saves just the list call. What do you think about only exposing iglob, possibly renaming it to glob?



def iglob(client, hdfs_path):
    """Return an iterator which yields the paths matching a pathname pattern.

    The pattern may contain simple shell-style wildcards a la
    fnmatch. However, unlike fnmatch, filenames starting with a
    dot are special cases that are not matched by '*' and '?'
    patterns.

    A pattern without any wildcard yields the path itself when it exists and
    nothing otherwise. A pattern ending with a slash only matches directories.

    :param client: Instance of :class:`Client`.
    :param hdfs_path: HDFS path. May contain special characters like '*', '?'
      and '['.

    Sample usages:

    .. code-block:: python

      iglob(client, './foo/bar/*')
      iglob(client, './foo/bar/file[0-9].txt')
      iglob(client, './foo/bar/file?.txt')

    """
    dirname, basename = posixpath.split(hdfs_path)
    if not _has_magic(hdfs_path):
        # Literal path: one status lookup decides whether it is yielded.
        if basename:
            if client.status(hdfs_path, strict=False):
                yield hdfs_path
        else:
            # Patterns ending with a slash should match only directories.
            # The lookup is non-strict so that a missing directory produces
            # no match, exactly like a missing file does in the branch above,
            # instead of raising from inside the generator.
            status = client.status(dirname, strict=False)
            if status and status['type'] == 'DIRECTORY':
                yield hdfs_path
        return
    if not dirname:
        # Bare pattern such as '*.txt': match against the client's root.
        for path in _glob1(client, None, basename):
            yield path
        return
    # Only recurse when splitting actually shortened the path; this guard is
    # inherited from the standard library's `glob` and prevents an infinite
    # recursion should `split()` ever hand back its argument as the dirname.
    if dirname != hdfs_path and _has_magic(dirname):
        dirs = iglob(client, dirname)
    else:
        dirs = [dirname]
    # Wildcard basenames need a directory listing; literal ones only need an
    # existence check.
    glob_in_dir = _glob1 if _has_magic(basename) else _glob0
    for parent in dirs:
        for name in glob_in_dir(client, parent, basename):
            yield posixpath.join(parent, name)


def _glob1(client, dirname, pattern):
    """Return the names inside `dirname` that match the wildcard `pattern`.

    :param client: Client used to resolve, inspect and list HDFS paths.
    :param dirname: Directory to search. When empty or `None`, the path
      returned by `client.resolve('.')` is searched instead.
    :param pattern: Single path component containing wildcards.
    :returns: List of matching entry names (not joined with `dirname`). Names
      starting with a dot are only included when `pattern` itself starts with
      a dot. An empty list is returned when `dirname` does not exist or is not
      a directory.

    """
    if not dirname:
        if isinstance(pattern, bytes):
            # NOTE(review): on Python 3 `bytes(<str>)` needs an encoding and
            # would raise `TypeError` -- confirm whether bytes patterns are
            # meant to be supported at all.
            dirname = bytes(client.resolve('.'))
        else:
            dirname = client.resolve('.')
    # Candidates produced by a wildcard in a parent component (e.g. the first
    # '*' in '*/*.txt') can be regular files, and listing those fails. Treat
    # anything that is not an existing directory as having no children, the
    # way the standard library's `glob` ignores unlistable entries. This costs
    # one extra status lookup per visited directory.
    status = client.status(dirname, strict=False)
    if not status or status['type'] != 'DIRECTORY':
        return []
    names = client.list(dirname)
    if not _ishidden(pattern):
        names = [name for name in names if not _ishidden(name)]
    return fnmatch.filter(names, pattern)


def _glob0(client, dirname, basename):
    """Match a wildcard-free `basename` inside `dirname`.

    :param client: Client used to look up path statuses.
    :param dirname: Directory the basename is relative to.
    :param basename: Literal path component; empty when the original pattern
      ended with a slash.
    :returns: `[basename]` when the target exists (and, for an empty
      basename, when `dirname` is a directory), otherwise an empty list.

    """
    if not basename:
        # `posixpath.split()` returns an empty basename for paths ending with
        # a directory separator. 'q*x/' should match only directories. The
        # lookup is non-strict so a path that is missing yields no match
        # rather than an exception, like the branch below.
        status = client.status(dirname, strict=False)
        if status and status['type'] == 'DIRECTORY':
            return [basename]
    elif client.status(posixpath.join(dirname, basename), strict=False):
        return [basename]
    return []


# Patterns locating the first character that starts a shell-style wildcard
# ('*', '?' or '['), one for text paths and one for bytes paths.
magic_check = re.compile('([*?[])')
magic_check_bytes = re.compile(b'([*?[])')


def _has_magic(s):
    """Tell whether `s` contains at least one wildcard character.

    :param s: Path or path component, either text or bytes.
    :returns: `True` if '*', '?' or '[' occurs anywhere in `s`.

    """
    checker = magic_check_bytes if isinstance(s, bytes) else magic_check
    return checker.search(s) is not None


def _ishidden(path):
    """Tell whether a name (or pattern) starts with a dot.

    :param path: Single path component, either text or bytes.
    :returns: `True` if the first character is '.', `False` otherwise,
      including for an empty value.

    """
    # Guard against empty input, for which `path[0]` would raise IndexError.
    # `b'.'[0]` covers bytes, whose items are integers on Python 3.
    return bool(path) and path[0] in ('.', b'.'[0])

2 changes: 1 addition & 1 deletion scripts/hadoop.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ usage() {
# Download and unpack a Hadoop release into a fresh temporary directory, then
# print the path of the extracted distribution on stdout.
hadoop-download() {
local hadoop='hadoop-2.9.2'
# Work inside a new temporary directory; `mktemp -d -t 'hadoop'` is the
# fallback used when plain `mktemp -d` fails.
cd "$(mktemp -d 2>/dev/null || mktemp -d -t 'hadoop')"
# NOTE(review): this hunk is reported as "1 addition & 1 deletion", so the
# www-us.apache.org line is presumably the one being removed and the
# downloads.apache.org line its replacement -- confirm that only the latter
# remains in the script.
curl -O "https://www-us.apache.org/dist/hadoop/common/${hadoop}/${hadoop}.tar.gz"
curl -O "https://downloads.apache.org/hadoop/common/${hadoop}/${hadoop}.tar.gz"
tar -xzf "${hadoop}.tar.gz"
# The printed path is the function's result for callers capturing stdout.
echo "$(pwd)/${hadoop}"
}
Expand Down
85 changes: 85 additions & 0 deletions test/test_glob.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import posixpath

from nose.tools import eq_

from hdfs.glob import glob
from util import _IntegrationTest


class TestGlob(_IntegrationTest):
    """Integration tests for `glob` run against a small HDFS directory tree."""

    def setup(self):
        super(TestGlob, self).setup()
        self.__build_dirs()

    def __build_dirs(self):
        """Create the fixture tree used by the tests.

        Every file contains its own name, without the extension, as content.

        Structure:

          dir_1
            dir_1_1
              file_1_3_1.txt
            dir_1_2
              file_1_3_1.txt
            dir_1_3
              file_1_3_1.txt
              file_1_3_2.txt
              file_1_3_3.txt
            file_1_1.txt
          dir_2
            dir_2_1
              file_2_3_1.txt
            dir_2_2
              file_2_2_1.txt
            dir_2_3
              file_2_3_1.txt
              file_2_3_2.txt
              file_2_3_3.txt
            file_2_1.txt
        """
        paths = [
            ('dir_1', 'dir_1_1', 'file_1_3_1.txt'),
            ('dir_1', 'dir_1_2', 'file_1_3_1.txt'),
            ('dir_1', 'dir_1_3', 'file_1_3_1.txt'),
            ('dir_1', 'dir_1_3', 'file_1_3_2.txt'),
            ('dir_1', 'dir_1_3', 'file_1_3_3.txt'),
            ('dir_1', 'file_1_1.txt'),
            ('dir_2', 'dir_2_1', 'file_2_3_1.txt'),
            ('dir_2', 'dir_2_2', 'file_2_2_1.txt'),
            ('dir_2', 'dir_2_3', 'file_2_3_1.txt'),
            ('dir_2', 'dir_2_3', 'file_2_3_2.txt'),
            ('dir_2', 'dir_2_3', 'file_2_3_3.txt'),
            ('dir_2', 'file_2_1.txt'),
        ]
        for parts in paths:
            # Content is the file name minus '.txt', e.g. b'file_1_3_1'.
            stem = posixpath.splitext(parts[-1])[0]
            self._write(posixpath.join(*parts), stem.encode('ascii'))

    def test(self):
        values = [
            ('./dir_1/dir_1_3/*', [
                './dir_1/dir_1_3/file_1_3_1.txt',
                './dir_1/dir_1_3/file_1_3_2.txt',
                './dir_1/dir_1_3/file_1_3_3.txt',
            ]),
            ('./dir_2/dir_2_3/file_2_3_?.txt', [
                './dir_2/dir_2_3/file_2_3_1.txt',
                './dir_2/dir_2_3/file_2_3_2.txt',
                './dir_2/dir_2_3/file_2_3_3.txt',
            ]),
            ('*/*.txt', [
                'dir_1/file_1_1.txt',
                'dir_2/file_2_1.txt',
            ]),
            ('./dir_[1-2]/file_[1-2]_1.txt', [
                './dir_1/file_1_1.txt',
                './dir_2/file_2_1.txt',
            ]),
            ('./dir_*/dir_*/file_[1-2]_3_2.txt', [
                './dir_1/dir_1_3/file_1_3_2.txt',
                './dir_2/dir_2_3/file_2_3_2.txt',
            ]),
            ('./dir_[3-4]/file_[1-2]_1.txt', []),
            ('./dir_*/dir_*/file_[3-4]_3_2.txt', []),
        ]
        for pattern, expected in values:
            actual = glob(self.client, pattern)
            # Compare order-insensitively: matches come back in whatever order
            # the directory listings are returned, which is not what is being
            # tested here.
            eq_(
                sorted(expected),
                sorted(actual),
                'Unexpected result for pattern ' + pattern,
            )