diff --git a/CHANGELOG.md b/CHANGELOG.md index 7597621a..4b8f126a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/). ## [2.4.12] - (Unreleased) +### Added + +- Symlink support to `ReadTarFS` [#426](https://github.com/PyFilesystem/pyfilesystem2/pull/426). Closes [#409](https://github.com/PyFilesystem/pyfilesystem2/issues/409). + ### Changed - Start testing on PyPy. Due to [#342](https://github.com/PyFilesystem/pyfilesystem2/issues/342) diff --git a/fs/tarfs.py b/fs/tarfs.py index 4f48d821..7577c2b3 100644 --- a/fs/tarfs.py +++ b/fs/tarfs.py @@ -8,9 +8,9 @@ import tarfile import typing from collections import OrderedDict -from typing import cast, IO import six +from six.moves import map from . import errors from .base import FS @@ -22,7 +22,18 @@ from .opener import open_fs from .permissions import Permissions from ._url_tools import url_quote -from .path import relpath, basename, isbase, normpath, parts, frombase +from .path import ( + dirname, + join, + relpath, + basename, + isbase, + normpath, + parts, + frombase, + recursepath, + relativefrom, +) from .wrapfs import WrapFS if typing.TYPE_CHECKING: @@ -157,8 +168,7 @@ def __init__( @six.python_2_unicode_compatible class WriteTarFS(WrapFS): - """A writable tar file. - """ + """A writable tar file.""" def __init__( self, @@ -234,8 +244,7 @@ def write_tar( @six.python_2_unicode_compatible class ReadTarFS(FS): - """A readable tar file. - """ + """A readable tar file.""" _meta = { "case_insensitive": True, @@ -257,6 +266,8 @@ class ReadTarFS(FS): tarfile.SYMTYPE: ResourceType.symlink, tarfile.CONTTYPE: ResourceType.file, tarfile.LNKTYPE: ResourceType.symlink, + # this is how we mark implicit directories + tarfile.DIRTYPE + b"i": ResourceType.directory, } @errors.CreateFailed.catch_all @@ -277,24 +288,74 @@ def _directory_entries(self): """Lazy directory cache.""" if self._directory_cache is None: _decode = self._decode + _encode = self._encode + + # collect all directory entries and remove slashes _directory_entries = ( (_decode(info.name).strip("/"), info) for info in self._tar ) - def _list_tar(): - for name, info in _directory_entries: - try: - _name = normpath(name) - except IllegalBackReference: - # Back references outside root, must be up to no good. - pass - else: - if _name: - yield _name, info - - self._directory_cache = OrderedDict(_list_tar()) + # build the cache first before updating it to reduce chances + # of data races + _cache = OrderedDict() + for name, info in _directory_entries: + # check for any invalid back references + try: + _name = normpath(name) + except IllegalBackReference: + continue + + # add all implicit dirnames if not in the cache already + for partial_name in map(relpath, recursepath(_name)): + dirinfo = tarfile.TarInfo(_encode(partial_name)) + dirinfo.type = tarfile.DIRTYPE + _cache.setdefault(partial_name, dirinfo) + + # add the entry itself, potentially overwriting implicit entries + _cache[_name] = info + + self._directory_cache = _cache return self._directory_cache + def _follow_symlink(self, entry): + """Follow an symlink `TarInfo` to find a concrete entry. + + Returns ``None`` if the symlink is dangling. + """ + done = set() + _entry = entry + while _entry.issym(): + linkname = normpath( + join(dirname(self._decode(_entry.name)), self._decode(_entry.linkname)) + ) + resolved = self._resolve(linkname) + if resolved is None: + return None + done.add(_entry) + _entry = self._directory_entries[resolved] + # if we already saw this symlink, then we are following cyclic + # symlinks and we should break the loop + if _entry in done: + return None + + return _entry + + def _resolve(self, path): + """Replace path components that are symlinks with concrete components. + + Returns ``None`` when the path could not be resolved to an existing + entry in the archive. + """ + if path in self._directory_entries or not path: + return path + for prefix in map(relpath, reversed(recursepath(path))): + suffix = relativefrom(prefix, path) + entry = self._directory_entries.get(prefix) + if entry is not None and entry.issym(): + entry = self._follow_symlink(entry) + return self._resolve(join(self._decode(entry.name), suffix)) + return None + def __repr__(self): # type: () -> Text return "ReadTarFS({!r})".format(self._file) @@ -329,27 +390,34 @@ def getinfo(self, path, namespaces=None): namespaces = namespaces or () raw_info = {} # type: Dict[Text, Dict[Text, object]] + # special case for root if not _path: raw_info["basic"] = {"name": "", "is_dir": True} if "details" in namespaces: raw_info["details"] = {"type": int(ResourceType.directory)} else: - try: - implicit = False - member = self._directory_entries[_path] - except KeyError: - if not self.isdir(_path): - raise errors.ResourceNotFound(path) - implicit = True - member = tarfile.TarInfo(_path) - member.type = tarfile.DIRTYPE + _realpath = self._resolve(_path) + if _realpath is None: + raise errors.ResourceNotFound(path) + + implicit = False + member = self._directory_entries[_realpath] raw_info["basic"] = { "name": basename(self._decode(member.name)), - "is_dir": member.isdir(), + "is_dir": self.isdir(_path), # is_dir should follow symlinks } + if "link" in namespaces: + if member.issym(): + target = normpath(join( + dirname(self._decode(member.name)), + self._decode(member.linkname), + )) # type: Optional[Text] + else: + target = None + raw_info["link"] = {"target": target} if "details" in namespaces: raw_info["details"] = { "size": member.size, @@ -379,16 +447,29 @@ def getinfo(self, path, namespaces=None): def isdir(self, path): _path = relpath(self.validatepath(path)) - try: - return self._directory_entries[_path].isdir() - except KeyError: - return any(isbase(_path, name) for name in self._directory_entries) + realpath = self._resolve(_path) + if realpath is not None: + entry = self._follow_symlink(self._directory_entries[realpath]) + return False if entry is None else entry.isdir() + else: + return False def isfile(self, path): _path = relpath(self.validatepath(path)) - try: - return self._directory_entries[_path].isfile() - except KeyError: + realpath = self._resolve(_path) + if realpath is not None: + entry = self._follow_symlink(self._directory_entries[realpath]) + return False if entry is None else entry.isfile() + else: + return False + + def islink(self, path): + _path = relpath(self.validatepath(path)) + realpath = self._resolve(_path) + if realpath is not None: + entry = self._directory_entries[realpath] + return entry.issym() + else: return False def setinfo(self, path, info): @@ -400,13 +481,28 @@ def listdir(self, path): # type: (Text) -> List[Text] _path = relpath(self.validatepath(path)) - if not self.gettype(path) is ResourceType.directory: - raise errors.DirectoryExpected(path) + # check the given path exists + realpath = self._resolve(_path) + if realpath is None: + raise errors.ResourceNotFound(path) + elif realpath: + target = self._follow_symlink(self._directory_entries[realpath]) + # check the path is either a symlink mapping to a directory or a directory + if target is None: + raise errors.ResourceNotFound(path) + elif not target.isdir(): + raise errors.DirectoryExpected(path) + else: + base = target.name + else: + base = "" + # find all entries in the actual directory children = ( - frombase(_path, n) for n in self._directory_entries if isbase(_path, n) + frombase(base, n) for n in self._directory_entries if isbase(base, n) ) content = (parts(child)[1] for child in children if relpath(child)) + return list(OrderedDict.fromkeys(content)) def makedir( @@ -423,19 +519,27 @@ def openbin(self, path, mode="r", buffering=-1, **options): # type: (Text, Text, int, **Any) -> BinaryIO _path = relpath(self.validatepath(path)) + # check the requested mode is only a reading mode if "w" in mode or "+" in mode or "a" in mode: raise errors.ResourceReadOnly(path) - try: - member = self._directory_entries[_path] - except KeyError: - six.raise_from(errors.ResourceNotFound(path), None) + # check the path actually resolves after following symlink components + _realpath = self._resolve(_path) + if _realpath is None: + raise errors.ResourceNotFound(path) - if not member.isfile(): - raise errors.FileExpected(path) + # get the entry at the resolved path and follow all symlinks + entry = self._follow_symlink(self._directory_entries[_realpath]) + if entry is None: + raise errors.ResourceNotFound(path) - rw = RawWrapper(cast(IO, self._tar.extractfile(member))) + # TarFile.extractfile returns None if the entry is not a file + # neither a file nor a symlink + reader = self._tar.extractfile(self._directory_entries[_realpath]) + if reader is None: + raise errors.FileExpected(path) + rw = RawWrapper(reader) if six.PY2: # Patch nonexistent file.flush in Python2 def _flush(): diff --git a/tests/test_tarfs.py b/tests/test_tarfs.py index a90dc0ea..2ebced5a 100644 --- a/tests/test_tarfs.py +++ b/tests/test_tarfs.py @@ -1,20 +1,20 @@ # -*- encoding: UTF-8 from __future__ import unicode_literals -import io import os import six import tarfile import tempfile import unittest import pytest +from six import BytesIO from fs import tarfs from fs.enums import ResourceType from fs.compress import write_tar from fs.opener import open_fs from fs.opener.errors import NotWriteable -from fs.errors import NoURL +from fs.errors import NoURL, ResourceNotFound from fs.test import FSTestCases from .test_archives import ArchiveTestCases @@ -223,8 +223,8 @@ def tearDownClass(cls): def setUp(self): self.tempfile = self.tmpfs.open("test.tar", "wb+") with tarfile.open(mode="w", fileobj=self.tempfile) as tf: - tf.addfile(tarfile.TarInfo("."), io.StringIO()) - tf.addfile(tarfile.TarInfo("../foo.txt"), io.StringIO()) + tf.addfile(tarfile.TarInfo(".")) + tf.addfile(tarfile.TarInfo("../foo.txt")) self.tempfile.seek(0) self.fs = tarfs.TarFS(self.tempfile) @@ -237,8 +237,7 @@ def test_listdir(self): class TestImplicitDirectories(unittest.TestCase): - """Regression tests for #160. - """ + """Regression tests for #160.""" @classmethod def setUpClass(cls): @@ -251,12 +250,12 @@ def tearDownClass(cls): def setUp(self): self.tempfile = self.tmpfs.open("test.tar", "wb+") with tarfile.open(mode="w", fileobj=self.tempfile) as tf: - tf.addfile(tarfile.TarInfo("foo/bar/baz/spam.txt"), io.StringIO()) - tf.addfile(tarfile.TarInfo("./foo/eggs.bin"), io.StringIO()) - tf.addfile(tarfile.TarInfo("./foo/yolk/beans.txt"), io.StringIO()) + tf.addfile(tarfile.TarInfo("foo/bar/baz/spam.txt")) + tf.addfile(tarfile.TarInfo("./foo/eggs.bin")) + tf.addfile(tarfile.TarInfo("./foo/yolk/beans.txt")) info = tarfile.TarInfo("foo/yolk") info.type = tarfile.DIRTYPE - tf.addfile(info, io.BytesIO()) + tf.addfile(info) self.tempfile.seek(0) self.fs = tarfs.TarFS(self.tempfile) @@ -301,6 +300,169 @@ def test_getinfo(self): self.assertIs(info.type, ResourceType.directory) +class TestSymlinks(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.tmpfs = open_fs("temp://") + + @classmethod + def tearDownClass(cls): + cls.tmpfs.close() + + def setUp(self): + def _info(name, **kwargs): + info = tarfile.TarInfo(name) + for k, v in kwargs.items(): + setattr(info, k, v) + return info + + # /foo + # /foo/bar.txt + # /foo/baz.txt -> /foo/bar.txt + # /spam -> /foo + # /eggs + # /eggs/yolk -> /spam + + self.tempfile = self.tmpfs.open("test.tar", "wb+") + with tarfile.open(mode="w", fileobj=self.tempfile) as tf: + tf.addfile(_info("foo", type=tarfile.DIRTYPE)) + buff = BytesIO(b"hello") + tf.addfile(_info("foo/bar.txt", size=len(buff.getvalue())), buff) + tf.addfile(_info("foo/baz.txt", type=tarfile.SYMTYPE, linkname="bar.txt")) + tf.addfile(_info("spam", type=tarfile.SYMTYPE, linkname="foo")) + tf.addfile(_info("eggs", type=tarfile.DIRTYPE)) + tf.addfile(_info("eggs/yolk", type=tarfile.SYMTYPE, linkname="../spam")) + self.tempfile.seek(0) + self.fs = tarfs.TarFS(self.tempfile) + + def tearDown(self): + self.fs.close() + self.tempfile.close() + + def test_openbin(self): + # read an actual file + with self.fs.openbin("foo/bar.txt") as bar: + self.assertEqual(bar.read(), b"hello") + # read a link to an actual file + with self.fs.openbin("foo/baz.txt") as baz: + self.assertEqual(baz.read(), b"hello") + # read an actual file via a linked directory + with self.fs.openbin("spam/bar.txt") as bar: + self.assertEqual(bar.read(), b"hello") + # read a link via a linked directory + with self.fs.openbin("spam/baz.txt") as baz: + self.assertEqual(baz.read(), b"hello") + + def test_isfile(self): + self.assertFalse(self.fs.isfile("foo")) + self.assertFalse(self.fs.isfile("spam")) + self.assertFalse(self.fs.isfile("eggs")) + self.assertFalse(self.fs.isfile("eggs/yolk")) + self.assertTrue(self.fs.isfile("foo/bar.txt")) + self.assertTrue(self.fs.isfile("foo/baz.txt")) + self.assertTrue(self.fs.isfile("eggs/yolk/bar.txt")) + self.assertTrue(self.fs.isfile("eggs/yolk/baz.txt")) + + def test_isdir(self): + self.assertTrue(self.fs.isdir("foo")) + self.assertTrue(self.fs.isdir("spam")) + self.assertTrue(self.fs.isdir("eggs/yolk")) + self.assertFalse(self.fs.isdir("foo/bar.txt")) + self.assertFalse(self.fs.isdir("foo/baz.txt")) + self.assertFalse(self.fs.isdir("eggs/yolk/bar.txt")) + self.assertFalse(self.fs.isdir("eggs/yolk/baz.txt")) + + def test_islink(self): + self.assertFalse(self.fs.islink("foo")) + self.assertTrue(self.fs.islink("spam")) + self.assertTrue(self.fs.islink("eggs/yolk")) + self.assertFalse(self.fs.islink("foo/bar.txt")) + self.assertTrue(self.fs.islink("foo/baz.txt")) + self.assertFalse(self.fs.islink("eggs/yolk/bar.txt")) + self.assertTrue(self.fs.islink("eggs/yolk/baz.txt")) + + def test_getinfo(self): + file_info = self.fs.getinfo("foo/bar.txt", namespaces=("details", "link")) + self.assertIn("details", file_info.namespaces) + self.assertIn("link", file_info.namespaces) + self.assertFalse(file_info.is_dir) + self.assertIs(file_info.target, None) + self.assertEqual(file_info.type, ResourceType.file) + + link_info = self.fs.getinfo("foo/baz.txt", namespaces=("details", "link")) + self.assertIn("details", link_info.namespaces) + self.assertIn("link", link_info.namespaces) + self.assertFalse(link_info.is_dir) + self.assertEqual(link_info.target, "foo/bar.txt") + self.assertEqual(link_info.type, ResourceType.symlink) + + dir_info = self.fs.getinfo("foo", namespaces=("details", "link")) + self.assertIn("details", dir_info.namespaces) + self.assertIn("link", dir_info.namespaces) + self.assertTrue(dir_info.is_dir) + self.assertEqual(dir_info.target, None) + self.assertEqual(dir_info.type, ResourceType.directory) + + dirlink_info = self.fs.getinfo("spam", namespaces=("details", "link")) + self.assertIn("details", dirlink_info.namespaces) + self.assertIn("link", dirlink_info.namespaces) + self.assertTrue(dirlink_info.is_dir) + self.assertEqual(dirlink_info.target, "foo") + self.assertEqual(dirlink_info.type, ResourceType.symlink) + + def test_listdir(self): + self.assertEqual(sorted(self.fs.listdir("foo")), ["bar.txt", "baz.txt"]) + self.assertEqual(sorted(self.fs.listdir("spam")), ["bar.txt", "baz.txt"]) + + +class TestBrokenSymlinks(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.tmpfs = open_fs("temp://") + + @classmethod + def tearDownClass(cls): + cls.tmpfs.close() + + def setUp(self): + def _info(name, **kwargs): + info = tarfile.TarInfo(name) + for k, v in kwargs.items(): + setattr(info, k, v) + return info + + # /foo + # /foo/baz.txt -> /foo/bar.txt + # /spam -> /eggs + # /eggs -> /spam + + self.tempfile = self.tmpfs.open("test.tar", "wb+") + with tarfile.open(mode="w", fileobj=self.tempfile) as tf: + tf.addfile(_info("foo", type=tarfile.DIRTYPE)) + tf.addfile(_info("foo/baz.txt", type=tarfile.SYMTYPE, linkname="bar.txt")) + tf.addfile(_info("spam", type=tarfile.SYMTYPE, linkname="eggs")) + tf.addfile(_info("eggs", type=tarfile.SYMTYPE, linkname="spam")) + self.tempfile.seek(0) + self.fs = tarfs.TarFS(self.tempfile) + + def tearDown(self): + self.fs.close() + self.tempfile.close() + + def test_dangling(self): + self.assertFalse(self.fs.isfile("foo/baz.txt")) + self.assertFalse(self.fs.isdir("foo/baz.txt")) + self.assertRaises(ResourceNotFound, self.fs.openbin, "foo/baz.txt") + self.assertRaises(ResourceNotFound, self.fs.listdir, "foo/baz.txt") + + def test_cyclic(self): + self.assertFalse(self.fs.isfile("spam")) + self.assertFalse(self.fs.isdir("spam")) + self.assertRaises(ResourceNotFound, self.fs.openbin, "spam") + self.assertRaises(ResourceNotFound, self.fs.listdir, "spam") + + class TestReadTarFSMem(TestReadTarFS): def make_source_fs(self): return open_fs("mem://")