From 7ac51fb928c0177b06a571caab754d9228052e9a Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Sat, 19 Sep 2020 20:01:15 +0200 Subject: [PATCH] Refactor `ReadTarFS` to support internal symlinks --- fs/tarfs.py | 173 ++++++++++++++++++++++++++++++++------------ tests/test_tarfs.py | 132 ++++++++++++++++++++++++++++++--- 2 files changed, 251 insertions(+), 54 deletions(-) diff --git a/fs/tarfs.py b/fs/tarfs.py index d6d6125d..2d12016f 100644 --- a/fs/tarfs.py +++ b/fs/tarfs.py @@ -4,6 +4,7 @@ from __future__ import print_function from __future__ import unicode_literals +import operator import os import tarfile import typing @@ -11,6 +12,7 @@ from typing import cast, IO import six +from six.moves import map from . import errors from .base import FS @@ -22,7 +24,18 @@ from .opener import open_fs from .permissions import Permissions from ._url_tools import url_quote -from .path import relpath, basename, isbase, normpath, parts, frombase +from .path import ( + dirname, + join, + relpath, + basename, + isbase, + normpath, + parts, + frombase, + recursepath, + relativefrom, +) from .wrapfs import WrapFS if typing.TYPE_CHECKING: @@ -255,6 +268,8 @@ class ReadTarFS(FS): tarfile.SYMTYPE: ResourceType.symlink, tarfile.CONTTYPE: ResourceType.file, tarfile.LNKTYPE: ResourceType.symlink, + # this is how we mark implicit directories + tarfile.DIRTYPE + b"i": ResourceType.directory, } @errors.CreateFailed.catch_all @@ -275,24 +290,66 @@ def _directory_entries(self): """Lazy directory cache.""" if self._directory_cache is None: _decode = self._decode + _encode = self._encode + + # collect all directory entries and remove slashes _directory_entries = ( (_decode(info.name).strip("/"), info) for info in self._tar ) - def _list_tar(): - for name, info in _directory_entries: - try: - _name = normpath(name) - except IllegalBackReference: - # Back references outside root, must be up to no good. - pass - else: - if _name: - yield _name, info - - self._directory_cache = OrderedDict(_list_tar()) + # build the cache first before updating it to reduce chances + # of data races + _cache = OrderedDict() + for name, info in _directory_entries: + # check for any invalid back references + try: + _name = normpath(name) + except IllegalBackReference: + continue + + # add all implicit dirnames if not in the cache already + for partial_name in map(relpath, recursepath(_name)): + dirinfo = tarfile.TarInfo(self._encode(partial_name)) + dirinfo.type = tarfile.DIRTYPE + _cache.setdefault(partial_name, dirinfo) + + # add the entry itself, potentially overwriting implicit entries + _cache[_name] = info + + self._directory_cache = _cache return self._directory_cache + def _follow_symlink(self, entry): + """Follow an symlink `TarInfo` to find a concrete entry.""" + _entry = entry + while _entry.issym(): + linkname = normpath( + join(dirname(self._decode(_entry.name)), self._decode(_entry.linkname)) + ) + resolved = self._resolve(linkname) + if resolved is None: + raise errors.ResourceNotFound(linkname) + _entry = self._directory_entries[resolved] + + return _entry + + def _resolve(self, path): + """Replace path components that are symlinks with concrete components. + + Returns: + + + """ + if path in self._directory_entries or not path: + return path + for prefix in map(relpath, reversed(recursepath(path))): + suffix = relativefrom(prefix, path) + entry = self._directory_entries.get(prefix) + if entry is not None and entry.issym(): + entry = self._follow_symlink(entry) + return self._resolve(join(self._decode(entry.name), suffix)) + return None + def __repr__(self): # type: () -> Text return "ReadTarFS({!r})".format(self._file) @@ -327,31 +384,35 @@ def getinfo(self, path, namespaces=None): namespaces = namespaces or () raw_info = {} # type: Dict[Text, Dict[Text, object]] + # special case for root if not _path: raw_info["basic"] = {"name": "", "is_dir": True} if "details" in namespaces: raw_info["details"] = {"type": int(ResourceType.directory)} else: - try: - implicit = False - member = self._directory_entries[_path] - except KeyError: - if not self.isdir(_path): - raise errors.ResourceNotFound(path) - implicit = True - member = tarfile.TarInfo(_path) - member.type = tarfile.DIRTYPE + + _realpath = self._resolve(_path) + if _realpath is None: + raise errors.ResourceNotFound(path) + + implicit = False + member = self._directory_entries[_realpath] raw_info["basic"] = { "name": basename(self._decode(member.name)), - "is_dir": member.isdir(), + "is_dir": self.isdir(_path), # is_dir should follow symlinks } if "link" in namespaces: - raw_info["link"] = { - "target": self._decode(member.linkname) if member.issym() else None - } + if member.issym(): + target = join( + dirname(self._decode(member.name)), + self._decode(member.linkname), + ) + else: + target = None + raw_info["link"] = {"target": target} if "details" in namespaces: raw_info["details"] = { "size": member.size, @@ -381,23 +442,29 @@ def getinfo(self, path, namespaces=None): def isdir(self, path): _path = relpath(self.validatepath(path)) - try: - return self._directory_entries[_path].isdir() - except KeyError: - return any(isbase(_path, name) for name in self._directory_entries) + realpath = self._resolve(_path) + if realpath is not None: + entry = self._directory_entries[realpath] + return self._follow_symlink(entry).isdir() + else: + return False def isfile(self, path): _path = relpath(self.validatepath(path)) - try: - return self._directory_entries[_path].isfile() - except KeyError: + realpath = self._resolve(_path) + if realpath is not None: + entry = self._directory_entries[realpath] + return self._follow_symlink(entry).isfile() + else: return False def islink(self, path): _path = relpath(self.validatepath(path)) - try: - return self._directory_entries[_path].issym() - except KeyError: + realpath = self._resolve(_path) + if realpath is not None: + entry = self._directory_entries[realpath] + return entry.issym() + else: return False def setinfo(self, path, info): @@ -409,13 +476,28 @@ def listdir(self, path): # type: (Text) -> List[Text] _path = relpath(self.validatepath(path)) - if not self.gettype(path) is ResourceType.directory: - raise errors.DirectoryExpected(path) + # check the given path exists + realpath = self._resolve(_path) + if realpath is None: + raise errors.ResourceNotFound(path) + elif realpath: + target = self._follow_symlink(self._directory_entries[realpath]) + # check the path is either a symlink mapping to a directory or a directory + if target.isdir(): + base = target.name + elif target.issym(): + base = target.linkname + else: + raise errors.DirectoryExpected(path) + else: + base = "" + # find all entries in the actual directory children = ( - frombase(_path, n) for n in self._directory_entries if isbase(_path, n) + frombase(base, n) for n in self._directory_entries if isbase(base, n) ) content = (parts(child)[1] for child in children if relpath(child)) + return list(OrderedDict.fromkeys(content)) def makedir( @@ -432,17 +514,18 @@ def openbin(self, path, mode="r", buffering=-1, **options): # type: (Text, Text, int, **Any) -> BinaryIO _path = relpath(self.validatepath(path)) + # check the requested mode is only a reading mode if "w" in mode or "+" in mode or "a" in mode: raise errors.ResourceReadOnly(path) - try: - member = self._directory_entries[_path] - except KeyError: - six.raise_from(errors.ResourceNotFound(path), None) + # check the path actually resolves after following symlinks + _realpath = self._resolve(_path) + if _realpath is None: + raise errors.ResourceNotFound(path) - # TarFile.extractfile returns None if the entry is + # TarFile.extractfile returns None if the entry is not a file # neither a file nor a symlink - reader = self._tar.extractfile(member) + reader = self._tar.extractfile(self._directory_entries[_realpath]) if reader is None: raise errors.FileExpected(path) diff --git a/tests/test_tarfs.py b/tests/test_tarfs.py index a90dc0ea..3e6bab72 100644 --- a/tests/test_tarfs.py +++ b/tests/test_tarfs.py @@ -1,13 +1,13 @@ # -*- encoding: UTF-8 from __future__ import unicode_literals -import io import os import six import tarfile import tempfile import unittest import pytest +from six import BytesIO from fs import tarfs from fs.enums import ResourceType @@ -223,8 +223,8 @@ def tearDownClass(cls): def setUp(self): self.tempfile = self.tmpfs.open("test.tar", "wb+") with tarfile.open(mode="w", fileobj=self.tempfile) as tf: - tf.addfile(tarfile.TarInfo("."), io.StringIO()) - tf.addfile(tarfile.TarInfo("../foo.txt"), io.StringIO()) + tf.addfile(tarfile.TarInfo(".")) + tf.addfile(tarfile.TarInfo("../foo.txt")) self.tempfile.seek(0) self.fs = tarfs.TarFS(self.tempfile) @@ -237,8 +237,7 @@ def test_listdir(self): class TestImplicitDirectories(unittest.TestCase): - """Regression tests for #160. - """ + """Regression tests for #160.""" @classmethod def setUpClass(cls): @@ -251,12 +250,12 @@ def tearDownClass(cls): def setUp(self): self.tempfile = self.tmpfs.open("test.tar", "wb+") with tarfile.open(mode="w", fileobj=self.tempfile) as tf: - tf.addfile(tarfile.TarInfo("foo/bar/baz/spam.txt"), io.StringIO()) - tf.addfile(tarfile.TarInfo("./foo/eggs.bin"), io.StringIO()) - tf.addfile(tarfile.TarInfo("./foo/yolk/beans.txt"), io.StringIO()) + tf.addfile(tarfile.TarInfo("foo/bar/baz/spam.txt")) + tf.addfile(tarfile.TarInfo("./foo/eggs.bin")) + tf.addfile(tarfile.TarInfo("./foo/yolk/beans.txt")) info = tarfile.TarInfo("foo/yolk") info.type = tarfile.DIRTYPE - tf.addfile(info, io.BytesIO()) + tf.addfile(info) self.tempfile.seek(0) self.fs = tarfs.TarFS(self.tempfile) @@ -301,6 +300,121 @@ def test_getinfo(self): self.assertIs(info.type, ResourceType.directory) +class TestSymlinks(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.tmpfs = open_fs("temp://") + + @classmethod + def tearDownClass(cls): + cls.tmpfs.close() + + def setUp(self): + def _info(name, **kwargs): + info = tarfile.TarInfo(name) + for k, v in kwargs.items(): + setattr(info, k, v) + return info + + # /foo + # /foo/bar.txt + # /foo/baz.txt -> /foo/bar.txt + # /spam -> /foo + # /eggs + # /eggs/yolk -> /spam + + self.tempfile = self.tmpfs.open("test.tar", "wb+") + with tarfile.open(mode="w", fileobj=self.tempfile) as tf: + tf.addfile(_info("foo", type=tarfile.DIRTYPE)) + buff = BytesIO(b"hello") + tf.addfile(_info("foo/bar.txt", size=len(buff.getvalue())), buff) + tf.addfile(_info("foo/baz.txt", type=tarfile.SYMTYPE, linkname="bar.txt")) + tf.addfile(_info("spam", type=tarfile.SYMTYPE, linkname="foo")) + tf.addfile(_info("eggs", type=tarfile.DIRTYPE)) + tf.addfile(_info("eggs/yolk", type=tarfile.SYMTYPE, linkname="../spam")) + self.tempfile.seek(0) + self.fs = tarfs.TarFS(self.tempfile) + + def tearDown(self): + self.fs.close() + self.tempfile.close() + + def test_openbin(self): + # read an actual file + with self.fs.openbin("foo/bar.txt") as bar: + self.assertEqual(bar.read(), b"hello") + # read a link to an actual file + with self.fs.openbin("foo/baz.txt") as baz: + self.assertEqual(baz.read(), b"hello") + # read an actual file via a linked directory + with self.fs.openbin("spam/bar.txt") as bar: + self.assertEqual(bar.read(), b"hello") + # read a link via a linked directory + with self.fs.openbin("spam/baz.txt") as baz: + self.assertEqual(baz.read(), b"hello") + + def test_isfile(self): + self.assertFalse(self.fs.isfile("foo")) + self.assertFalse(self.fs.isfile("spam")) + self.assertFalse(self.fs.isfile("eggs")) + self.assertFalse(self.fs.isfile("eggs/yolk")) + self.assertTrue(self.fs.isfile("foo/bar.txt")) + self.assertTrue(self.fs.isfile("foo/baz.txt")) + self.assertTrue(self.fs.isfile("eggs/yolk/bar.txt")) + self.assertTrue(self.fs.isfile("eggs/yolk/baz.txt")) + + def test_isdir(self): + self.assertTrue(self.fs.isdir("foo")) + self.assertTrue(self.fs.isdir("spam")) + self.assertTrue(self.fs.isdir("eggs/yolk")) + self.assertFalse(self.fs.isdir("foo/bar.txt")) + self.assertFalse(self.fs.isdir("foo/baz.txt")) + self.assertFalse(self.fs.isdir("eggs/yolk/bar.txt")) + self.assertFalse(self.fs.isdir("eggs/yolk/baz.txt")) + + def test_islink(self): + self.assertFalse(self.fs.islink("foo")) + self.assertTrue(self.fs.islink("spam")) + self.assertTrue(self.fs.islink("eggs/yolk")) + self.assertFalse(self.fs.islink("foo/bar.txt")) + self.assertTrue(self.fs.islink("foo/baz.txt")) + self.assertFalse(self.fs.islink("eggs/yolk/bar.txt")) + self.assertTrue(self.fs.islink("eggs/yolk/baz.txt")) + + def test_getinfo(self): + file_info = self.fs.getinfo("foo/bar.txt", namespaces=("details", "link")) + self.assertIn("details", file_info.namespaces) + self.assertIn("link", file_info.namespaces) + self.assertFalse(file_info.is_dir) + self.assertIs(file_info.target, None) + self.assertEqual(file_info.type, ResourceType.file) + + link_info = self.fs.getinfo("foo/baz.txt", namespaces=("details", "link")) + self.assertIn("details", link_info.namespaces) + self.assertIn("link", link_info.namespaces) + self.assertFalse(link_info.is_dir) + self.assertEqual(link_info.target, "foo/bar.txt") + self.assertEqual(link_info.type, ResourceType.symlink) + + dir_info = self.fs.getinfo("foo", namespaces=("details", "link")) + self.assertIn("details", dir_info.namespaces) + self.assertIn("link", dir_info.namespaces) + self.assertTrue(dir_info.is_dir) + self.assertEqual(dir_info.target, None) + self.assertEqual(dir_info.type, ResourceType.directory) + + dirlink_info = self.fs.getinfo("spam", namespaces=("details", "link")) + self.assertIn("details", dirlink_info.namespaces) + self.assertIn("link", dirlink_info.namespaces) + self.assertTrue(dirlink_info.is_dir) + self.assertEqual(dirlink_info.target, "foo") + self.assertEqual(dirlink_info.type, ResourceType.symlink) + + def test_listdir(self): + self.assertEqual(sorted(self.fs.listdir("foo")), ["bar.txt", "baz.txt"]) + self.assertEqual(sorted(self.fs.listdir("spam")), ["bar.txt", "baz.txt"]) + + class TestReadTarFSMem(TestReadTarFS): def make_source_fs(self): return open_fs("mem://")