diff --git a/.gitattributes b/.gitattributes index 63cf10a..5d87832 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1 @@ -tests/data/basic.wim.gz filter=lfs diff=lfs merge=lfs -text +tests/data/* filter=lfs diff=lfs merge=lfs -text diff --git a/dissect/archive/c_vbk.py b/dissect/archive/c_vbk.py new file mode 100644 index 0000000..966336c --- /dev/null +++ b/dissect/archive/c_vbk.py @@ -0,0 +1,290 @@ +from dissect.cstruct import cstruct + +vbk_def = """ +#define PAGE_SIZE 4096 + +/* Storage header */ + +struct StorageHeader { + uint32 FormatVersion; /* 0x0000 */ + uint32 Initialized; /* 0x0004 */ + uint32 DigestTypeLength; /* 0x0008 */ + char DigestType[251]; /* 0x000C */ + uint32 SnapshotSlotFormat; /* 0x0107 format > 5 -> crc32c */ + uint32 StandardBlockSize; /* 0x010B */ + uint8 ClusterAlign; /* 0x010F */ + char Unk0[16]; /* 0x0120 */ + char ExternalStorageId[16]; /* 0x0130 */ +}; + +/* Snapshot header */ + +struct SnapshotSlotHeader { + uint32 CRC; + uint32 ContainsSnapshot; +}; + +struct DirectoryRootRecord { + int64 RootPage; /* Root page of the directory */ + uint64 Count; /* Number of children */ +}; + +struct BlocksStoreHeader { + int64 RootPage; /* Root of the blocks store */ + uint64 Count; /* Number of blocks store entries */ + int64 FreeRootPage; /* Root of the free blocks tree */ + int64 DeduplicationRootPage; /* Root of the deduplication tree */ + int64 Unk0; + int64 Unk1; +}; + +struct CryptoStoreRecord { + int64 RootPage; /* Root of the crypto store */ +}; + +struct SnapshotDescriptor { + uint64 Version; /* Acts as a sequence number, highest is active slot */ + uint64 StorageEOF; /* End of file, aka file size */ + uint32 BanksCount; /* Number of banks */ + DirectoryRootRecord DirectoryRoot; /* Directory root record */ + BlocksStoreHeader BlocksStore; /* Blocks store header */ + CryptoStoreRecord CryptoStore; /* Crypto store record */ + uint64 Unk0; + uint64 Unk1; +}; + +struct BankDescriptor { + uint32 CRC; + uint64 Offset; + uint32 Size; 
+}; + +struct BanksGrain { + uint32 MaxBanks; + uint32 StoredBanks; + // BankDescriptor Banks[StoredBanks]; +}; + +/* Block headers */ + +struct BankHeader { + uint16 PageCount; + uint16 Flags; + char Unk0[3064]; + uint64 Unk1; + char Unk2[1020]; +}; + +struct BankHeaderV71 { + uint16 PageCount; + uint16 Flags; /* 2 == encrypted */ + char Unk0[3072]; + char KeySetId[16]; + char Unk1[16]; + char Unk2[16]; + uint32 Unk3; + char Unk4[968]; +}; + +struct MetaBlobHeader { + int64 NextPage; + int32 Unk0; +}; + +struct Lz4BlockHeader { + uint32 Magic; /* 0xF800000F */ + uint32 CRC; /* CRC32C of the compressed data */ + uint32 SourceSize; +}; + +/* DirItem headers */ +struct BlocksVectorHeader { + uint64 RootPage; + uint64 Count; +}; + +struct SubFolderHeader { + uint64 RootPage; /* 0x94 */ + uint32 Count; /* 0x9C */ + char Data[32]; /* 0xA0 */ +}; /* 0xC0 */ + +struct ExtFibHeader { + uint16 UpdateInProgress; /* 0x94 */ + uint8 Unk3; /* 0x96 */ + uint8 Format; /* 0x97 Bit 3 == 1 */ + BlocksVectorHeader BlocksVector; /* 0x98 */ + uint64 FibSize; /* 0xA8 */ + uint64 Size; /* 0xB0 */ + uint8 FsObjAttachState; /* 0xB8 */ + char Data[7]; /* 0xB9 */ +}; /* 0xC0 */ + +struct IntFibHeader { + uint16 UpdateInProgress; /* 0x94 */ + uint8 Unk3; /* 0x96 */ + uint8 Format; /* 0x97 Bit 3 == 1 */ + BlocksVectorHeader BlocksVector; /* 0x98 */ + uint64 FibSize; /* 0xA8 */ + uint64 Size; /* 0xB0 */ + uint8 FsObjAttachState; /* 0xB8 */ + char Data[7]; /* 0xB9 */ +}; /* 0xC0 */ + +struct PatchHeader { + uint32 Unk0; /* 0x94 */ + BlocksVectorHeader BlocksVector; /* 0x98 */ + uint64 FibSize; /* 0xA8 Source file size */ + uint64 Unk4; /* 0xB0 */ + char Data[8]; /* 0xB8 */ +}; /* 0xC0 */ + +struct IncrementHeader { + uint32 Unk0; /* 0x94 */ + BlocksVectorHeader BlocksVector; /* 0x98 */ + uint64 FibSize; /* 0xA8 Original FIB size */ + uint64 Unk4; /* 0xB0 */ + char Data[8]; /* 0xB8 */ +}; /* 0xC0 */ + +enum DirItemType : uint32 { + None = 0, + SubFolder = 1, + ExtFib = 2, + IntFib = 3, + Patch = 
4, + Increment = 5, +}; + +struct DirItemRecord { + DirItemType Type; /* 0x00 */ + uint32 NameLength; /* 0x04 */ + char Name[128]; /* 0x08 */ + int64 PropsRootPage; /* 0x88 */ + uint32 Unk1; /* 0x90 */ + union { /* 0x94 */ + char Data[44]; + SubFolderHeader SubFolder; + ExtFibHeader ExtFib; + IntFibHeader IntFib; + PatchHeader Patch; + IncrementHeader Increment; + }; +}; + +/* Block descriptors */ + +flag BlockFlags : uint8 { + None = 0x00, + Updated = 0x01, + CommitInProgress = 0x02, +}; + +enum BlockLocationType : uint8 { + Normal = 0x00, + Sparse = 0x01, + Reserved = 0x02, + Archived = 0x03, /* CompressedSize | (CompressionType << 32) */ + BlockInBlob = 0x04, /* BlockId? & 0x3FFFFFF | (BlobId << 26) | ((Offset >> 9) << 42) */ + BlockInBlobReserved = 0x05, /* BlockId? | 0xFFFFFFFFFC000000 */ +}; + +enum CompressionType : int8 { + Plain = -1, + RL = 2, + ZLH = 3, + ZLL = 4, + LZ4 = 7, +}; + +struct MetaTableDescriptor { + int64 RootPage; + uint64 BlockSize; + uint64 Count; +}; + +struct StgBlockDescriptor { + uint8 Format; /* Format != 4 == legacy */ + uint32 UsageCounter; + uint64 Offset; + uint32 AllocatedSize; + uint8 Deduplication; + char Digest[16]; + CompressionType CompressionType; + uint8 Unk0; + uint32 CompressedSize; + uint32 SourceSize; +}; + +struct StgBlockDescriptorV7 { + uint8 Format; /* Format != 4 == legacy */ + uint32 UsageCounter; + uint64 Offset; + uint32 AllocatedSize; + uint8 Deduplication; + char Digest[16]; + CompressionType CompressionType; + uint8 Unk0; + uint32 CompressedSize; + uint32 SourceSize; + char KeySetId[16]; +}; + +struct FibBlockDescriptor { + uint32 BlockSize; + BlockLocationType Type; + char Digest[16]; + // union { + // struct { + // uint32 ArchiveUsedSize; + // uint8 ArchiveCompressionType; + // uint8 Unk3; + // uint16 Unk4; + // } Archived; + // uint64 Offset; + // }; + uint64 BlockId; /* For performance reasons we just put a uint64 here, but this is actually a union */ + BlockFlags Flags; +}; + +struct 
FibBlockDescriptorV7 { + uint32 BlockSize; + BlockLocationType Type; + char Digest[16]; + // union { + // struct { + // uint32 ArchiveUsedSize; + // uint8 ArchiveCompressionType; + // uint8 Unk3; + // uint16 Unk4; + // } Archived; + // uint64 Offset; + // }; + uint64 BlockId; /* For performance reasons we just put a uint64 here, but this is actually a union */ + BlockFlags Flags; + char KeySetId[16]; +}; + +struct PatchBlockDescriptor { +}; + +struct PatchBlockDescriptorV7 { +}; + +/* Property dictionary */ + +enum PropertyType : int32 { + UInt32 = 1, + UInt64 = 2, + AString = 3, + WString = 4, + Binary = 5, + Boolean = 6, + End = -1, +}; +""" # noqa: E501 + +c_vbk = cstruct().load(vbk_def) + +PAGE_SIZE = c_vbk.PAGE_SIZE +"""VBK page size.""" diff --git a/dissect/archive/c_vma.py b/dissect/archive/c_vma.py new file mode 100644 index 0000000..a071709 --- /dev/null +++ b/dissect/archive/c_vma.py @@ -0,0 +1,60 @@ +from dissect.cstruct import cstruct + +vma_def = """ +#define VMA_BLOCK_BITS 12 +#define VMA_BLOCK_SIZE (1 << VMA_BLOCK_BITS) +#define VMA_CLUSTER_BITS (VMA_BLOCK_BITS + 4) +#define VMA_CLUSTER_SIZE (1 << VMA_CLUSTER_BITS) + +#define VMA_EXTENT_HEADER_SIZE 512 +#define VMA_BLOCKS_PER_EXTENT 59 +#define VMA_MAX_CONFIGS 256 + +#define VMA_MAX_EXTENT_SIZE (VMA_EXTENT_HEADER_SIZE + VMA_CLUSTER_SIZE * VMA_BLOCKS_PER_EXTENT) + +/* File Format Definitions */ + +struct VmaDeviceInfoHeader { + uint32 devname_ptr; /* offset into blob_buffer table */ + uint32 reserved0; + uint64 size; /* device size in bytes */ + uint64 reserved1; + uint64 reserved2; +}; + +struct VmaHeader { + char magic[4]; + uint32 version; + char uuid[16]; + int64 ctime; + char md5sum[16]; + + uint32 blob_buffer_offset; + uint32 blob_buffer_size; + uint32 header_size; + + char _reserved1[1984]; + + uint32 config_names[VMA_MAX_CONFIGS]; /* offset into blob_buffer table */ + uint32 config_data[VMA_MAX_CONFIGS]; /* offset into blob_buffer table */ + + char _reserved2[4]; + + VmaDeviceInfoHeader 
dev_info[256]; +}; + +struct VmaExtentHeader { + char magic[4]; + uint16 reserved1; + uint16 block_count; + char uuid[16]; + char md5sum[16]; + uint64 blockinfo[VMA_BLOCKS_PER_EXTENT]; +}; +""" + +c_vma = cstruct(endian=">").load(vma_def) + + +VMA_MAGIC = b"VMA\x00" +VMA_EXTENT_MAGIC = b"VMAE" diff --git a/dissect/archive/exceptions.py b/dissect/archive/exceptions.py index 7f7fef8..ec1c764 100644 --- a/dissect/archive/exceptions.py +++ b/dissect/archive/exceptions.py @@ -6,11 +6,15 @@ class InvalidHeaderError(Error): pass -class NotADirectoryError(Error): +class FileNotFoundError(Error, FileNotFoundError): pass -class FileNotFoundError(Error): +class IsADirectoryError(Error, IsADirectoryError): + pass + + +class NotADirectoryError(Error, NotADirectoryError): pass diff --git a/dissect/archive/tools/__init__.py b/dissect/archive/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dissect/archive/tools/backup.py b/dissect/archive/tools/backup.py new file mode 100644 index 0000000..fc4fc5c --- /dev/null +++ b/dissect/archive/tools/backup.py @@ -0,0 +1,208 @@ +import argparse +import logging +import sys +from pathlib import Path + +from dissect.archive.c_vma import c_vma +from dissect.archive.vbk import VBK, DirItem +from dissect.archive.vma import VMA, _iter_mask + +try: + from rich.logging import RichHandler + from rich.progress import ( + BarColumn, + DownloadColumn, + Progress, + TextColumn, + TimeRemainingColumn, + TransferSpeedColumn, + ) + + progress = Progress( + TextColumn("[bold blue]{task.fields[filename]}", justify="right"), + BarColumn(bar_width=None), + "[progress.percentage]{task.percentage:>3.1f}%", + "•", + DownloadColumn(), + "•", + TransferSpeedColumn(), + "•", + TimeRemainingColumn(), + transient=True, + ) +except ImportError: + RichHandler = logging.StreamHandler + + class Progress: + def __init__(self): + self.filename = None + self.total = None + + self._task_id = 0 + self._info = {} + + def __enter__(self): + pass + + def 
__exit__(self, *args, **kwargs) -> None: + sys.stderr.write("\n") + sys.stderr.flush() + + def add_task(self, name: str, filename: str, total: int, **kwargs) -> int: + task_id = self._task_id + self._task_id += 1 + + self._info[task_id] = {"filename": filename, "total": total, "position": 0} + + return task_id + + def update(self, task_id: int, advance: int) -> None: + self._info[task_id]["position"] += advance + self.draw() + + def draw(self) -> None: + infos = [] + for info in self._info.values(): + infos.append(f"{info['filename']} {(info['position'] / info['total']) * 100:0.2f}%") + sys.stderr.write("\r" + " | ".join(infos)) + sys.stderr.flush() + + progress = Progress() + + +log = logging.getLogger(__name__) + + +def setup_logging(logger: logging.Logger, verbosity: int) -> None: + if verbosity == 1: + level = logging.ERROR + elif verbosity == 2: + level = logging.WARNING + elif verbosity == 3: + level = logging.INFO + elif verbosity >= 4: + level = logging.DEBUG + else: + level = logging.CRITICAL + + handler = RichHandler() + handler.setFormatter(logging.Formatter("%(message)s")) + handler.setLevel(level) + logger.addHandler(handler) + logger.setLevel(level) + + +def extract_vma(vma: VMA, out_dir: Path) -> None: + log.info("Extracting config files") + for config_name, config_data in vma.configs().items(): + out_file = out_dir.joinpath(config_name) + + log.info("%s -> %s (%d bytes)", config_name, out_file, len(config_data)) + out_file.write_bytes(config_data) + + log.info("Extracting device data") + tasks = {} + handles = {} + for device in vma.devices(): + task_id = progress.add_task("extract", filename=device.name, total=device.size) + tasks[device.id] = task_id + handles[device.id] = out_dir.joinpath(device.name).open("wb") + + with progress: + try: + for extent in vma.extents(): + vma.fh.seek(extent.data_offset) + for block_info in extent.header.blockinfo: + cluster_num = block_info & 0xFFFFFFFF + dev_id = (block_info >> 32) & 0xFF + mask = block_info >> 
(32 + 16) + + if dev_id == 0: + continue + + fh_out = handles[dev_id] + fh_out.seek(cluster_num * c_vma.VMA_CLUSTER_SIZE) + + if mask == 0xFFFF: + fh_out.write(vma.fh.read(c_vma.VMA_CLUSTER_SIZE)) + elif mask == 0: + fh_out.write(b"\x00" * c_vma.VMA_CLUSTER_SIZE) + else: + for allocated, count in _iter_mask(mask, 16): + if allocated: + fh_out.write(vma.fh.read(count * c_vma.VMA_BLOCK_SIZE)) + else: + fh_out.write(b"\x00" * count * c_vma.VMA_BLOCK_SIZE) + + progress.update(tasks[dev_id], advance=c_vma.VMA_CLUSTER_SIZE) + except Exception as e: + log.exception("Exception during extraction") + log.debug("", exc_info=e) + finally: + for handle in handles.values(): + handle.close() + + +def extract_vbk(vbk: VBK, out_dir: Path) -> None: + def extract_directory(directory: DirItem, out_dir: Path) -> None: + out_dir.mkdir(exist_ok=True) + for entry in directory.iterdir(): + out_path = out_dir.joinpath(entry.name) + if entry.is_dir(): + extract_directory(entry, out_path) + else: + task_id = progress.add_task("extract", filename=entry.name, total=entry.size) + with entry.open() as fh_in, out_path.open("wb") as fh_out: + for chunk in iter(lambda: fh_in.read(vbk.block_size), b""): + fh_out.write(chunk) + progress.update(task_id, advance=len(chunk)) + + with progress: + try: + extract_directory(vbk.get("/"), out_dir) + except Exception as e: + log.exception("Exception during extraction") + log.debug("", exc_info=e) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Hypervisor backup extractor") + parser.add_argument("input", type=Path, help="path to backup file") + parser.add_argument("-o", "--output", type=Path, required=True, help="path to output directory") + parser.add_argument("-v", "--verbose", action="count", default=3, help="increase output verbosity") + args = parser.parse_args() + + setup_logging(log, args.verbose) + + in_file = args.input.resolve() + if not in_file.exists(): + log.error("Input file does not exist: %s", in_file) + parser.exit() + 
+ out_dir = args.output.resolve() + if not out_dir.exists(): + log.error("Output path does not exist: %s", out_dir) + parser.exit() + + if not out_dir.is_dir(): + log.error("Output path is not a directory: %s", out_dir) + parser.exit() + + with in_file.open("rb") as fh: + for klass, extract in ((VMA, extract_vma), (VBK, extract_vbk)): + try: + backup = klass(fh) + extract(backup, out_dir) + break + except Exception as e: + log.debug("Failed to extract using %s", klass.__name__, exc_info=e) + else: + log.error("Unknown backup format") + parser.exit() + + +if __name__ == "__main__": + try: + sys.exit(main()) + except KeyboardInterrupt: + pass diff --git a/dissect/archive/vbk.py b/dissect/archive/vbk.py new file mode 100644 index 0000000..5a6cc3b --- /dev/null +++ b/dissect/archive/vbk.py @@ -0,0 +1,1071 @@ +# References: +# - Veeam extract utility +# - Veeam agent +from __future__ import annotations + +from functools import cached_property, lru_cache +from io import BytesIO +from typing import BinaryIO, Generic, Iterator, TypeVar +from zlib import crc32 + +from dissect.cstruct import Structure +from dissect.util.compression import lz4 +from dissect.util.crc32c import crc32c +from dissect.util.stream import AlignedStream +from dissect.util.xmemoryview import xmemoryview + +from dissect.archive.c_vbk import PAGE_SIZE, c_vbk +from dissect.archive.exceptions import Error + + +class VBKError(Error): + pass + + +class NotAFileError(VBKError): + pass + + +class NotADirectoryError(VBKError): + pass + + +class VBK: + """Veeam Backup (VBK) file implementation. + + References: + - CMeta + - CStgFormat + + Notes: + - **TODO**: Encryption + - **TODO**: Incrememental backups + + Args: + fh: The file handle of the VBK file to read. + verify: Whether to verify checksums. 
+ """ + + def __init__(self, fh: BinaryIO, verify: bool = True): + self.fh = fh + + fh.seek(0) + self.header = c_vbk.StorageHeader(fh) + + self.format_version = self.header.FormatVersion + self.block_size = self.header.StandardBlockSize + + # First slot starts at PAGE_SIZE because StorageHeader is considered to be PAGE_SIZE large + self.slot1 = SnapshotSlot(self, PAGE_SIZE) + # Second slot starts at PAGE_SIZE + slot1 size + self.slot2 = SnapshotSlot(self, PAGE_SIZE + self.slot1.size) + + populated_slots = filter(lambda slot: slot.header.ContainsSnapshot, (self.slot1, self.slot2)) + + if verify: + populated_slots = filter(lambda slot: slot.verify(), populated_slots) + + if not (active_slot := max(populated_slots, key=lambda slot: slot.descriptor.Version, default=None)): + raise VBKError("No active VBK metadata slot found") + + self.active_slot: SnapshotSlot = active_slot + + self.root = RootDirectory( + self, + self.active_slot.descriptor.DirectoryRoot.RootPage, + self.active_slot.descriptor.DirectoryRoot.Count, + ) + self.block_store = MetaVector( + self, + StgBlockDescriptorV7 if self.is_v7 else StgBlockDescriptor, + self.active_slot.descriptor.BlocksStore.RootPage, + self.active_slot.descriptor.BlocksStore.Count, + ) + + def is_v7(self) -> bool: + return self.format_version == 7 or self.format_version == 0x10008 or self.format_version >= 9 + + def page(self, idx: int) -> bytes: + """Read a page from the VBK file. + + Args: + idx: The index of the page to read. + """ + return self.active_slot.page(idx) + + def get_meta_blob(self, page: int) -> MetaBlob: + """Read a meta blob from the VBK file. + + Args: + page: The starting page number of the meta blob to read. 
+ """ + return self.active_slot._get_meta_blob(page) + + def get(self, path: str, item: DirItem | None = None) -> DirItem: + """Get a directory item from the VBK file.""" + item = item or self.root + + for part in path.split("/"): + if not part: + continue + + for entry in item.iterdir(): + if entry.name == part: + item = entry + break + else: + raise FileNotFoundError(f"File not found: {path}") + + return item + + +class SnapshotSlot: + """A snapshot slot in the VBK file. + + References: + - CSlotHdr + - SSnapshotDescriptor + - CSnapshotSlot + - CMetaStore + - CMetaObjs + - SMetaObjRefs + - SDirRootRec + - SBlocksStoreHdr + + Notes: + - **TODO**: Free blocks index (CFreeBlocksIndex, SFreeBlockIndexItem) + - **TODO**: Deduplication index (CDedupIndex, SDedupIndexItem) + - **TODO**: Crypto store (CCryptoStore, SCryptoStoreRec) + + Args: + vbk: The VBK object that the snapshot slot is part of. + offset: The offset of the snapshot slot in the file. + """ + + def __init__(self, vbk: VBK, offset: int): + self.vbk = vbk + self.offset = offset + + self.vbk.fh.seek(offset) + self.header = c_vbk.SnapshotSlotHeader(vbk.fh) + self.descriptor = None + self.grain = None + self.valid_max_banks = 0 + self.banks = [] + + if self.header.ContainsSnapshot: + self.descriptor = c_vbk.SnapshotDescriptor(vbk.fh) + self.grain = c_vbk.BanksGrain(vbk.fh) + + self.valid_max_banks = 0xF8 if self.vbk.header.SnapshotSlotFormat == 0 else 0x7F00 + + if self.grain.MaxBanks > self.valid_max_banks: + raise VBKError("Invalid SnapshotSlot: MaxBanks is not valid") + if self.grain.StoredBanks > self.grain.MaxBanks: + raise VBKError("Invalid SnapshotSlot: StoredBanks is greater than MaxBanks") + + self.banks = [ + Bank(self.vbk, entry.Offset, entry.Size) + for entry in c_vbk.BankDescriptor[self.grain.StoredBanks](vbk.fh) + ] + + def __repr__(self) -> str: + return f"" + + @cached_property + def size(self) -> int: + """The size of the snapshot slot in the file.""" + slot_size = 
len(c_vbk.SnapshotSlotHeader) + len(c_vbk.SnapshotDescriptor) + if self.header.ContainsSnapshot: + slot_size += self.grain.MaxBanks * len(c_vbk.BankDescriptor) + else: + slot_size += self.valid_max_banks * len(c_vbk.BankDescriptor) + + if slot_size & (PAGE_SIZE - 1): + # Round to next page boundary + slot_size = (slot_size & ~(PAGE_SIZE - 1)) + PAGE_SIZE + + return slot_size + + def verify(self) -> bool: + """Verify the snapshot slot's CRC. + + Args: + crc: The CRC to verify against. + """ + if not self.header.ContainsSnapshot: + return False + + crc = crc32c if self.vbk.header.SnapshotSlotFormat > 5 else crc32 + + # Remainder of SnapshotSlotHeader + SnapshotDescriptor + BanksGrain + length = 4 + len(c_vbk.SnapshotDescriptor) + 8 + self.grain.MaxBanks * len(c_vbk.BankDescriptor) + + self.vbk.fh.seek(self.offset + 4) # Skip CRC + return crc(self.vbk.fh.read(length)) == self.header.CRC + + def page(self, page: int) -> bytes: + """Read a page from the snapshot slot. + + Args: + idx: The page number to read. + """ + return self.banks[page >> 32].page(page & 0xFFFFFFFF) + + def _get_meta_blob(self, page: int) -> MetaBlob: + """Get a meta blob from the snapshot slot. + + Args: + page: The page of the first page in the meta blob. + """ + return MetaBlob(self, page) + + +class Bank: + """A bank in the snapshot slot. A bank is a collection of pages. + + References: + - SBankHdr + - CBankHdrPage + + Args: + vbk: The VBK object that the bank is part of. + offset: The offset of the bank in the file. + size: The size of the bank in the file. + """ + + def __init__(self, vbk: VBK, offset: int, size: int): + self.vbk = vbk + self.offset = offset + self.size = size + + self.vbk.fh.seek(offset) + self.header = c_vbk.BankHeader(vbk.fh) + + self.page = lru_cache(128)(self.page) + + def __repr__(self) -> str: + return f"" + + def verify(self, crc: int) -> bool: + """Verify the bank's CRC. + + Args: + crc: The CRC to verify against. 
+ """ + crc = crc32c if self.vbk.format_version >= 12 and self.vbk.format_version != 0x10008 else crc32 + + self.vbk.fh.seek(self.offset) + return crc(self.vbk.fh.read(self.size)) == crc + + def page(self, page: int) -> memoryview: + """Read a page from the bank. + + Args: + page: The page number to read. + """ + # Data starts at PAGE_SIZE from bank offset + self.vbk.fh.seek(self.offset + PAGE_SIZE + (page * PAGE_SIZE)) + return memoryview(self.vbk.fh.read(PAGE_SIZE)) + + +class MetaItem: + """Base class for value types in a meta vector.""" + + __struct__: Structure = None + + def __init__(self, vbk: VBK, buf: bytes): + self.vbk = vbk + self.buf = buf + self.entry = None + + if self.__struct__: + self.entry = self.__struct__(buf) + + @classmethod + def from_bytes(cls, vbk: VBK, buf: bytes) -> MetaItem: + return cls(vbk, buf) + + +class DirItem(MetaItem): + """Base class for directory items. + + References: + - SDirItemRec + - CDir + """ + + __struct__ = c_vbk.DirItemRecord + + def __init__(self, vbk: VBK, buf: bytes): + super().__init__(vbk, buf) + self.name = self.entry.Name[: self.entry.NameLength].decode("utf-8") + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} name={self.name!r}>" + + @classmethod + def from_bytes( + cls, vbk: VBK, buf: bytes + ) -> SubFolderItem | ExtFibItem | IntFibItem | PatchItem | IncrementItem | DirItem: + cls_map = { + c_vbk.DirItemType.SubFolder: SubFolderItem, + c_vbk.DirItemType.ExtFib: ExtFibItem, + c_vbk.DirItemType.IntFib: IntFibItem, + c_vbk.DirItemType.Patch: PatchItem, + c_vbk.DirItemType.Increment: IncrementItem, + } + + type = c_vbk.DirItemType(buf[:4]) + return cls_map.get(type, cls)(vbk, buf) + + @cached_property + def type(self) -> c_vbk.DirItemType: + """The type of the directory item.""" + return self.entry.Type + + @cached_property + def size(self) -> int: + raise VBKError(f"Size not available for {self!r}") + + @cached_property + def properties(self) -> PropertiesDictionary | None: + """The 
properties of the directory item, if it has them.""" + if self.entry.PropsRootPage == -1: + return None + + return PropertiesDictionary(self.vbk, self.entry.PropsRootPage) + + def is_dir(self) -> bool: + """Return whether the directory item is a directory.""" + return False + + def is_file(self) -> bool: + """Return whether the directory item is a file.""" + return self.is_internal_file() or self.is_external_file() + + def is_internal_file(self) -> bool: + """Return whether the directory item is an internal file.""" + return False + + def is_external_file(self) -> bool: + """Return whether the directory item is an external file.""" + return False + + def listdir(self) -> dict[str, DirItem]: + """Return a dictionary of the items in the directory.""" + return {item.name: item for item in self.iterdir()} + + def iterdir(self) -> Iterator[DirItem]: + """Iterate over the items in the directory.""" + raise NotADirectoryError(f"{self!r} is not a directory") + + def open(self) -> BinaryIO: + """Open the file for reading.""" + raise NotAFileError(f"{self!r} is not a file") + + +class RootDirectory(DirItem): + """Special directory item for the root directory. Does not actually exist in the VBK file.""" + + def __init__(self, vbk: VBK, page: int, count: int): + super().__init__(vbk, b"\x00" * len(c_vbk.DirItemRecord)) + self.name = "/" + self.root = page + self.count = count + + def __repr__(self) -> str: + return f"" + + def is_dir(self) -> bool: + return True + + def iterdir(self) -> Iterator[DirItem]: + yield from MetaVector(self.vbk, DirItem, self.root, self.count) + + +class SubFolderItem(DirItem): + """Directory item for a subfolder (directory type). 
+ + References: + - CSubFolderHdr + - CFolderMeta + """ + + def __init__(self, vbk: VBK, buf: bytes): + super().__init__(vbk, buf) + self.root = self.entry.SubFolder.RootPage + self.count = self.entry.SubFolder.Count + + def __repr__(self) -> str: + return f"" + + def is_dir(self) -> bool: + return True + + def iterdir(self) -> Iterator[DirItem]: + yield from MetaVector(self.vbk, DirItem, self.root, self.count) + + +class ExtFibItem(DirItem): + """Directory item for an external file. + + References: + - SFibHdr + - CExtFibMeta + """ + + def __repr__(self) -> str: + return f"" + + @cached_property + def size(self) -> int: + return self.entry.ExtFib.FibSize + + def is_external_file(self) -> bool: + return True + + +class IntFibItem(DirItem): + """Directory item for an internal file. + + References: + - SFibHdr + - CIntFibMeta + """ + + def __init__(self, vbk: VBK, buf: bytes): + super().__init__(vbk, buf) + + def __repr__(self) -> str: + return f"" + + @cached_property + def size(self) -> int: + return self.entry.IntFib.FibSize + + def is_internal_file(self) -> bool: + return True + + def open(self) -> FibStream: + return FibStream( + self.vbk, + self.entry.IntFib.BlocksVector.RootPage, + self.entry.IntFib.BlocksVector.Count, + self.size, + ) + + +class PatchItem(DirItem): + """Directory item for a patch. + + Notes: + - **TODO**: SPatchHdr + - **TODO**: CPatchMeta + """ + + def __repr__(self) -> str: + return f"" + + @cached_property + def size(self) -> int: + return self.entry.Patch.FibSize + + +class IncrementItem(DirItem): + """Directory item for an increment. + + Notes: + - **TODO**: SIncrementHdr + - **TODO**: CIncrementMeta + """ + + def __repr__(self) -> str: + return f"" + + @cached_property + def size(self) -> int: + return self.entry.Increment.FibSize + + +class MetaTableDescriptor(MetaItem): + """A descriptor for a meta table in the VBK file. 
+ + References: + - SMetaTableDescriptor + """ + + __struct__ = c_vbk.MetaTableDescriptor + + def __repr__(self) -> str: + return f"" + + @cached_property + def page(self) -> int: + """The page number of the first page in the meta table.""" + return self.entry.RootPage + + @cached_property + def block_size(self) -> int: + """The block size of the meta table.""" + return self.entry.BlockSize + + @cached_property + def count(self) -> int: + """The number of entries in the meta table.""" + return self.entry.Count + + +class FibBlockDescriptor(MetaItem): + """A descriptor for a FIB (File In Backup) block in the VBK file. + + References: + - SFibBlockDescriptor + """ + + __struct__ = c_vbk.FibBlockDescriptor + + def __repr__(self) -> str: + return f"" + + def is_normal(self) -> bool: + """Return whether the block is a normal block.""" + return self.type == c_vbk.BlockLocationType.Normal + + def is_sparse(self) -> bool: + """Return whether the block is a sparse block.""" + return self.type == c_vbk.BlockLocationType.Sparse + + def is_reserved(self) -> bool: + """Return whether the block is a reserved block.""" + return self.type == c_vbk.BlockLocationType.Reserved + + def is_archived(self) -> bool: + """Return whether the block is an archived block. + + If the block is archived, the compressed size and compression type are stored in the block ID:: + + BlockId = CompressedSize | (CompressionType << 32) + + Notes: + - **TODO**: Verify the above + """ + return self.type == c_vbk.BlockLocationType.Archived + + def is_block_in_blob(self) -> bool: + """Return whether the block is a block in a blob. + + If the block is in a blob, the block ID, blob ID and offset are stored in the block ID:: + + BlockId = BlockId? & 0x3FFFFFF | (BlobId << 26) | ((Offset >> 9) << 42) + + Notes: + - **TODO**: Verify the above + """ + return self.type == c_vbk.BlockLocationType.BlockInBlob + + def is_block_in_blob_reserved(self) -> bool: + """Return whether the block is a reserved block in a blob. 
+ + If the block is a reserved block in a blob, the block ID is stored in the block ID:: + + BlockId = BlockId? | 0xFFFFFFFFFC000000 + + Notes: + - **TODO**: Verify the above + """ + return self.type == c_vbk.BlockLocationType.BlockInBlobReserved + + @cached_property + def block_size(self) -> int: + """The size of the block.""" + return self.entry.BlockSize + + @cached_property + def type(self) -> c_vbk.BlockLocationType: + """The type of the block.""" + return self.entry.Type + + @cached_property + def digest(self) -> bytes: + """The digest of the block.""" + return self.entry.Digest + + @cached_property + def block_id(self) -> int: + """The ID of the block.""" + return self.entry.BlockId + + @cached_property + def flags(self) -> c_vbk.BlockFlags: + """The flags of the block.""" + return self.entry.Flags + + +class FibBlockDescriptorV7(FibBlockDescriptor): + """A descriptor for a FIB (File In Backup) block in the VBK file. Version 7. + + References: + - SFibBlockDescriptorV7 + """ + + __struct__ = c_vbk.FibBlockDescriptorV7 + + def __repr__(self) -> str: + return f"" + + @cached_property + def keyset_id(self) -> bytes: + return self.entry.KeySetId + + +class StgBlockDescriptor(MetaItem): + """A descriptor for a storage block in the VBK file. + + References: + - SStgBlockDescriptor + """ + + __struct__ = c_vbk.StgBlockDescriptor + + def __repr__(self) -> str: + return ( + f"" + ) + + def is_legacy(self) -> bool: + """Return whether the block is a legacy block.""" + return self.format != 4 + + def is_data_block(self) -> bool: + """Return whether the block is a data block. + + A data block is a block that has a usage counter greater than 0. + """ + return self.usage_counter != 0 + + def is_dedup_block(self) -> bool: + """Return whether the block is a dedup block. + + Notes: + - **TODO**: What is this? 
+ """ + return self.deduplication != 0 + + def is_compressed(self) -> bool: + """Return whether the block is compressed.""" + return self.compression_type != c_vbk.CompressionType.Plain + + @cached_property + def format(self) -> int: + """The format of the block.""" + return self.entry.Format + + @cached_property + def usage_counter(self) -> int: + """The usage counter of the block.""" + return self.entry.UsageCounter + + @cached_property + def offset(self) -> int: + """The offset of the block.""" + return self.entry.Offset + + @cached_property + def allocated_size(self) -> int: + """The allocated size of the block.""" + return self.entry.AllocatedSize + + @cached_property + def deduplication(self) -> int: + """The deduplication of the block.""" + return self.entry.Deduplication + + @cached_property + def digest(self) -> bytes: + """The digest of the block.""" + return self.entry.Digest + + @cached_property + def compression_type(self) -> c_vbk.CompressionType: + """The compression type of the block.""" + return self.entry.CompressionType + + @cached_property + def compressed_size(self) -> int: + """The compressed size of the block.""" + return self.entry.CompressedSize + + @cached_property + def source_size(self) -> int: + """The source size of the block.""" + return self.entry.SourceSize + + +class StgBlockDescriptorV7(StgBlockDescriptor): + """A descriptor for a storage block in the VBK file. Version 7. + + References: + - SStgBlockDescriptorV7 + """ + + __struct__ = c_vbk.StgBlockDescriptorV7 + + def __repr__(self) -> str: + return ( + f"" + ) + + @cached_property + def keyset_id(self) -> bytes: + """The keyset ID of the block.""" + return self.entry.KeySetId + + +class PropertiesDictionary(dict): + """A dictionary of properties in the VBK file. + + References: + - CPropsDictionary + - CDirElemPropsRW + + Args: + vbk: The VBK object that the properties dictionary is part of. + page: The page number of the meta blob of the properties dictionary. 
+ """ + + def __init__(self, vbk: VBK, page: int): + self.vbk = vbk + self.page = page + + buf = BytesIO(self.vbk.get_meta_blob(page).data()) + buf.seek(len(c_vbk.MetaBlobHeader)) + + while True: + value_type = c_vbk.PropertyType(buf) + if value_type == c_vbk.PropertyType.End: + break + + name_length = c_vbk.uint32(buf) + name = buf.read(name_length).decode("utf-8") + + if value_type == c_vbk.PropertyType.UInt32: + value = c_vbk.uint32(buf) + elif value_type == c_vbk.PropertyType.UInt64: + value = c_vbk.uint64(buf) + elif value_type == c_vbk.PropertyType.AString: + value = buf.read(c_vbk.uint32(buf)).decode("utf-8") + elif value_type == c_vbk.PropertyType.WString: + value = buf.read(c_vbk.uint32(buf)).decode("utf-16-le") + elif value_type == c_vbk.PropertyType.Binary: + value = buf.read(c_vbk.uint32(buf)) + elif value_type == c_vbk.PropertyType.Boolean: + value = bool(c_vbk.uint32(buf)) + else: + raise VBKError(f"Unsupported property type: {value_type}") + + self[name] = value + + +class MetaBlob: + """A meta blob in the VBK file. + + A meta blob is a list of pages that are linked together. Each page has a header (``MetaBlobHeader``) with + a ``NextPage`` field that points to the next page in the blob. The last page has a ``NextPage`` field of -1. + + References: + - CMetaBlobRW + + Args: + slot: The snapshot slot that the meta blob is part of. + root: The page number of the first page in the meta blob. 
+ """ + + def __init__(self, slot: SnapshotSlot, root: int): + self.slot = slot + self.root = root + + def __repr__(self) -> str: + return f"" + + def _read(self) -> Iterator[int, memoryview]: + page = self.root + + while page != -1: + buf = self.slot.page(page) + yield page, buf + + page = int.from_bytes(buf[:8], "little", signed=True) + + def pages(self) -> Iterator[int]: + return (page for page, _ in self._read()) + + def data(self) -> bytes: + return b"".join(buf for _, buf in self._read()) + + +T = TypeVar("T", bound=MetaItem) + + +class MetaVector(Generic[T]): + """A vector of meta items in the VBK file. + + References: + - CMetaVec + + Args: + vbk: The VBK object that the vector is part of. + type_: The type of the items in the vector. + page: The page number of the first page in the vector. + count: The number of items in the vector. + """ + + def __new__(cls, vbk: VBK, *args, **kwargs): + if vbk.format_version >= 12 and vbk.format_version != 0x10008: + cls = MetaVector2 + return super().__new__(cls) + + def __init__(self, vbk: VBK, type_: type[T], page: int, count: int): + self.vbk = vbk + self.type = type_ + self.page = page + self.count = count + + self._entry_size = len(self.type.__struct__) + self._entries_per_page = PAGE_SIZE // self._entry_size + self._pages = list(self.vbk.get_meta_blob(page).pages()) + + self.get = lru_cache(128)(self.get) + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} type={self.type.__name__} count={self.count}>" + + def __iter__(self) -> Iterator[T]: + return (self.get(i) for i in range(self.count)) + + def data(self, idx: int) -> bytes: + """Read the data for an entry in the vector. + + Args: + idx: The index of the entry to read. 
+ """ + page_id, offset = divmod(idx, self._entries_per_page) + page = self._pages[page_id] + offset = (offset * self._entry_size) + 8 + + buf = self.vbk.page(page) + entry = buf[offset : offset + self._entry_size] + return entry + + def get(self, idx: int) -> T: + """Get an entry from the vector. + + Args: + idx: The index of the entry to get. + """ + if idx >= self.count: + raise IndexError("MetaVector2 index out of range") + return self.type.from_bytes(self.vbk, self.data(idx)) + + +class MetaVector2(MetaVector[T]): + """A vector of meta items in the VBK file. Version 2. + + MetaVector2 is essentially a table of page numbers that contain the vector entries. + The table pages of a MetaVector2 have a 8-32 byte header, so we can hold a maximum of 508-511 entries per page. + Read the comments in _lookup_page for more information. + + References: + - CMetaVec2 + + Args: + vbk: The VBK object that the vector is part of. + type_: The type of the items in the vector. + page: The page number of the first page in the vector. + count: The number of items in the vector. + """ + + _MAX_TABLE_ENTRIES_PER_PAGE = PAGE_SIZE // 8 + _MAX_TABLE_ENTRIES_LOOKUP = ( + _MAX_TABLE_ENTRIES_PER_PAGE - 1, + _MAX_TABLE_ENTRIES_PER_PAGE - 4, + _MAX_TABLE_ENTRIES_PER_PAGE - 1, + ) + + def __init__(self, vbk: VBK, type_: type[T], page: int, count: int): + super().__init__(vbk, type_, page, count) + + # It's not actually a meta blob, but the same mechanism is used (next page pointer in the header) + # The table itself is essentially a big array of 64 bit integers, so cast it to a memoryview of that + self._table = xmemoryview(self.vbk.get_meta_blob(page).data(), " int: + """Look up the page number for an entry in the vector. + + Args: + idx: The page index to lookup the page number for. 
+ """ + + # MetaVec2 pages are a little special + # The first page has a 16 byte header: + # - 8 bytes for the next page number + # - 8 bytes for the root page number + # The second page has a 32 byte header: + # - 8 bytes for the next page number + # - 8 bytes for the second page number (this page) + # - 8 bytes for the third page number + # - 8 bytes for the fourth page number + # The third and fourth pages only have a 8 byte header: + # - 8 bytes for the next page number + # The fifth page has a 32 byte header again containing the next 3 page numbers + # We've not seen a table large enough to see this repeat more than once, but presumably it does + # + # This means that the first page can hold 510 entries, the second 508, and the third and fourth 511 each + # The fifth page can hold 508 entries again, and so on + + if idx < self._MAX_TABLE_ENTRIES_PER_PAGE - 2: + return self._table[idx + 2] # Skip the header + + idx -= self._MAX_TABLE_ENTRIES_PER_PAGE - 2 + table_idx = 1 + while True: + max_entries = self._MAX_TABLE_ENTRIES_LOOKUP[table_idx % 3] + + if idx < max_entries: + table_offset = table_idx * self._MAX_TABLE_ENTRIES_PER_PAGE + return self._table[table_offset + (self._MAX_TABLE_ENTRIES_PER_PAGE - max_entries) + idx] + + idx -= max_entries + table_idx += 1 + + def data(self, idx: int) -> bytes: + """Read the data for an entry in the vector. + + Args: + idx: The index of the entry to read. + """ + page_idx, offset = divmod(idx, self._entries_per_page) + offset *= self._entry_size + + page_no = self._lookup_page(page_idx) + return self.vbk.page(page_no)[offset : offset + self._entry_size] + + +class FibMetaSparseTable: + """A sparse table of FIB (File In Backup) blocks in the VBK file. + + References: + - CFibMetaSparseTable + + Args: + vbk: The VBK object that the sparse table is part of. + page: The page number of the first page in the table. + count: The number of entries in the table. + """ + + # This seems hardcoded? 
    # This seems hardcoded? Probably calculated from something but unknown for now
    MAX_ENTRIES_PER_TABLE = 1088

    def __init__(self, vbk: VBK, page: int, count: int):
        self.vbk = vbk
        self.page = page
        self.count = count

        # Newer versions use a different block descriptor
        self.type = FibBlockDescriptorV7 if self.vbk.is_v7() else FibBlockDescriptor
        # Shared "sparse" descriptor: a table entry with page == -1 means the entire
        # range it covers is a hole, so every lookup in it resolves to this one descriptor
        self._fake_sparse = self.type(
            self.vbk,
            self.type.__struct__(
                BlockSize=self.vbk.block_size,
                Type=c_vbk.BlockLocationType.Sparse,
            ).dumps(),
        )

        # One MetaTableDescriptor per MAX_ENTRIES_PER_TABLE block descriptors, rounded up
        self._table_count = (count + self.MAX_ENTRIES_PER_TABLE - 1) // self.MAX_ENTRIES_PER_TABLE
        self._vec = MetaVector(vbk, MetaTableDescriptor, page, self._table_count)

        # Cache opened sub-tables; sequential reads tend to hit the same table repeatedly
        self._open_table = lru_cache(128)(self._open_table)

    def _open_table(self, page: int, count: int) -> MetaVector[FibBlockDescriptor | FibBlockDescriptorV7]:
        # Open the MetaVector of block descriptors backing a single table entry
        return MetaVector(self.vbk, self.type, page, count)

    def get(self, idx: int) -> FibBlockDescriptor | FibBlockDescriptorV7:
        """Get a block descriptor from the sparse table.

        Args:
            idx: The index of the block descriptor to get.

        Raises:
            IndexError: If ``idx`` is out of range.
        """
        if idx >= self.count:
            raise IndexError("MetaSparseTable index out of range")

        table_idx, entry_idx = divmod(idx, self.MAX_ENTRIES_PER_TABLE)

        table_entry = self._vec.get(table_idx)
        if table_entry.page == -1:
            # Table was never written: the whole range is sparse
            return self._fake_sparse

        return self._open_table(table_entry.page, table_entry.count).get(entry_idx)
+ """ + + def __init__(self, vbk: VBK, page: int, count: int, size: int): + self.vbk = vbk + self.page = page + self.count = count + + self.table = FibMetaSparseTable(vbk, page, count) + + super().__init__(size, align=vbk.block_size) + + def _read(self, offset: int, length: int) -> bytes: + result = [] + # TODO: Can the block size change per file? + block_size = self.vbk.block_size + + while length > 0: + block_idx = offset // block_size + offset_in_block = offset % block_size + + read_size = min(length, block_size - offset_in_block) + + block_desc = self.table.get(block_idx) + + if block_desc.is_normal(): + block = self.vbk.block_store.get(block_desc.block_id) + + self.vbk.fh.seek(block.offset) + buf = self.vbk.fh.read(block.compressed_size) + + if block.is_compressed(): + if block.compression_type == c_vbk.CompressionType.LZ4: + # First 12 bytes are Lz4BlockHeader + buf = lz4.decompress(memoryview(buf)[12:], block.source_size) + else: + raise VBKError(f"Unsupported compression type: {block.compression_type}") + + result.append(buf[offset_in_block : offset_in_block + read_size]) + elif block_desc.is_sparse(): + result.append(b"\x00" * read_size) + else: + raise VBKError(f"Unsupported block type: {block_desc.type}") + + offset += read_size + length -= read_size + + return b"".join(result) diff --git a/dissect/archive/vma.py b/dissect/archive/vma.py new file mode 100644 index 0000000..2e820d3 --- /dev/null +++ b/dissect/archive/vma.py @@ -0,0 +1,269 @@ +# References: +# - https://git.proxmox.com/?p=pve-qemu.git;a=blob;f=vma_spec.txt +# - https://lists.gnu.org/archive/html/qemu-devel/2013-02/msg03667.html + +import hashlib +import struct +from collections import defaultdict +from functools import lru_cache +from uuid import UUID + +from dissect.util import ts +from dissect.util.stream import AlignedStream + +from dissect.archive.c_vma import VMA_EXTENT_MAGIC, VMA_MAGIC, c_vma +from dissect.archive.exceptions import InvalidHeaderError + + +class VMA: + """Proxmox VMA. 
+ + Parse and provide a readable object for devices in a Proxmox VMA backup file. + VMA is designed to be streamed for extraction, so we need to do some funny stuff to create a readable + object from it. Performance is not optimal, so it's generally advised to extract a VMA instead. + The ``vma-extract`` utility can be used for that. + """ + + def __init__(self, fh): + self.fh = fh + + fh.seek(0) + self.header = c_vma.VmaHeader(fh) + if self.header.magic != VMA_MAGIC: + raise InvalidHeaderError("Invalid VMA header magic") + + fh.seek(0) + header_data = bytearray(fh.read(self.header.header_size)) + header_data[32:48] = b"\x00" * 16 + if hashlib.md5(header_data).digest() != self.header.md5sum: + raise InvalidHeaderError("Invalid VMA checksum") + + self.version = self.header.version + self.uuid = UUID(bytes=self.header.uuid) + + blob_start = self.header.blob_buffer_offset + blob_end = self.header.blob_buffer_offset + self.header.blob_buffer_size + self._blob = memoryview(bytes(header_data))[blob_start:blob_end] + + blob_offset = 1 + self._blob_data = {} + while blob_offset + 2 <= self.header.blob_buffer_size: + # The header is in big endian, but this is little... 
+ size = struct.unpack("" + + def open(self): + return DeviceDataStream(self) + + +class Extent: + def __init__(self, fh, offset): + self.fh = fh + self.offset = offset + self.data_offset = offset + c_vma.VMA_EXTENT_HEADER_SIZE + + self.fh.seek(offset) + header_data = bytearray(fh.read(c_vma.VMA_EXTENT_HEADER_SIZE)) + self.header = c_vma.VmaExtentHeader(header_data) + if self.header.magic != VMA_EXTENT_MAGIC: + raise InvalidHeaderError("Invalid VMA extent header magic") + + header_data[24:40] = b"\x00" * 16 + if hashlib.md5(header_data).digest() != self.header.md5sum: + raise InvalidHeaderError("Invalid VMA extent checksum") + + self.uuid = UUID(bytes=self.header.uuid) + self.size = self.header.block_count * c_vma.VMA_BLOCK_SIZE + + # Keep track of the lowest and highest cluster we have for any device + # We can use this to speed up extent lookup later on + # There are at most 59 entries, so safe to parse ahead of use + self._min = {} + self._max = {} + self.blocks = defaultdict(list) + block_offset = self.data_offset + for block_info in self.header.blockinfo: + cluster_num = block_info & 0xFFFFFFFF + dev_id = (block_info >> 32) & 0xFF + mask = block_info >> (32 + 16) + + if dev_id == 0: + continue + + if dev_id not in self._min: + self._min[dev_id] = cluster_num + self._max[dev_id] = cluster_num + elif cluster_num < self._min[dev_id]: + self._min[dev_id] = cluster_num + elif cluster_num > self._max[dev_id]: + self._max[dev_id] = cluster_num + + self.blocks[dev_id].append((cluster_num, mask, block_offset)) + + if mask == 0xFFFF: + block_offset += 16 * c_vma.VMA_BLOCK_SIZE + elif mask == 0: + pass + else: + block_offset += bin(mask).count("1") * c_vma.VMA_BLOCK_SIZE + + def __repr__(self): + return f"" + + +class DeviceDataStream(AlignedStream): + def __init__(self, device): + self.device = device + self.vma = device.vma + super().__init__(size=device.size, align=c_vma.VMA_CLUSTER_SIZE) + + def _read(self, offset, length): + cluster_offset = offset // 
def _iter_clusters(vma, dev_id, cluster, count):
    """Yield ``(cluster_num, mask, block_offset)`` for ``count`` consecutive clusters of a device.

    Extents record clusters in stream order, which is not necessarily cluster order. Entries
    that show up before the walk reaches them are parked in ``temp`` and drained once the
    walk catches up, so callers always receive clusters in strictly increasing order.

    NOTE(review): clusters absent from every extent are silently skipped (never yielded) —
    presumably sparse regions; confirm callers handle the shorter result.
    """
    # Find clusters and starting offsets in all extents
    temp = {}
    end = cluster + count

    for extent in vma.extents():
        if dev_id not in extent.blocks:
            continue

        # Skip extents whose cluster range cannot contain anything we still need
        if end < extent._min[dev_id] or cluster > extent._max[dev_id]:
            continue

        for cluster_num, mask, block_offset in extent.blocks[dev_id]:
            if cluster_num == cluster:
                yield cluster_num, mask, block_offset
                cluster += 1

                # Drain any out-of-order entries collected earlier
                while cluster in temp:
                    yield temp[cluster]
                    del temp[cluster]
                    cluster += 1
            elif cluster < cluster_num <= end:
                # Arrived before its turn: park it until the walk reaches it
                temp[cluster_num] = (cluster_num, mask, block_offset)

            if cluster == end:
                break

        if cluster == end:
            break

    # Flush any remaining parked entries
    while cluster in temp:
        yield temp[cluster]
        del temp[cluster]
        cluster += 1
current_status = status + current_count = 1 + + if current_count: + yield current_status, current_count diff --git a/dissect/archive/xva.py b/dissect/archive/xva.py new file mode 100644 index 0000000..692891c --- /dev/null +++ b/dissect/archive/xva.py @@ -0,0 +1,136 @@ +import hashlib +import tarfile +from bisect import bisect_right +from xml.etree import ElementTree + +from dissect.util.stream import AlignedStream + +BLOCK_SIZE = 1024 * 1024 + + +class XVA: + """XVA reader. + + XenCenter export format. Basically a tar file with "blocks" of 1MB. + """ + + def __init__(self, fh): + # We don't have to cache tar members, tarfile already does that for us + self.tar = tarfile.open(fileobj=fh) + self._ova = None + + @property + def ova(self): + if not self._ova: + ova_member = self.tar.getmember("ova.xml") + ova_fh = self.tar.extractfile(ova_member) + self._ova = ElementTree.fromstring(ova_fh.read()) + return self._ova + + def disks(self): + return [ + el.text + for el in self.ova.findall( + "*//member/name[.='VDI']/../..//name[.='type']/..value[.='Disk']/../..//name[.='VDI']/../value" + ) + ] + + def open(self, ref, verify=False): + size = int( + self.ova.find(f"*//member/name[.='id']/../value[.='{ref}']/../..//name[.='virtual_size']/../value").text + ) + return XVAStream(self, ref, size, verify) + + +class XVAStream(AlignedStream): + """XVA stream. + + XenServer usually just streams an XVA file right into an output file, so our use-case requires a bit + more trickery. We generally don't stream directly into an output file, but try to create a file-like + object for other code to use. + + The numbers for the block files (weirdly) don't represent offsets. It's possible for a block file + to be 0 sized, in which case you should "add" that block to the stream, and continue on to the next. + The next block might have a number + 1 of what your current offset is, but it will still contain the + data for that current offset. For this reason we build a lookup list with offsets. 
+ """ + + def __init__(self, xva, ref, size, verify=False): + self.xva = xva + self.ref = ref + self.verify = verify + + index = 0 + offset = 0 + self._lookup = [] + self._members = [] + for block_index, block_member, checksum_member in _iter_block_files(xva, ref): + if block_index > index + 1: + skipped = block_index - (index + 1) + offset += skipped * BLOCK_SIZE + + if block_member.size != 0: + self._lookup.append(offset) + self._members.append((block_member, checksum_member)) + + offset += block_member.size + + index = block_index + + super().__init__(size, align=BLOCK_SIZE) + + def _read(self, offset, length): + result = [] + + while length > 0: + # This method is probably sub-optimal, but it's fairly low effort and we rarely encounter XVA anyway + block_idx = bisect_right(self._lookup, offset) + nearest_offset = self._lookup[block_idx - 1] + + if offset >= nearest_offset + BLOCK_SIZE: + result.append(b"\x00" * BLOCK_SIZE) + else: + block_member, checksum_member = self._members[block_idx - 1] + buf = self.xva.tar.extractfile(block_member).read() + + if self.verify: + if checksum_member is None: + raise ValueError(f"No checksum for {block_member.name}") + + if ( + checksum_member.name.endswith("checksum") + and hashlib.sha1(buf).hexdigest() != self.xva.tar.extractfile(checksum_member).read().decode() + ): + raise ValueError(f"Invalid checksum for {checksum_member.name}") + else: + raise NotImplementedError(f"Unsupported checksum: {checksum_member.name}") + + result.append(buf) + + offset += BLOCK_SIZE + length -= BLOCK_SIZE + + return b"".join(result) + + +def _iter_block_files(xva, ref): + member_index = None + block_member = None + checksum_member = None + + for member in xva.tar.getmembers(): + if not member.name.startswith(ref): + continue + + index = int(member.name.split("/")[-1].split(".")[0]) + if member_index is None: + member_index = index + + if member_index != index: + yield (member_index, block_member, checksum_member) + member_index = index + + if 
member.name.endswith(("checksum", "xxhash")): + checksum_member = member + else: + block_member = member diff --git a/pyproject.toml b/pyproject.toml index d35e523..0abb486 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,11 +36,20 @@ documentation = "https://docs.dissect.tools/en/latest/projects/dissect.archive" repository = "https://github.com/fox-it/dissect.archive" [project.optional-dependencies] +full = [ + "rich", +] dev = [ + "dissect.archive[full]", "dissect.cstruct>=4.0.dev,<5.0.dev", "dissect.util>=3.0.dev,<4.0.dev", ] +[project.scripts] +vma-extract = "dissect.archive.tools.backup:main" +vbk-extract = "dissect.archive.tools.backup:main" +backup-extract = "dissect.archive.tools.backup:main" + [tool.black] line-length = 120 diff --git a/tests/conftest.py b/tests/conftest.py index 4e94103..2699def 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,3 +22,18 @@ def open_file_gz(name: str, mode: str = "rb") -> Iterator[BinaryIO]: @pytest.fixture def basic_wim() -> Iterator[BinaryIO]: yield from open_file_gz("data/basic.wim.gz") + + +@pytest.fixture +def basic_vma() -> Iterator[BinaryIO]: + yield from open_file_gz("data/test.vma.gz") + + +@pytest.fixture +def vbk9() -> Iterator[BinaryIO]: + yield from open_file_gz("data/test9.vbk.gz") + + +@pytest.fixture +def vbk13() -> Iterator[BinaryIO]: + yield from open_file_gz("data/test13.vbk.gz") diff --git a/tests/data/test.vma.gz b/tests/data/test.vma.gz new file mode 100644 index 0000000..3183050 Binary files /dev/null and b/tests/data/test.vma.gz differ diff --git a/tests/data/test13.vbk.gz b/tests/data/test13.vbk.gz new file mode 100644 index 0000000..876f872 --- /dev/null +++ b/tests/data/test13.vbk.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cba16c3dfaf067e17e6218ed643e94acd2e999e1e7bbdcb9ae6641245023cca2 +size 423179 diff --git a/tests/data/test9.vbk.gz b/tests/data/test9.vbk.gz new file mode 100644 index 0000000..a753d6c --- /dev/null +++ 
@pytest.mark.parametrize(
    "exc, std",
    [
        (exceptions.FileNotFoundError, FileNotFoundError),
        (exceptions.IsADirectoryError, IsADirectoryError),
        (exceptions.NotADirectoryError, NotADirectoryError),
    ],
)
def test_filesystem_error_subclass(exc: exceptions.Error, std: Exception) -> None:
    """Each dissect.archive filesystem exception must be catchable as its builtin counterpart."""
    assert issubclass(exc, std)
    assert isinstance(exc(), std)

    # Raising the package exception must be caught by a handler for the builtin one
    with pytest.raises(std):
        raise exc()
assert entry.is_internal_file() + assert not entry.properties + assert entry.size == 0x400000 + + with entry.open() as fh: + digest = hashlib.sha256(fh.read()).hexdigest() + assert digest == "337350cac29d2ed34c23ce9fc675950badf85fd2b694791abe6999d36f0dc1b3" + + +def test_vbk_version_13(vbk13: BinaryIO) -> None: + vbk = VBK(vbk13) + + assert vbk.format_version == 13 + assert vbk.is_v7() + assert isinstance(vbk.block_store, MetaVector2) + assert list(vbk.get("/").listdir().keys()) == [ + "6745a759-2205-4cd2-b172-8ec8f7e60ef8 (3c834d56-37ac-8bd3-b946-30113c55c4b5)" + ] + + entry = vbk.get("6745a759-2205-4cd2-b172-8ec8f7e60ef8 (3c834d56-37ac-8bd3-b946-30113c55c4b5)") + assert entry.is_dir() + assert not entry.is_file() + assert list(entry.listdir().keys()) == [ + "digest_47d9f323-442b-433d-bd4f-1ecb3fa97351", + "8b14f74c-360d-4d7a-98f7-7f4c5e737eb7", + "GuestMembers.xml", + "BackupComponents.xml", + "summary.xml", + ] + + entry = vbk.get( + "6745a759-2205-4cd2-b172-8ec8f7e60ef8 (3c834d56-37ac-8bd3-b946-30113c55c4b5)/8b14f74c-360d-4d7a-98f7-7f4c5e737eb7" # noqa: E501 + ) + assert not entry.is_dir() + assert entry.is_file() + assert entry.is_internal_file() + assert "DefinedBlocksMask" in entry.properties + assert len(entry.properties["DefinedBlocksMask"]) == 35 + assert entry.size == 0x314200 + + with entry.open() as fh: + digest = hashlib.sha256(fh.read()).hexdigest() + assert digest == "e9ed281cf9c2fe1745e4eb9c926c1a64bd47569c48be511c5fdf6fd5793e5a77" + + +def test_metavector2_lookup() -> None: + """test that the lookup logic in MetaVector2 works as expected""" + + raw_blob = [] + entry = 0 + num_pages = 11 + for i in range(num_pages): + if i == 0: + # root page + header = struct.pack(" None: + vma = VMA(basic_vma) + + assert vma.version == 1 + assert str(vma.uuid) == "04fc12eb-0fed-4322-9aaa-f4e412f68096" + + assert vma.blob_string(1) == "qemu-server.conf" + assert len(vma.blob_data(20)) == 417 + assert vma.blob_string(439) == "drive-scsi0" + + assert 
vma.config("qemu-server.conf") == vma.blob_data(20) + assert len(vma.configs()) == 1 + + assert len(vma.devices()) == 1 + + device = vma.device(1) + assert device.id == 1 + assert device.name == "drive-scsi0" + assert device.size == 10737418240 + + extents = list(vma.extents()) + # The test data is just a small piece of a real VMA file + assert len(extents) == 2 + + assert list(_iter_clusters(vma, device.id, 0, 23)) == [ + (0, 65535, 13312), + (1, 0, 78848), + (2, 0, 78848), + (3, 0, 78848), + (4, 0, 78848), + (5, 0, 78848), + (6, 0, 78848), + (7, 0, 78848), + (8, 0, 78848), + (9, 0, 78848), + (10, 0, 78848), + (11, 0, 78848), + (12, 0, 78848), + (13, 0, 78848), + (14, 0, 78848), + (15, 0, 78848), + (16, 65535, 79360), + (17, 65535, 144896), + (18, 65535, 210432), + (19, 65535, 275968), + (20, 65535, 341504), + (21, 65535, 407040), + (22, 65535, 472576), + ] + + stream = device.open() + buf = stream.read(65536) + assert hashlib.sha256(buf).hexdigest() == "cf4adcf1933a8c9a0a3ff5588e1400e6beea8a32212b3a35ba08c7b08e4e6b1f" + + buf = stream.read(65536 * 15) + assert buf.strip(b"\x00") == b"" + + buf = stream.read(65536 * 7) + assert hashlib.sha256(buf).hexdigest() == "8c989a3aa590795fa919ccb7d1f28651c85f8a0d9ba00ab22cdd9fb760fa7955"