diff --git a/Makefile b/Makefile index 7d0677277fa..4850bb0a8db 100644 --- a/Makefile +++ b/Makefile @@ -173,7 +173,7 @@ DUNE_IU_PACKAGES3=-j $(JOBS) --destdir=$(DESTDIR) --prefix=$(OPTDIR) --libdir=$( install-dune3: dune install $(DUNE_IU_PACKAGES3) -DUNE_IU_PACKAGES4=-j $(JOBS) --destdir=$(DESTDIR) --prefix=$(PREFIX) --libdir=$(LIBDIR) --libexecdir=/usr/libexec --mandir=$(MANDIR) vhd-tool forkexec +DUNE_IU_PACKAGES4=-j $(JOBS) --destdir=$(DESTDIR) --prefix=$(PREFIX) --libdir=$(LIBDIR) --libexecdir=/usr/libexec --mandir=$(MANDIR) vhd-tool qcow-tool forkexec install-dune4: dune install $(DUNE_IU_PACKAGES4) diff --git a/dune-project b/dune-project index a3d6651fc45..b599f93b355 100644 --- a/dune-project +++ b/dune-project @@ -420,6 +420,34 @@ ) ) +(package + (name qcow-tool) + (synopsis "Manipulate .qcow files") + (tags ("org.mirage" "org:xapi-project")) + (depends + asetmap + astring + cmdliner + cstruct + ezjsonm + fmt + io-page + logs + lwt + mirage-block + mirage-block-combinators + mirage-block-unix + mirage-time + mirage-types-lwt + ounit + prometheus + result + sexplib + sha + unix-type-representations + ) +) + (package (name vhd-tool) (synopsis "Manipulate .vhd files") diff --git a/ocaml/qcow-tool/.circleci/config.yml b/ocaml/qcow-tool/.circleci/config.yml new file mode 100644 index 00000000000..b771f94d0e1 --- /dev/null +++ b/ocaml/qcow-tool/.circleci/config.yml @@ -0,0 +1,9 @@ +version: 2 +jobs: + build: + docker: + - image: docker:19.03.11 + steps: + - checkout + - setup_remote_docker + - run: docker build . diff --git a/ocaml/qcow-tool/.dockerignore b/ocaml/qcow-tool/.dockerignore new file mode 100644 index 00000000000..8fa1afb92dd --- /dev/null +++ b/ocaml/qcow-tool/.dockerignore @@ -0,0 +1,3 @@ +.git +_build +Dockerfile diff --git a/ocaml/qcow-tool/.gitignore b/ocaml/qcow-tool/.gitignore new file mode 100644 index 00000000000..f3b3b91f874 --- /dev/null +++ b/ocaml/qcow-tool/.gitignore @@ -0,0 +1,16 @@ +*.annot +*.cmo +*.cma +*.cmi +*.a +*.o +*.cmx +*.cmxs +*.cmxa +_build +*.native +.coverage/ +*.install +lib/qcow_word_size.ml +*.exe +*.merlin diff --git a/ocaml/qcow-tool/.travis.yml b/ocaml/qcow-tool/.travis.yml new file mode 100644 index 00000000000..fa6a6f2ae1e --- /dev/null +++ b/ocaml/qcow-tool/.travis.yml @@ -0,0 +1,12 @@ +language: c +sudo: false +services: + - docker +install: wget https://raw.githubusercontent.com/ocaml/ocaml-travisci-skeleton/master/.travis-docker.sh +script: bash ./.travis-docker.sh +env: + global: + - PACKAGE="qcow-tool" + - PINS="qcow:. mirage-block-ramdisk:https://github.com/mirage/mirage-block-ramdisk.git" + matrix: + - DISTRO=alpine OCAML_VERSION=4.09 diff --git a/ocaml/qcow-tool/CHANGES.md b/ocaml/qcow-tool/CHANGES.md new file mode 100644 index 00000000000..1698d4a3806 --- /dev/null +++ b/ocaml/qcow-tool/CHANGES.md @@ -0,0 +1,158 @@ +## 0.11.0 (2020-06-05) +- Update the build to use `dune` (@emillon, #112) +- Update to Mirage 4.0 interfaces (@djs55, #112) +- LICENSE.md: add title and copyright year range (@waldyrious, #109) + +## 0.10.5 (2017-12-14): +- CLI: use the disk locking feature in mirage-block-unix >= 0.9.0 + +## 0.10.4 (2017-12-07): +- fix build on OCaml 4.06 (and -safe-string) +- update to new sha.1.10 signature +- document the prometheus support + +## 0.10.3 (2017-08-02): +- avoid linking ppx tools into the library + +## 0.10.2 (2017-06-18): +- remove false dependency on cmdliner + +## 0.10.1 (2017-06-17): +- update to new io-page/ io-page-unix +- fix prometheus accounting error + +## 0.10.0 (2017-05-13) +- fix a major performance problem with `compact` +- split into 2 packages: qcow and qcow-tool +- add `qcow-tool dehydrate` and `qcow-tool rehydrate` for extracting + metadata for debug/support +- add prometheus metrics for I/O and GC operations +- restore the `qcow-tool compact --progress` progress bar +- add `qcow-tool compact --progress-fd` for json-formatted progress +- build via jbuilder + +## 0.9.5 (2017-03-12) +- CLI: `check` and `sha` will nolonger resize the file as a side-effect + (#84) +- Allow the number of `cluster_bits` to be set in `create` + +## 0.9.4 (2017-03-07) +- Strictly enforce the cluster move state machine +- Don't start moving new blocks while existing moves are in progress + (fix bug where the same destination block could be reused) +- Hold a lock to exclude `flush` while updating references to ensure + reference updates hit the disk before the move is considered complete +- Simplify allocator by always adding blocks to the Roots set before + returning. The caller must transfer them somewhere else. +- Simplify the cluster moving API by combining `get_moves` with + `start_moves`, so it's not possible to block and affect the moves + which can legally be started +- When detecting a duplicate reference or hitting an I/O error, log + analysis of the internal state +- Check for move cancellation before copying a block to avoid accidentally + copying a block which is now outside the file +- Avoid adding a cluster to the Junk set twice during a reference update +- Add lots of assertions + +## 0.9.3 (2017-03-02) +- Hold a read lock on the L1 during read/write +- Minimise locking while updating references +- When moving an L2 cluster, update the cluster map + +## 0.9.2 (2017-02-26) +- Don't hold the global lock while updating references +- Log an error if a client I/O takes more than 30s +- Improve the performance of discard by writing each L2 cluster to disk + only once +- Track clusters which are being erased and copied into, to prevent the + file being shrunk, orphaning them (which typically manifests as a later + double-allocation) + +## 0.9.1 (2017-02-25) +- Add configuration `runtime_assert` to check GC invariants at runtime +- Use tail-recursive calls in the block recycler (which deals with large + block lists) +- Wait for the compaction work list to stabilise before processing it + (otherwise we move blocks which are then immediately discarded) +- Track the difference between blocks on the end of the file being full + of zeroes due to ftruncate versus being full of junk due to discard +- On open, truncate the file to erase trailing junk +- Don't try to use free space between header structures for user data + since we assume all blocks after the start of free space are movable + and header blocks aren't (in this implementation) +- Make cluster locks recursive, hold relevant metadata read locks while + reading or writing data clusters to ensure they aren't moved while + we're using them. +- Add a debug testing mode and use it in a test case to verify that + compact mid-write works as expected. + +## 0.9.0 (2017-02-21) +- Add online coalescing mode and background cluster recycling thread +- Rename internal modules and types +- Ensure the interval tree remains balanced to improve performance + +## 0.8.1 (2017-02-13) +- fix error in META file + +## 0.8.0 (2017-02-13) +- update to Mirage 3 APIs +- now requires OCaml 4.03+ +- ensure the interval tree is kept balanced + +## 0.7.2 (2016-12-21) +- if `discard` is not enabled, fail `discard` calls +- if `discard` is enabled, enable lazy-refcounts and zero refcount clusters + to avoid breaking refcounts over `discard`, `compact` + +## 0.7.1 (2016-12-15) +- speed up `check` and `compact` up to 50x +- `qcow-tool compact` work around files which aren't a whole number of + sectors + +## 0.7.0 (2016-12-10) +- now functorised over `TIME` +- allow background compact to be cancelled +- cancel background compact to allow regular I/O to go through +- don't trigger the background compact until 1s after the last + `discard` +- on `connect`, sanity-check the image + +## 0.6.0 (2016-12-04) +- rename ocamlfind package from `qcow-format` to `qcow` for uniformity +- add support for runtime configuration arguments to `connect` and `create` +- add support for `discard` (aka TRIM or UNMAP) and online compaction + (through a stop-the-world GC) +- switch the build from `oasis` to `topkg` (thanks to @jgimenez) + +## 0.5.0 (2016-11-26) +- `resize` now takes a new size in bytes (rather than sectors) and uses a + labelled argument +- `qcow-tool info` now takes a `--filter ` for example + `qcow-tool info ... --filter .size` to view the virtual size + +## 0.4.2 (2016-09-21) +- Don't break the build if `Block.connect` has optional arguments + +## 0.4.1 (2016-08-17) +- Remove one necessary source of `flush` calls +- CLI: add `mapped` command to list the mapped regions of a file + +## 0.4 (2016-08-03) +- For buffered block devices, call `flush` to guarantee metadata correctness +- In lazy_refcounts mode (the default), do not compute any refcounts +- CLI: the `repair` command should recompute refcounts + +## 0.3 (2016-05-12) +- Depend on ppx, require OCaml 4.02+ + +## 0.2 (2016-01-15) +- Use qcow version 3 by default, setting `lazy_refcount=on` +- Unit tests now verify that `qemu-img check` is happy and that `qemu-nbd` + sees the same data we wrote + +## 0.1 (2015-11-09) +- initial `V1_LWT.BLOCK` support +- caches metadata for performance +- CLI tool for manipulating images +- supports the `seek_mapped` `seek_unmapped` interface for iterating over + sparse regions diff --git a/ocaml/qcow-tool/Dockerfile b/ocaml/qcow-tool/Dockerfile new file mode 100644 index 00000000000..401ac90a157 --- /dev/null +++ b/ocaml/qcow-tool/Dockerfile @@ -0,0 +1,16 @@ +FROM alpine:latest AS build + +RUN apk add opam alpine-sdk + +RUN opam init -y --disable-sandboxing --comp=4.10.0 +RUN opam install depext -y +COPY . /src +RUN opam pin add qcow.dev /src -n +RUN opam depext -i qcow -y +RUN opam pin add qcow-tool.dev /src -n +RUN opam depext -i qcow-tool -y + +FROM alpine:latest +COPY --from=build /root/.opam/4.10.0/bin/qcow-tool /qcow-tool +ENTRYPOINT ["/qcow-tool"] + diff --git a/ocaml/qcow-tool/LICENSE.md b/ocaml/qcow-tool/LICENSE.md new file mode 100644 index 00000000000..e41936eb0c9 --- /dev/null +++ b/ocaml/qcow-tool/LICENSE.md @@ -0,0 +1,18 @@ +(* + * ISC License + * + * Copyright (c) 2015-2018 + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) diff --git a/ocaml/qcow-tool/Makefile b/ocaml/qcow-tool/Makefile new file mode 100644 index 00000000000..e1358dfae6e --- /dev/null +++ b/ocaml/qcow-tool/Makefile @@ -0,0 +1,14 @@ + +.PHONY: build clean + +build: + dune build @install + +install: + dune install + +uninstall: + dune uninstall + +clean: + dune clean diff --git a/ocaml/qcow-tool/README.md b/ocaml/qcow-tool/README.md new file mode 100644 index 00000000000..7830befd9a6 --- /dev/null +++ b/ocaml/qcow-tool/README.md @@ -0,0 +1,94 @@ +Ocaml support for Qcow2 images +============================== + +[![Build Status](https://travis-ci.org/mirage/ocaml-qcow.png?branch=master)](https://travis-ci.org/mirage/ocaml-qcow) [![Coverage Status](https://coveralls.io/repos/mirage/ocaml-qcow/badge.png?branch=master)](https://coveralls.io/r/mirage/ocaml-qcow?branch=master) + +Please read [the API documentation](https://mirage.github.io/ocaml-qcow/). + +Features +-------- + +- supports `resize` +- exposes sparseness information +- produces files which can be understood by qemu (although not in + reverse since we don't support many features) + +Example +------- + +In a top-level like utop: +```ocaml +# #require "io-page.unix";; +# #require "mirage-block";; +# #require "mirage-block-ramdisk";; +# #require "qcow";; +# #require "lwt.syntax";; + +# lwt t_or_error = Ramdisk.create ~name:"hello" ~size_sectors:1024L ~sector_size:512;; +val t_or_error : [ `Error of Ramdisk.error | `Ok of Ramdisk.t ] = `Ok + +# let t = Mirage_block.Error.ok_exn t_or_error;; +val t : Ramdisk.t = + +# module Qcow_on_ramdisk = Qcow.Make(Ramdisk);; +module Qcow_on_ramdisk : sig type page_aligned_buffer = Ramdisk.page_aligned_buffer + type error = + [ `Disconnected | `Is_read_only | `Unimplemented | `Unknown of bytes ] + type 'a io = 'a Ramdisk.io + type t = Qcow.Make(Ramdisk).t + type id = Qcow.Make(Ramdisk).id + val disconnect : t -> unit io + type info = + Qcow.Make(Ramdisk).info = { + read_write : bool; + sector_size : int; + size_sectors : int64; + } + val get_info : t -> info io + val read : + t -> + int64 -> page_aligned_buffer list -> [ `Error of error | `Ok of unit ] io + val write : + t -> + int64 -> page_aligned_buffer list -> [ `Error of error | `Ok of unit ] io + val create : Ramdisk.t -> int64 -> [ `Error of error | `Ok of t ] io + val connect : Ramdisk.t -> t io + val resize : t -> int64 -> [ `Error of error | `Ok of unit ] io + val seek_unmapped : t -> int64 -> [ `Error of error | `Ok of int64 ] io + val seek_mapped : t -> int64 -> [ `Error of error | `Ok of int64 ] io + val rebuild_refcount_table : t -> [ `Error of error | `Ok of unit ] io + val header : t -> Qcow.Header.t + module Debug : + sig + type t = Qcow.Make(Ramdisk).t + type error = error + val check_no_overlaps : t -> [ `Error of error | `Ok of unit ] io + val set_next_cluster : t -> int64 -> unit + end + end + +# lwt t_or_error = Qcow_on_ramdisk.create t 1048576L;; +val t_or_error : [ `Error of Qcow_on_ramdisk.error | `Ok of Qcow_on_ramdisk.t ] + = `Ok + +# let t = Mirage_block.Error.ok_exn t_or_error;; +val t : Qcow_on_ramdisk.t = + +# let page = Io_page.(to_cstruct (get 1));; +val page : Ramdisk.page_aligned_buffer = + {Cstruct.buffer = ; off = 0; len = 4096} + +# lwt result_or_error = Qcow_on_ramdisk.read t 0L [ page ];; +val result_or_error : [ `Error of Ramdisk.error | `Ok of unit ] = `Ok () + +# lwt ok_or_error = Mirage_block.sparse_copy (module Ramdisk) t (module Ramdisk) t;; +val ok_or_error : + [ `Error of [> `Different_sizes | `Is_read_only | `Msg of bytes ] + | `Ok of unit ] = `Ok () +``` + +Limitations +----------- + +- cluster size is fixed at 64-bits +- no support for snapshots diff --git a/ocaml/qcow-tool/cli/common.ml b/ocaml/qcow-tool/cli/common.ml new file mode 100644 index 00000000000..7870d5559c3 --- /dev/null +++ b/ocaml/qcow-tool/cli/common.ml @@ -0,0 +1,20 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +type t = {debug: bool; progress: bool; progress_fd: int option} + +let make debug progress progress_fd = {debug; progress; progress_fd} diff --git a/ocaml/qcow-tool/cli/dune b/ocaml/qcow-tool/cli/dune new file mode 100644 index 00000000000..ecb8844d68e --- /dev/null +++ b/ocaml/qcow-tool/cli/dune @@ -0,0 +1,31 @@ +(executable + (name main) + (libraries + astring + cmdliner + cstruct + cstruct-lwt + fmt + io-page logs + logs.fmt + lwt + lwt.unix + mirage-block + mirage-block-unix + mirage-block-combinators + mirage-time + qcow + sexplib + sha + unix-type-representations + ) + (preprocess + (pps ppx_sexp_conv) + ) +) + +(install + (package qcow-tool) + (section bin) + (files (main.exe as qcow-tool)) +) diff --git a/ocaml/qcow-tool/cli/impl.ml b/ocaml/qcow-tool/cli/impl.ml new file mode 100644 index 00000000000..226a9f4b90a --- /dev/null +++ b/ocaml/qcow-tool/cli/impl.ml @@ -0,0 +1,850 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Result +open Sexplib.Std +open Qcow + +let expect_ok = function Ok x -> x | Error (`Msg m) -> failwith m + +let ( >>*= ) m f = + let open Lwt in + m >>= function Error x -> Lwt.return (Error x) | Ok x -> f x + +let src = + let src = Logs.Src.create "qcow" ~doc:"qcow2-formatted BLOCK device" in + Logs.Src.set_level src (Some Logs.Info) ; + src + +module Log = (val Logs.src_log src : Logs.LOG) + +module ReadWriteBlock = struct + include Block + + let original_connect = connect + + let connect path = connect ~lock:true path +end + +module Time = struct + type 'a io = 'a Lwt.t + + let sleep_ns ns = Lwt_unix.sleep (Int64.to_float ns /. 1e9) +end + +module TracedBlock = struct + include ReadWriteBlock + + let length_of bufs = List.fold_left ( + ) 0 (List.map Cstruct.length bufs) + + let read t sector bufs = + Log.info (fun f -> f "BLOCK.read %Ld len = %d" sector (length_of bufs)) ; + read t sector bufs + + let write t sector bufs = + Log.info (fun f -> f "BLOCK.write %Ld len = %d" sector (length_of bufs)) ; + write t sector bufs + + let flush t = + Log.info (fun f -> f "BLOCK.flush") ; + flush t + + let resize t new_size = + Log.info (fun f -> f "BLOCK.resize %Ld" new_size) ; + resize t new_size +end + +module type BLOCK = sig + include Qcow_s.RESIZABLE_BLOCK + + val connect : string -> t Lwt.t +end + +module UnsafeBlock = struct + include ReadWriteBlock + + let flush _ = Lwt.return (Ok ()) +end + +module ReadOnlyBlock = struct + include UnsafeBlock + + let connect path = original_connect ~lock:false path + + let write _ _ _ = failwith "write to a read-only virtual device" + + let resize _ _ = failwith "attempt to resize a read-only virtual device" +end + +let handle_common common_options_t = + if common_options_t.Common.debug then + List.iter + (fun src -> + if Logs.Src.name src = "qcow" then + Logs.Src.set_level src (Some Logs.Debug) + ) + (Logs.Src.list ()) + +let spinner = [|'-'; '\\'; '|'; '/'|] + +let spinner_idx = ref 0 + +let progress_bar_width = 70 + +let last_percent = ref (-1) + +let last_spinner_time = ref (Unix.gettimeofday ()) + +let progress_cb ~percent = + let now = Unix.gettimeofday () in + if now -. !last_spinner_time > 0.5 || !last_percent <> percent then ( + last_percent := percent ; + last_spinner_time := now ; + let line = Bytes.make (progress_bar_width + 8) '\000' in + + let len = progress_bar_width * percent / 100 in + for i = 0 to len - 1 do + Bytes.set line (4 + i) (if i = len - 1 then '>' else '#') + done ; + Bytes.set line 0 '[' ; + Bytes.set line 1 spinner.(!spinner_idx) ; + Bytes.set line 2 ']' ; + Bytes.set line 3 ' ' ; + spinner_idx := (!spinner_idx + 1) mod Array.length spinner ; + let percent' = Printf.sprintf "%3d%%" percent in + String.blit percent' 0 line (progress_bar_width + 4) 4 ; + Printf.printf "\r%s%!" (Bytes.to_string line) ; + if percent = 100 then Printf.printf "\n" + ) + +let progress_cb_json fd = + let oc = Unix.out_channel_of_descr fd in + let last_percent = ref (-1) in + fun ~percent -> + if !last_percent <> percent then ( + last_percent := percent ; + output_string oc (Printf.sprintf "{ \"progress\": %d }\n" percent) ; + flush_all () + ) + +let mib = Int64.mul 1024L 1024L + +let info filename filter = + let t = + let open Lwt in + Lwt_unix.openfile filename [Lwt_unix.O_RDONLY] 0 >>= fun fd -> + let buffer = Cstruct.create 1024 in + Lwt_cstruct.complete (Lwt_cstruct.read fd) buffer >>= fun () -> + let h, _ = expect_ok (Header.read buffer) in + let original_sexp = Header.sexp_of_t h in + let sexp = + match filter with + | None -> + original_sexp + | Some str -> + Sexplib.Path.get ~str original_sexp + in + Printf.printf "%s\n" (Sexplib.Sexp.to_string_hum sexp) ; + return (`Ok ()) + in + Lwt_main.run t + +let write filename sector data trace = + let block = + if trace then + (module TracedBlock : BLOCK) + else + (module ReadWriteBlock : BLOCK) + in + let module BLOCK = (val block : BLOCK) in + let module B = Qcow.Make (BLOCK) (Time) in + let t = + let open Lwt in + BLOCK.connect filename >>= fun x -> + B.connect x >>= fun x -> + let npages = (String.length data + 4095) / 4096 in + let buf = Io_page.(to_cstruct (get npages)) in + Cstruct.memset buf 0 ; + Cstruct.blit_from_string data 0 buf 0 (String.length data) ; + B.write x sector [buf] >>= function + | Error _ -> + failwith "write failed" + | Ok () -> + return (`Ok ()) + in + Lwt_main.run t + +let read filename sector length trace = + let block = + if trace then + (module TracedBlock : BLOCK) + else + (module ReadWriteBlock : BLOCK) + in + let module BLOCK = (val block : BLOCK) in + let module B = Qcow.Make (BLOCK) (Time) in + let t = + let open Lwt in + BLOCK.connect filename >>= fun x -> + B.connect x >>= fun x -> + let length = Int64.to_int length * 512 in + let npages = (length + 4095) / 4096 in + let buf = Io_page.(to_cstruct (get npages)) in + B.read x sector [buf] >>= function + | Error _ -> + failwith "write failed" + | Ok () -> + let result = Cstruct.sub buf 0 length in + Printf.printf "%s%!" (Cstruct.to_string result) ; + return (`Ok ()) + in + Lwt_main.run t + +let check filename = + let module B = Qcow.Make (ReadOnlyBlock) (Time) in + let open Lwt in + let t = + let rec retry = function + | 0 -> + Printf.fprintf stderr "Warning: file is being concurrently modified\n" ; + Printf.fprintf stderr + "We found no concrete problems, but the file is being modified too\n" ; + Printf.fprintf stderr "quickly for us to read a consistent view.\n" ; + return (`Ok ()) + | n -> ( + ReadOnlyBlock.connect filename >>= fun block -> + B.check block >>= function + | Error (`Reference_outside_file (src, dst)) -> + ReadOnlyBlock.disconnect block >>= fun () -> + ReadOnlyBlock.connect filename >>= fun block -> + ReadOnlyBlock.get_info block >>= fun info -> + let size = + Int64.( + mul info.Mirage_block.size_sectors + (of_int info.Mirage_block.sector_size) + ) + in + if dst > size then ( + Printf.fprintf stderr + "Error: detected a reference outside the file, from %Ld to \ + %Ld while the file size is %Ld\n\ + %!" + src dst size ; + exit 1 + ) else (* The file has grown, try again *) + ReadOnlyBlock.disconnect block >>= fun () -> retry (n - 1) + | Error _ -> + failwith + (Printf.sprintf "Qcow consistency check failed on %s" filename) + | Ok x -> + Printf.printf "Qcow file seems intact.\n" ; + Printf.printf "Total free blocks: %Ld\n" x.B.free ; + Printf.printf "Total used blocks: %Ld\n" x.B.used ; + return (`Ok ()) + ) + in + retry 5 + in + Lwt_main.run t + +exception Non_zero + +(* slow but performance is not a concern *) +let is_zero buffer = + try + for i = 0 to Cstruct.length buffer - 1 do + if Cstruct.get_uint8 buffer i <> 0 then raise Non_zero + done ; + true + with Non_zero -> false + +let handle_error pp_error = function + | Error e -> + let msg = Format.asprintf "%a" pp_error e in + Lwt.return (`Error (false, msg)) + | Ok x -> + Lwt.return (`Ok x) + +let discard unsafe_buffering filename = + let block = + if unsafe_buffering then + (module UnsafeBlock : BLOCK) + else + (module ReadWriteBlock : BLOCK) + in + let module BLOCK = (val block : BLOCK) in + let module B = Qcow.Make (BLOCK) (Time) in + let open Lwt in + let t = + BLOCK.connect filename >>= fun x -> + BLOCK.get_info x >>= fun info -> + B.connect x >>= fun x -> + let module F = Mirage_block_combinators.Fast_fold (B) in + F.mapped_s + ~f:(fun acc sector buffer -> + if is_zero buffer then ( + let len = Cstruct.length buffer in + assert (len mod info.Mirage_block.sector_size = 0) ; + let n = Int64.of_int @@ (len / info.Mirage_block.sector_size) in + if Int64.add sector n = info.Mirage_block.size_sectors then + (* The last block in the file: this is our last chance to discard *) + let sector = match acc with None -> sector | Some x -> x in + let n = Int64.sub info.Mirage_block.size_sectors sector in + let open Lwt.Infix in + B.discard x ~sector ~n () >>= function + | Error _ -> + Lwt.fail_with "error discarding block" + | Ok () -> + Lwt.return None + else (* start/extend the current zero region *) + let acc = match acc with None -> Some sector | Some x -> Some x in + Lwt.return acc + ) else + match acc with + | Some start -> ( + (* we accumulated zeros: discard them now *) + let n = Int64.sub sector start in + let open Lwt.Infix in + B.discard x ~sector:start ~n () >>= function + | Error _ -> + Lwt.fail_with "error discarding block" + | Ok () -> + Lwt.return None + ) + | None -> + Lwt.return None + ) + None x + >>*= fun _ -> return (Ok ()) + in + Lwt_main.run (t >>= handle_error B.pp_error) + +let compact common_options_t unsafe_buffering filename = + handle_common common_options_t ; + let block = + if unsafe_buffering then + (module UnsafeBlock : BLOCK) + else + (module ReadWriteBlock : BLOCK) + in + let module BLOCK = (val block : BLOCK) in + let module B = Qcow.Make (BLOCK) (Time) in + let open Lwt in + let progress_cb = + match + (common_options_t.Common.progress, common_options_t.Common.progress_fd) + with + | true, None -> + Some progress_cb + | _, Some fd -> + Some (progress_cb_json (Unix_representations.file_descr_of_int fd)) + | _, _ -> + None + in + + (* workaround for https://github.com/mirage/mirage-block-unix/issues/59 *) + Lwt_main.run + (let open Lwt.Infix in + Lwt_unix.LargeFile.stat filename >>= fun stat -> + let bytes = stat.Lwt_unix.LargeFile.st_size in + let remainder = Int64.rem bytes 512L in + let padding_required = + if remainder = 0L then 0L else Int64.sub 512L remainder + in + Lwt_unix.openfile filename [Lwt_unix.O_WRONLY; Lwt_unix.O_APPEND] 0o0 + >>= fun fd -> + let buf = Cstruct.create (Int64.to_int padding_required) in + Cstruct.memset buf 0 ; + Lwt_cstruct.complete (Lwt_cstruct.write fd) buf >>= fun () -> + Lwt_unix.close fd + ) ; + let t = + BLOCK.connect filename >>= fun x -> + B.connect x >>= fun x -> + B.get_info x >>= fun info -> + B.compact x ?progress_cb () >>*= fun report -> + ( if report.B.old_size = report.B.new_size then + Printf.printf + "I couldn't make the file any smaller. Consider running `discard`.\n" + else + let smaller_sectors = Int64.sub report.B.old_size report.B.new_size in + let sector_size = Int64.of_int info.Mirage_block.sector_size in + let smaller_mib = Int64.(div (mul smaller_sectors sector_size) mib) in + Printf.printf "The file is now %Ld MiB smaller.\n" smaller_mib + ) ; + B.Debug.check_no_overlaps x + in + Lwt_main.run (t >>= handle_error B.pp_write_error) + +let repair unsafe_buffering filename = + let block = + if unsafe_buffering then + (module UnsafeBlock : BLOCK) + else + (module ReadWriteBlock : BLOCK) + in + let module BLOCK = (val block : BLOCK) in + let module B = Qcow.Make (BLOCK) (Time) in + let open Lwt in + let t = + BLOCK.connect filename >>= fun x -> + B.connect x >>= fun x -> + B.rebuild_refcount_table x >>*= fun () -> B.Debug.check_no_overlaps x + in + Lwt_main.run (t >>= handle_error B.pp_write_error) + +let sha _common_options_t filename = + let module B = Qcow.Make (ReadOnlyBlock) (Time) in + let open Lwt in + let t = + Block.connect filename >>= fun x -> + let config = B.Config.create ~read_only:true () in + B.connect ~config x >>= fun x -> + B.get_info x >>= fun info -> + let ctx = Sha1.init () in + let update_cstruct c = + let b' = c.Cstruct.buffer in + if c.Cstruct.off = 0 && c.Cstruct.len = Bigarray.Array1.dim b' then + Sha1.update_buffer ctx b' + else + let c' = Cstruct.create (Cstruct.length c) in + Cstruct.blit c 0 c' 0 (Cstruct.length c) ; + let b' = c'.Cstruct.buffer in + Sha1.update_buffer ctx b' + in + let buf = Io_page.(to_cstruct @@ get 1024) in + let buf_sectors = + Int64.of_int (Cstruct.length buf / info.Mirage_block.sector_size) + in + let rec loop sector = + let remaining = Int64.sub info.Mirage_block.size_sectors sector in + if remaining = 0L then + Lwt.return_unit + else + let n = min buf_sectors remaining in + let buf = + Cstruct.sub buf 0 (Int64.to_int n * info.Mirage_block.sector_size) + in + B.read x sector [buf] >>= function + | Error _ -> + Lwt.fail_with (Printf.sprintf "Failed to read sector %Ld" sector) + | Ok () -> + update_cstruct buf ; + loop (Int64.add sector n) + in + loop 0L >>= fun () -> + let digest = Sha1.finalize ctx in + Printf.printf "%s\n" (Sha1.to_hex digest) ; + return (`Ok ()) + in + Lwt_main.run t + +let decode filename output = + let module B = Qcow.Make (Block) (Time) in + let open Lwt in + let t = + Block.connect filename >>= fun x -> + B.connect x >>= fun x -> + B.get_info x >>= fun info -> + let total_size = + Int64.( + mul info.Mirage_block.size_sectors (of_int info.Mirage_block.sector_size) + ) + in + Lwt_unix.openfile output [Lwt_unix.O_WRONLY; Lwt_unix.O_CREAT] 0o0644 + >>= fun fd -> + Lwt_unix.LargeFile.ftruncate fd total_size >>= fun () -> + Lwt_unix.close fd >>= fun () -> + Block.connect output >>= fun y -> + let module C = Mirage_block_combinators.Copy (B) (Block) in + C.v ~src:x ~dst:y >>= function + | Error _ -> + failwith "copy failed" + | Ok () -> + return (`Ok ()) + in + Lwt_main.run t + +let encode filename output = + let module B = Qcow.Make (ReadWriteBlock) (Time) in + let open Lwt in + let t = + ReadWriteBlock.connect filename >>= fun raw_input -> + ReadWriteBlock.get_info raw_input >>= fun raw_input_info -> + let total_size = + Int64.( + mul raw_input_info.Mirage_block.size_sectors + (of_int raw_input_info.Mirage_block.sector_size) + ) + in + Lwt_unix.openfile output [Lwt_unix.O_WRONLY; Lwt_unix.O_CREAT] 0o0644 + >>= fun fd -> + Lwt_unix.close fd >>= fun () -> + ReadWriteBlock.connect output >>= fun raw_output -> + B.create raw_output ~size:total_size () >>= function + | Error _ -> + failwith + (Printf.sprintf "Failed to create qcow formatted data on %s" output) + | Ok qcow_output -> ( + let module C = Mirage_block_combinators.Copy (Block) (B) in + C.v ~src:raw_input ~dst:qcow_output >>= function + | Error _ -> + failwith "copy failed" + | Ok () -> + return (`Ok ()) + ) + in + Lwt_main.run t + +let create size strict_refcounts trace filename = + let block = + if trace then + (module TracedBlock : BLOCK) + else + (module ReadWriteBlock : BLOCK) + in + let module BLOCK = (val block : BLOCK) in + let module B = Qcow.Make (BLOCK) (Time) in + let open Lwt in + let t = + Lwt_unix.openfile filename [Lwt_unix.O_CREAT] 0o0644 >>= fun fd -> + Lwt_unix.close fd >>= fun () -> + BLOCK.connect filename >>= fun x -> + B.create x ~size ~lazy_refcounts:(not strict_refcounts) () >>= function + | Error _ -> + failwith + (Printf.sprintf "Failed to create qcow formatted data on %s" filename) + | Ok _ -> + return (`Ok ()) + in + Lwt_main.run t + +let pattern common_options_t trace filename size number = + let block = + if trace then + (module TracedBlock : BLOCK) + else + (module ReadWriteBlock : BLOCK) + in + let module Uncached = (val block : BLOCK) in + let module BLOCK = Qcow_block_cache.Make (Uncached) in + let module B = Qcow.Make (BLOCK) (Time) in + let open Lwt in + let progress_cb = + if common_options_t.Common.progress then Some progress_cb else None + in + let t = + Lwt_unix.openfile filename [Lwt_unix.O_CREAT] 0o0644 >>= fun fd -> + Lwt_unix.close fd >>= fun () -> + Uncached.connect filename >>= fun uncached -> + BLOCK.connect uncached >>= fun x -> + let config = B.Config.create ~discard:true () in + B.create x ~size ~lazy_refcounts:true ~config () >>= function + | Error _ -> + failwith + (Printf.sprintf "Failed to create qcow formatted data on %s" filename) + | Ok qcow -> ( + let h = B.header qcow in + B.get_info qcow >>= fun info -> + let sector_size = info.Mirage_block.sector_size in + let cluster_bits = Int32.to_int h.Qcow.Header.cluster_bits in + let cluster_size = 1 lsl cluster_bits in + let cluster_size_sectors = cluster_size / sector_size in + match number with + | 1 -> + (* write to every other cluster: this should be worst case for the + interval tree structure. *) + let page = Io_page.(to_cstruct @@ get 1) in + let buf = Cstruct.sub page 0 sector_size in + let rec loop sector = + if sector >= info.Mirage_block.size_sectors then + Lwt.return_unit + else + let percent = + Int64.( + to_int (div (mul 100L sector) info.Mirage_block.size_sectors) + ) + in + (match progress_cb with Some f -> f ~percent | None -> ()) ; + (* Mark each sector with the sector number *) + Cstruct.BE.set_uint64 buf 0 sector ; + B.write qcow sector [buf] >>= function + | Error _ -> + Lwt.fail_with "qcow write error" + | Ok () -> + (* every other cluster *) + loop + Int64.(add sector (mul 2L (of_int cluster_size_sectors))) + in + loop 0L >>= fun () -> + BLOCK.disconnect x >>= fun () -> Lwt.return (`Ok ()) + | 2 -> + (* write to every cluster, and then discard every other cluster: this + should be worst case for the compactor *) + let pages = Io_page.(to_cstruct @@ get 1024) in + (* 4 MiB *) + Cstruct.memset pages 0 ; + let sectors = Cstruct.length pages / sector_size in + let rec loop sector = + if sector >= info.Mirage_block.size_sectors then + Lwt.return_unit + else + let percent = + Int64.( + to_int (div (mul 50L sector) info.Mirage_block.size_sectors) + ) + in + (match progress_cb with Some f -> f ~percent | None -> ()) ; + let to_write = + min sectors + Int64.(to_int (sub info.Mirage_block.size_sectors sector)) + in + let buf = Cstruct.sub pages 0 (to_write * sector_size) in + let rec watermark n = + if n >= to_write then + () + else ( + Cstruct.BE.set_uint64 buf (n * sector_size) + Int64.(add sector (of_int n)) ; + watermark (n + 1) + ) + in + watermark to_write ; + B.write qcow sector [buf] >>= function + | Error _ -> + Lwt.fail_with "qcow write error" + | Ok () -> + loop Int64.(add sector (of_int to_write)) + in + loop 0L >>= fun () -> + let rec loop sector = + if sector >= info.Mirage_block.size_sectors then + Lwt.return_unit + else + let percent = + 50 + + Int64.( + to_int + (div (mul 50L sector) info.Mirage_block.size_sectors) + ) + in + (match progress_cb with Some f -> f ~percent | None -> ()) ; + B.discard qcow ~sector ~n:(Int64.of_int cluster_size_sectors) () + >>= function + | Error _ -> + Lwt.fail_with "qcow discard error" + | Ok () -> + (* every other cluster *) + loop + Int64.(add sector (mul 2L (of_int cluster_size_sectors))) + in + loop 0L >>= fun () -> + BLOCK.disconnect x >>= fun () -> Lwt.return (`Ok ()) + | _ -> + failwith (Printf.sprintf "Unknown pattern %d" number) + ) + in + Lwt_main.run t + +let resize trace filename new_size ignore_data_loss = + let block = + if trace then + (module TracedBlock : BLOCK) + else + (module ReadWriteBlock : BLOCK) + in + let module BLOCK = (val block : BLOCK) in + let module B = Qcow.Make (BLOCK) (Time) in + let open Lwt in + let t = + BLOCK.connect filename >>= fun block -> + B.connect block >>= fun qcow -> + B.get_info qcow >>= fun info -> + let data_loss = + let existing_size = + Int64.( + mul info.Mirage_block.size_sectors + (of_int info.Mirage_block.sector_size) + ) + in + existing_size > new_size + in + if (not ignore_data_loss) && data_loss then + return + (`Error + ( false + , "Making a disk smaller results in data loss:\n\ + disk is currently %Ld bytes which is larger than requested %Ld\n\ + .Please see the --ignore-data-loss option." + ) + ) + else + B.resize qcow ~new_size ~ignore_data_loss () >>= function + | Error _ -> + failwith + (Printf.sprintf "Failed to resize qcow formatted data on %s" + filename + ) + | Ok _ -> + return (`Ok ()) + in + Lwt_main.run t + +type output = [`Text | `Json] + +let is_zero buf = + let rec loop ofs = + ofs >= Cstruct.length buf + || (Cstruct.get_uint8 buf ofs = 0 && loop (ofs + 1)) + in + loop 0 + +let mapped filename _format ignore_zeroes = + let module B = Qcow.Make (Block) (Time) in + let open Lwt in + let t = + Block.connect filename >>= fun x -> + B.connect x >>= fun x -> + B.get_info x >>= fun info -> + Printf.printf "# offset (bytes), length (bytes)\n" ; + let module F = Mirage_block_combinators.Fast_fold (B) in + F.mapped_s + ~f:(fun () sector_ofs data -> + let sector_bytes = + Int64.(mul sector_ofs (of_int info.Mirage_block.sector_size)) + in + if (not ignore_zeroes) || not (is_zero data) then + Printf.printf "%Lx %d\n" sector_bytes (Cstruct.length data) ; + Lwt.return_unit + ) + () x + >>*= fun () -> return (Ok ()) + in + Lwt_main.run (t >>= handle_error B.pp_error) + +let finally f g = + try + let r = f () in + g () ; r + with e -> g () ; raise e + +type metadata = {blocks: Qcow.Int64.IntervalSet.t; total_size: int64} +[@@deriving sexp] + +let dehydrate _common input_filename output_filename = + let module B = Qcow.Make (ReadOnlyBlock) (Time) in + let open Lwt.Infix in + let t = + (* NB: all resources are only freed when the CLI exits. Don't copy this + code into a long-running program! *) + + (* Extract the set of metadata blocks *) + Block.connect input_filename >>= fun x -> + Block.get_info x >>= fun info -> + let total_size = + Int64.( + mul info.Mirage_block.size_sectors (of_int info.Mirage_block.sector_size) + ) + in + + let config = B.Config.create ~read_only:true () in + B.connect ~config x >>= fun qcow -> + let blocks = B.Debug.metadata_blocks qcow in + (* Open input and output file descriptors *) + let buffer = Cstruct.create 1048576 in + Lwt_unix.openfile input_filename [Unix.O_RDONLY] 0 >>= fun input_fd -> + Lwt_unix.openfile + (output_filename ^ ".meta") + [Unix.O_EXCL; Unix.O_CREAT; Unix.O_WRONLY] + 0o0644 + >>= fun output_fd -> + (* Append the metadata intervals from the `input_fd` to `metadata_fd` *) + Qcow.Int64.IntervalSet.fold_s + (fun i () -> + let x, y = Qcow.Int64.(IntervalSet.Interval.(x i, y i)) in + let rec loop x = + let remaining = Int64.(succ @@ sub y x) in + if remaining = 0L then + Lwt.return_unit + else + let this_time = + min (Cstruct.length buffer) (Int64.to_int remaining) + in + let fragment = Cstruct.sub buffer 0 this_time in + Lwt_unix.LargeFile.lseek input_fd x Lwt_unix.SEEK_SET >>= fun _ -> + Lwt_cstruct.(complete (read input_fd) fragment) >>= fun () -> + Lwt_cstruct.(complete (write output_fd) fragment) >>= fun () -> + loop (Int64.add x remaining) + in + loop x + ) + blocks () + >>= fun () -> + let metadata = {blocks; total_size} in + let sexp = sexp_of_metadata metadata in + Sexplib.Sexp.save_hum ~perm:0o0644 (output_filename ^ ".map") sexp ; + Lwt.return (`Ok ()) + in + Lwt_main.run t + +let rehydrate _common input_filename output_filename = + let open Lwt.Infix in + let t = + (* NB: all resources are only freed when the CLI exits. Don't copy this + code into a long-running program! *) + let sexp = Sexplib.Sexp.load_sexp (input_filename ^ ".map") in + let metadata = metadata_of_sexp sexp in + (* Open input and output file descriptors *) + let buffer = Cstruct.create 1048576 in + Lwt_unix.openfile (input_filename ^ ".meta") [Unix.O_RDONLY] 0o0644 + >>= fun input_fd -> + Lwt_unix.openfile output_filename + [Unix.O_EXCL; Unix.O_CREAT; Unix.O_WRONLY] + 0o0644 + >>= fun output_fd -> + Lwt_unix.LargeFile.lseek output_fd + (Int64.pred metadata.total_size) + Lwt_unix.SEEK_SET + >>= fun _ -> + Lwt_unix.write output_fd (Bytes.of_string "\000") 0 1 >>= fun _ -> + (* Append the metadata intervals from the `input_fd` to `metadata_fd` *) + Qcow.Int64.IntervalSet.fold_s + (fun i () -> + let x, y = Qcow.Int64.(IntervalSet.Interval.(x i, y i)) in + let rec loop x = + let remaining = Int64.(succ @@ sub y x) in + if remaining = 0L then + Lwt.return_unit + else + let this_time = + min (Cstruct.length buffer) (Int64.to_int remaining) + in + let fragment = Cstruct.sub buffer 0 this_time in + Lwt_unix.LargeFile.lseek output_fd x Lwt_unix.SEEK_SET >>= fun _ -> + Lwt_cstruct.(complete (read input_fd) fragment) >>= fun () -> + Lwt_cstruct.(complete (write output_fd) fragment) >>= fun () -> + loop (Int64.add x remaining) + in + loop x + ) + metadata.blocks () + >>= fun () -> Lwt.return (`Ok ()) + in + Lwt_main.run t + +let stream _common source output = + failwith + (Printf.sprintf "streaming from %s to %s is not implemented" source output) diff --git a/ocaml/qcow-tool/cli/main.ml b/ocaml/qcow-tool/cli/main.ml new file mode 100644 index 00000000000..d041d2a0c5c --- /dev/null +++ b/ocaml/qcow-tool/cli/main.ml @@ -0,0 +1,511 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Astring + +let project_url = "http://github.com/mirage/ocaml-qcow" + +open Cmdliner + +(* Help sections common to all commands *) + +let _common_options = "COMMON OPTIONS" + +let help = + [ + `S _common_options + ; `P "These options are common to all commands." + ; `S "MORE HELP" + ; `P "Use `$(mname) $(i,COMMAND) --help' for help on a single command." + ; `Noblank + ; `S "BUGS" + ; `P (Printf.sprintf "Check bug reports at %s" project_url) + ] + +(* Options common to all commands *) +let common_options_t = + let docs = _common_options in + let debug = + let doc = "Give only debug output." in + Arg.(value & flag & info ["debug"] ~docs ~doc) + in + let progress = + let doc = "Display a progress bar." in + Arg.(value & flag & info ["progress"] ~docs ~doc) + in + let progress_fd = + let doc = "Write machine-readable progress output." in + Arg.(value & opt (some int) None & info ["progress-fd"] ~docs ~doc) + in + Term.(const Common.make $ debug $ progress $ progress_fd) + +let filename = + let doc = Printf.sprintf "Path to the qcow2 file." in + Arg.(value & pos 0 file "test.qcow2" & info [] ~doc) + +let kib = 1024L + +let mib = Int64.mul kib 1024L + +let gib = Int64.mul mib 1024L + +let tib = Int64.mul gib 1024L + +let pib = Int64.mul tib 1024L + +let sizes = + List.sort + (fun (_, a) (_, b) -> compare a b) + [("KiB", kib); ("MiB", mib); ("GiB", gib); ("TiB", tib); ("PiB", pib)] + +let size_parser txt = + let prefix suffix txt = + let suffix' = String.length suffix in + let txt' = String.length txt in + String.with_range ~len:(txt' - suffix') txt + in + try + match + List.fold_left + (fun acc (suffix, multiplier) -> + match acc with + | Some x -> + Some x + | None when not (String.is_suffix ~affix:suffix txt) -> + None + | None -> + Some Int64.(mul multiplier (of_string (prefix suffix txt))) + ) + None sizes + with + | None -> + `Ok (Int64.of_string txt) + | Some x -> + `Ok x + with Failure _ -> `Error ("invalid size: " ^ txt) + +let size_printer ppf v = + let txt = + match + List.fold_left + (fun acc (suffix, multiplier) -> + match acc with + | Some x -> + Some x + | None when Int64.rem v multiplier = 0L -> + Some Int64.(to_string (div v multiplier) ^ suffix) + | None -> + None + ) + None sizes + with + | None -> + Int64.to_string v + | Some x -> + x + in + Format.fprintf ppf "%s" txt + +let size_converter = (size_parser, size_printer) + +let size = + let doc = Printf.sprintf "Virtual size of the qcow image" in + Arg.(value & opt size_converter 1024L & info ["size"] ~doc) + +let output_parser txt = + match String.Ascii.lowercase txt with + | "text" -> + `Ok `Text + | "json" -> + `Ok `Json + | _ -> + `Error "Unknown output format, expected either 'text' or 'json'" + +let output_printer ppf v = + Format.fprintf ppf "%s" (match v with `Text -> "text" | `Json -> "json") + +let output_converter = (output_parser, output_printer) + +let output_format = + let doc = "Desired output format" in + Arg.(value & opt output_converter `Text & info ["output"] ~doc) + +let strict_refcounts = + let doc = Printf.sprintf "Use strict (non-lazy) refcounts" in + Arg.(value & flag & info ["strict-refcounts"] ~doc) + +let output = + let doc = Printf.sprintf "Path to the output file." in + Arg.(value & pos 0 string "test.raw" & info [] ~doc) + +let trace = + let doc = Printf.sprintf "Print block device accesses for debugging" in + Arg.(value & flag & info ["trace"] ~doc) + +let ignore_data_loss = + let doc = + Printf.sprintf + "Ignore potential data loss and proceed anyway. Use with extreme caution!" + in + Arg.(value & flag & info ["ignore-data-loss"] ~doc) + +let ignore_zeroes = + let doc = "Scan for and ignore blocks which are full of zeroes" in + Arg.(value & flag & info ["ignore-zeroes"] ~doc) + +let filter = + let doc = "Path within the structure" in + Arg.(value & opt (some string) None & info ["filter"] ~doc) + +let info_cmd = + let doc = "display general information about a qcow2" in + let man = + [ + `S "DESCRIPTION" + ; `P "Display the contents of a qcow2 file header." + ; `P + "By default the full header is printed as an s-expression. To print \ + only some fields provide a --filter argument." + ; `S "EXAMPLES" + ; `P "To print the file size:" + ; `P "$(mname) info --filter .size" + ; `P "To print the dirty flag:" + ; `P "$(mname) info --filter .additional.[0].dirty" + ] + @ help + in + ( Term.(ret (const Impl.info $ filename $ filter)) + , Cmd.info "info" ~sdocs:_common_options ~doc ~man + ) + +let check_cmd = + let doc = "check the device for internal consistency" in + let man = + [ + `S "DESCRIPTION" + ; `P "Scan through the device and check for internal consistency" + ] + @ help + in + ( Term.(ret (const Impl.check $ filename)) + , Cmd.info "check" ~sdocs:_common_options ~doc ~man + ) + +let decode_cmd = + let doc = "decode qcow2 formatted data and write a raw image" in + let man = + [ + `S "DESCRIPTION"; `P "Decode qcow2 formatted data and write to a raw file." + ] + @ help + in + ( Term.(ret (const Impl.decode $ filename $ output)) + , Cmd.info "decode" ~sdocs:_common_options ~doc ~man + ) + +let encode_cmd = + let doc = "Convert the file from raw to qcow2" in + let man = [`S "DESCRIPTION"; `P "Convert a raw file to qcow2 ."] @ help in + ( Term.(ret (const Impl.encode $ filename $ output)) + , Cmd.info "encode" ~sdocs:_common_options ~doc ~man + ) + +let create_cmd = + let doc = "create a qcow-formatted data file" in + let man = [`S "DESCRIPTION"; `P "Create a qcow-formatted data file"] @ help in + ( Term.(ret (const Impl.create $ size $ strict_refcounts $ trace $ output)) + , Cmd.info "create" ~sdocs:_common_options ~doc ~man + ) + +let resize_cmd = + let doc = "Change the maximum virtual size of the disk." in + let man = + [ + `S "DESCRIPTION" + ; `P + "When a .qcow2 file is created, the physical file on disk is small but \ + the disk has a (usually much larger) 'virtual' size as seen from the \ + perspective of the client. A disk can usually be safely increased in \ + size without harming the contents. It's up to the client whether it \ + is able to use the new space or not." + ] + @ help + in + ( Term.(ret (const Impl.resize $ trace $ filename $ size $ ignore_data_loss)) + , Cmd.info "resize" ~sdocs:_common_options ~doc ~man + ) + +let unsafe_buffering = + let doc = + Printf.sprintf + "Run faster by caching writes in memory. A failure in the middle could \ + corrupt the file." + in + Arg.(value & flag & info ["unsafe-buffering"] ~doc) + +let discard_cmd = + let doc = "Scan for zeroes and discard them" in + let man = + [ + `S "DESCRIPTION" + ; `P + "Iterate over all allocated blocks in the image, and if a block only \ + contains zeroes, then invoke discard (aka TRIM or UNMAP) on it. This \ + helps shrink the blocks in the file." + ] + @ help + in + ( Term.(ret (const Impl.discard $ unsafe_buffering $ filename)) + , Cmd.info "discard" ~sdocs:_common_options ~doc ~man + ) + +let compact_cmd = + let doc = "Compact the file" in + let man = + [ + `S "DESCRIPTION" + ; `P + "Iterate over all the unallocated blocks ('holes') in the file created \ + by discard and move live data into them to shrink the file." + ] + @ help + in + ( Term.( + ret (const Impl.compact $ common_options_t $ unsafe_buffering $ filename) + ) + , Cmd.info "compact" ~sdocs:_common_options ~doc ~man + ) + +let repair_cmd = + let doc = "Regenerate the refcount table in an image" in + let man = + [ + `S "DESCRIPTION" + ; `P + "Regenerate the refcount table in an image to make it compliant with \ + the spec. We normally avoid updating the refcount at runtime as a \ + performance optimisation." + ] + @ help + in + ( Term.(ret (const Impl.repair $ unsafe_buffering $ filename)) + , Cmd.info "repair" ~sdocs:_common_options ~doc ~man + ) + +let sector = + let doc = Printf.sprintf "Virtual sector within the qcow2 image" in + Arg.(value & opt int64 0L & info ["sector"] ~doc) + +let text = + let doc = Printf.sprintf "Test to write into the qcow2 image" in + Arg.(value & opt string "" & info ["text"] ~doc) + +let write_cmd = + let doc = "Write a string to a virtual address in a qcow2 image" in + let man = + [ + `S "DESCRIPTION" + ; `P "Write a string at a given virtual sector offset in the qcow2 image." + ] + @ help + in + ( Term.(ret (const Impl.write $ filename $ sector $ text $ trace)) + , Cmd.info "write" ~sdocs:_common_options ~doc ~man + ) + +let length = + let doc = Printf.sprintf "Length of the data in 512-byte sectors" in + Arg.(value & opt int64 1L & info ["length"] ~doc) + +let read_cmd = + let doc = "Read a string from a virtual address in a qcow2 image" in + let man = + [ + `S "DESCRIPTION" + ; `P "Read a string at a given virtual sector offset in the qcow2 image." + ] + @ help + in + ( Term.(ret (const Impl.read $ filename $ sector $ length $ trace)) + , Cmd.info "read" ~sdocs:_common_options ~doc ~man + ) + +let mapped_cmd = + let doc = "Output a list of allocated extents, which may contain writes" in + let man = + [ + `S "DESCRIPTION" + ; `P + "When a .qcow2 file is created, it is guaranteed to be full of zeroes. \ + As data is written to the virtual disk, metadata is updated on the \ + physical file which allows us to list the regions which have been \ + written to." + ] + @ help + in + ( Term.(ret (const Impl.mapped $ filename $ output_format $ ignore_zeroes)) + , Cmd.info "mapped" ~sdocs:_common_options ~doc ~man + ) + +let pattern_number = + let doc = Printf.sprintf "Pattern number to write" in + Arg.(value & opt int 1 & info ["pattern"] ~doc) + +let pattern_cmd = + let doc = "Generate a .qcow2 with a test pattern" in + let man = + [ + `S "DESCRIPTION" + ; `P "Create a qcow2 file with a test pattern." + ; `P + "Pattern 1: write to every other cluster to stress the metadata \ + datastructure." + ; `P + "Pattern 2: write to the whole disk and then discard every other \ + cluster to produce the worst case for compaction." + ] + @ help + in + ( Term.( + ret + (const Impl.pattern + $ common_options_t + $ trace + $ output + $ size + $ pattern_number + ) + ) + , Cmd.info "pattern" ~sdocs:_common_options ~doc ~man + ) + +let sha_cmd = + let doc = "Compute a SHA1 from the contents of a qcow2" in + let man = + [ + `S "DESCRIPTION" + ; `P + "This is equivalent to decoding the qcow2 to a raw file and running \ + sha1sum." + ] + @ help + in + ( Term.(ret (const Impl.sha $ common_options_t $ filename)) + , Cmd.info "sha" ~sdocs:_common_options ~doc ~man + ) + +let dehydrate_cmd = + let doc = "Extract only the metadata blocks for debugging" in + let man = + [ + `S "DESCRIPTION" + ; `P + "Create 2 files: one containing metadata blocks and the second \ + containing a map of block to physical offset in the file. When \ + rehydrated the resulting file has the same structure as the original, \ + but with none of the data. It is therefore safe to share the \ + dehydrated file with other people without fearing data leaks. " + ; `P "To dehydrate a file input.qcow2 and produce dehydrated.{map,meta}:" + ; `P "qcow-tool dehydrate input.qcow2 dehydrated" + ] + @ help + in + let output = + let doc = Printf.sprintf "Prefix of the output files" in + Arg.(value & pos 1 string "dehydrated" & info [] ~doc) + in + ( Term.(ret (const Impl.dehydrate $ common_options_t $ filename $ output)) + , Cmd.info "dehydrate" ~sdocs:_common_options ~doc ~man + ) + +let rehydrate_cmd = + let doc = "Create a qcow2 file from a previously dehydrated file" in + let man = + [ + `S "DESCRIPTION" + ; `P + "Convert the files created by a previous call to dehydrate into a \ + valid qcow file which has the same structure as the original, but \ + with none of the data." + ; `P "To rehydrate files dehydrated.{map,meta} into output.qcow2:" + ; `P "qcow-tool rehydrate dehydrated output.qcow2" + ] + @ help + in + let filename = + let doc = Printf.sprintf "Prefix of the input files" in + Arg.(value & pos 0 string "dehydrated" & info [] ~doc) + in + let output = + let doc = Printf.sprintf "Output qcow2 file" in + Arg.(value & pos 1 string "output.qcow2" & info [] ~doc) + in + ( Term.(ret (const Impl.rehydrate $ common_options_t $ filename $ output)) + , Cmd.info "rehydrate" ~sdocs:_common_options ~doc ~man + ) + +let stream_cmd = + let doc = "stream the contents of a virtual disk" in + let man = + [ + `S "DESCRIPTION" + ; `P + "Read the contents of a virtual disk from a source and write it to\n\ + \ a destination that is a qcow2 file." + ] + @ help + in + let source = + let doc = Printf.sprintf "The disk to be streamed" in + Arg.(value & opt string "stdin:" & info ["source"] ~doc) + in + ( Term.(ret (const Impl.stream $ common_options_t $ source $ output)) + , Cmd.info "stream" ~sdocs:_common_options ~doc ~man + ) + +let cmds = + [ + info_cmd + ; create_cmd + ; check_cmd + ; repair_cmd + ; encode_cmd + ; decode_cmd + ; write_cmd + ; read_cmd + ; mapped_cmd + ; resize_cmd + ; discard_cmd + ; compact_cmd + ; pattern_cmd + ; sha_cmd + ; dehydrate_cmd + ; rehydrate_cmd + ; stream_cmd + ] + |> List.map (fun (t, i) -> Cmd.v i t) + +let () = + let default = + Term.(ret (const (fun _ -> `Help (`Pager, None)) $ common_options_t)) + in + let doc = "manipulate virtual disks stored in qcow2 files" in + let man = help in + let info = + Cmd.info "qcow-tool" ~version:"1.0.0" ~sdocs:_common_options ~doc ~man + in + let cmd = Cmd.group ~default info cmds in + exit (Cmd.eval cmd) diff --git a/ocaml/qcow-tool/doc/TRIM.md b/ocaml/qcow-tool/doc/TRIM.md new file mode 100644 index 00000000000..dc65156fecf --- /dev/null +++ b/ocaml/qcow-tool/doc/TRIM.md @@ -0,0 +1,86 @@ +# Making the disk smaller + +A qcow-formatted disk allocates blocks on demand and the file grows as more +blocks are allocated. A block may be marked as free by a "TRIM" or "discard" +operation. To cause the file to shrink we use a concurrent GC, described in +this document. + +## Why is this so complicated? + +The simplest possible implementation of "TRIM" or "discard" is to call a +filesystem API to "punch" holes out of the file. This is possible on BSDs and +Linux but not on macOS. On macOS we must physically shuffle the blocks away +from the end to make the file smaller. The implementation is further complicated +by the need to avoid calling `fflush` too often as it's very slow and hurts +performance. + +## General approach + +We keep track of the set of unused ("discarded") "clusters" (clusters are blocks +in a qcow2 file) and the references from one cluster to another. We have 2 +significant pieces of code: + +- the block allocator: previously this was a pointer to the next cluster and + always extended the file. Now this allocates from a free list of blocks which + have been erased and flushed. +- the block GC: this is completely new and is responsible for maintaining a + reasonably-sized free list of blocks and performing compaction of the disk + by moving clusters from the end of the file to holes nearer the beginning. + +When there are new "discarded" clusters (called "junk" clusters in the code) +we first top up the free list used by the block allocator, before using the +rest to compact the file. This is because + +- if we're going to overwrite a block as part of a move, it's pointless to + first erase and flush it +- if we're compacting the file and the file is extended because the free list is + empty then the allocation will be from the end and the new cluster will need to be + moved before we can shrink the file -- better to get the cluster placement + right first time than to have to move it immediately. + +Since `fflush` is very expensive we try to amortise the cost over many block +copies/erases. If there is outstanding unflushed work we will call `fflush` +after 5s, unless the user calls it themselves. + +At all times we try to avoid blocking I/O from the client as this can lead to +timeouts (e.g. AHCI controller resets). + +## States of clusters + +Clusters within a file can be in any one of the following states: + +- referenced and in-use +- `junk`: these have been recently discarded +- `erased`: these have been erased, but the zeroes have not been flushed which + means it's unsafe to use them. If the computer crashes then the old data + could re-appear. This is particularly bad for metadata blocks because we need + them to contain zeroes (interpreted as NULL pointers i.e. unallocated clusters) +- `available`: these have been erased and flushed and are safe to reallocate +- `Copying`: these are being moved to another place on the disk +- `Copied`: these have been duplicated but not flushed +- `Flushed`: these have been duplicated and the duplicated data has been flushed. + It is now safe to change the pointer to them. +- `Referenced`: these have been duplicated, the duplicate has been flushed and + the pointer has been changed but this has not been flushed. The old cluster + still cannot be reused because the pointer update might be undone by a crash; + but at least new writes go to the new location (and would obviously be lost if + the pointer update was undone by a crash -- but this is ok, unflushed updates + can be lost). After the next flush the original cluster becomes `junk` and the + new cluster becomes referenced and in-use. + +## Locking + +The principles are: + +- clusters are updated atomically +- client I/O is not blocked on the output of the GC. The GC proceeds optimistically + and can be rolled back + +We have the following locks + +- a per-cluster read/write lock: this guarantees clusters are updated atomically + and protects access to the per-cluster move state. When a cluster is written + to, any in-progress move can be marked as cancelled. +- a global metadata mutex: this is held when following the (cached) metadata + pointers and prevents following a pointer and it being immediately invalidated + by the GC diff --git a/ocaml/qcow-tool/doc/dashboard.json b/ocaml/qcow-tool/doc/dashboard.json new file mode 100644 index 00000000000..7dcb8f80893 --- /dev/null +++ b/ocaml/qcow-tool/doc/dashboard.json @@ -0,0 +1,822 @@ +{ + "__inputs": [ + { + "name": "DS_MIRAGE", + "label": "Mirage", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.3.0-beta1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_MIRAGE}", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "Mirage_qcow_used", + "format": "time_series", + "intervalFactor": 2, + "metric": "Mirage_qcow_used", + "refId": "A", + "step": 120 + }, + { + "expr": "Mirage_qcow_size", + "format": "time_series", + "intervalFactor": 2, + "metric": "Mirage_qcow_size", + "refId": "B", + "step": 120 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Total space", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_MIRAGE}", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(Mirage_qcow_reads[5m])", + "format": "time_series", + "intervalFactor": 2, + "metric": "Mirage_qcow_reads", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Virtual disk read rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_MIRAGE}", + "fill": 1, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(Mirage_qcow_writes[5m])", + "format": "time_series", + "intervalFactor": 2, + "metric": "Mirage_qcow_writes", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Virtual disk write rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_MIRAGE}", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(Mirage_qcow_discards[5m])", + "format": "time_series", + "intervalFactor": 2, + "metric": "Mirage_qcow_discards", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Virtual disk discard rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 266, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_MIRAGE}", + "fill": 1, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(Mirage_qcow_erased[5m])", + "format": "time_series", + "intervalFactor": 2, + "metric": "Mirage_qcow_erased", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Erased", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_MIRAGE}", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(Mirage_qcow_available[5m])", + "format": "time_series", + "intervalFactor": 2, + "metric": "Mirage_qcow_available", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Available for reallocation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 256, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_MIRAGE}", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(Mirage_qcow_copied[5m])", + "format": "time_series", + "intervalFactor": 2, + "metric": "Mirage_qcow_copied", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Copied", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_MIRAGE}", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(Mirage_qcow_flushed[5m])", + "format": "time_series", + "intervalFactor": 2, + "metric": "Mirage_qcow_flushed", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Flushed to disk", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_MIRAGE}", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(Mirage_qcow_referenced[5m])", + "format": "time_series", + "intervalFactor": 2, + "metric": "Mirage_qcow_referenced", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Referenced", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "2017-05-16T11:07:27.855Z", + "to": "2017-05-17T09:18:00.991Z" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Mirage", + "version": 0 +} \ No newline at end of file diff --git a/ocaml/qcow-tool/doc/prometheus.md b/ocaml/qcow-tool/doc/prometheus.md new file mode 100644 index 00000000000..23825e5de9a --- /dev/null +++ b/ocaml/qcow-tool/doc/prometheus.md @@ -0,0 +1,62 @@ +# Prometheus stats + +This library supports exposing disk statistics in "prometheus" format. + +In a client application, stats can be exposed by instantiating the +[Prometheus_app](https://github.com/mirage/prometheus/blob/master/app/prometheus_app.mli) +functor, see +[this example in the mirage/prometheus repo[(https://github.com/mirage/prometheus/blob/master/examples/example.ml) +or +[this example in the moby/hyperkit repo](https://github.com/moby/hyperkit/blob/70205a6d5143340299a679af259f70dfcd7cf8a4/src/lib/mirage_block_ocaml.ml#L188). + +Once exposed, stats can be gathered by an instance of [prometheus](https://prometheus.io) and +then rendered into dashboards by tools like [grafana](https://grafana.com). + +## Example + +Docker for Mac uses this qcow implementation and therefore has prometheus +support. First install the latest experimental version from the +[master branch](https://download-stage.docker.com/mac/master/Docker.dmg). + +Start the application once, and then shut it down again -- this will create +the initial configuration. + +Expose metrics on `0.0.0.0:9090` by: +``` +cd ~/Library/Containers/com.docker.docker/Data/database/ +git reset --hard +mkdir -p com.docker.driver.amd64-linux/disk +echo -n "tcp:9090" > com.docker.driver.amd64-linux/disk/stats +git add com.docker.driver.amd64-linux/disk/stats +git commit -s -m 'Expose stats on port 9090 on all interfaces' +``` + +Test the metrics are working by: +``` +curl http://localhost:9090/metrics +``` + +Download [prometheus.yml](https://raw.githubusercontent.com/mirage/ocaml-qcow/master/doc/prometheus.yml) + +Next run a prometheus server with: +``` +docker run -d -p 9091:9090 -v $(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml prom/prometheus +``` +There should now be a prometheus server on port 9091. If you browse http://localhost:9091 and +select the "Status" menu and then "Targets" you should see the target marked as "UP". + +Next run a grafana instance with: +``` +docker run -d --name=grafana -p 3000:3000 grafana/grafana +``` +Load http://localhost:3000/ in your browser, login with username "admin" and password "admin", +click "Add data source", fill in a name (e.g. "qcow"), set the type to "Prometheus", +change the URL to "http://localhost:9091", change the type to "direct" and click "Save & Test". +It should say "Success: Data source is working" + +Click on the Main menu, hover over "Dashboards" and select "Import". Import the +[dashboard.json](https://raw.githubusercontent.com/mirage/ocaml-qcow/master/doc/dashboard.json). + +Once sufficient data has been scraped, the dashboard should look like this: + +![screenshot](https://cloud.githubusercontent.com/assets/198586/26151381/7e53db66-3afa-11e7-8608-7ba015c49910.png) diff --git a/ocaml/qcow-tool/doc/prometheus.yml b/ocaml/qcow-tool/doc/prometheus.yml new file mode 100644 index 00000000000..45b5413e0ce --- /dev/null +++ b/ocaml/qcow-tool/doc/prometheus.yml @@ -0,0 +1,12 @@ +global: + scrape_interval: 5s # By default, scrape targets every 15 seconds. + +scrape_configs: + - job_name: 'qcow' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + static_configs: + - targets: ['192.168.65.1:9090'] + diff --git a/ocaml/qcow-tool/generator/dune b/ocaml/qcow-tool/generator/dune new file mode 100644 index 00000000000..9df6b5100e2 --- /dev/null +++ b/ocaml/qcow-tool/generator/dune @@ -0,0 +1,2 @@ +(executable + (name gen)) diff --git a/ocaml/qcow-tool/generator/gen.ml b/ocaml/qcow-tool/generator/gen.ml new file mode 100644 index 00000000000..57c0c833750 --- /dev/null +++ b/ocaml/qcow-tool/generator/gen.ml @@ -0,0 +1,22 @@ +let output_file = ref "lib/qcow_word_size.ml" + +let _ = + Arg.parse + [("-o", Arg.Set_string output_file, "output filename")] + (fun x -> + Printf.fprintf stderr "Unexpected argument: %s\n%!" x ; + exit 1 + ) + "Auto-detect the host word size" ; + + let oc = open_out !output_file in + ( match Sys.word_size with + | 64 -> + Printf.fprintf stderr "On a 64-bit machine so using 'int' for clusters\n" ; + output_string oc "module Cluster = Qcow_int\n" + | _ -> + Printf.fprintf stderr + "Not on a 64-bit machine to using 'int64' for clusters\n" ; + output_string oc "module Cluster = Qcow_int64\n" + ) ; + close_out oc diff --git a/ocaml/qcow-tool/lib/dune b/ocaml/qcow-tool/lib/dune new file mode 100644 index 00000000000..a2a00dd9160 --- /dev/null +++ b/ocaml/qcow-tool/lib/dune @@ -0,0 +1,26 @@ +(library + (name qcow) + (libraries + astring + cstruct + logs + lwt + mirage-block + mirage-block-unix + mirage-types.lwt + prometheus + io-page + sexplib + stdlib-shims + mirage-time fmt + ) + (wrapped false) + (preprocess + (pps ppx_sexp_conv) + ) +) + +(rule + (targets qcow_word_size.ml) + (action + (run ../generator/gen.exe -o %{targets}))) diff --git a/ocaml/qcow-tool/lib/qcow.ml b/ocaml/qcow-tool/lib/qcow.ml new file mode 100644 index 00000000000..0217f92631e --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow.ml @@ -0,0 +1,2585 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Result +open Qcow_types +module Error = Qcow_error +module Header = Qcow_header +module Virtual = Qcow_virtual +module Physical = Qcow_physical +module Locks = Qcow_locks +module Cstructs = Qcow_cstructs +module Int = Qcow_int +module Int64 = Qcow_types.Int64 + +let ( <| ) = Int64.shift_left + +let ( |> ) = Int64.shift_right_logical + +let src = + let src = Logs.Src.create "qcow" ~doc:"qcow2-formatted BLOCK device" in + Logs.Src.set_level src (Some Logs.Info) ; + src + +module Log = (val Logs.src_log src : Logs.LOG) + +module DebugSetting = struct let compact_mid_write = ref false end + +open Prometheus + +module Metrics = struct + let namespace = "Mirage" + + let subsystem = "qcow" + + let label_name = "id" + + let reads = + let help = "Number of bytes read" in + Counter.v_label ~label_name ~help ~namespace ~subsystem "reads" + + let writes = + let help = "Number of bytes written" in + Counter.v_label ~label_name ~help ~namespace ~subsystem "writes" + + let discards = + let help = "Number of bytes discarded" in + Counter.v_label ~label_name ~help ~namespace ~subsystem "discards" +end + +module Make (Base : Qcow_s.RESIZABLE_BLOCK) (Time : Mirage_time.S) = struct + (* samoht: `Msg should be the list of all possible exceptions *) + type error = [Mirage_block.error | `Msg of string] + + module Lwt_error = Qcow_error.Lwt_error + + (* samoht: `Msg should be the list of all possible exceptions *) + type write_error = [Mirage_block.write_error | `Msg of string] + + module Lwt_write_error = Qcow_error.Lwt_write_error + + let pp_error ppf = function + | #Mirage_block.error as e -> + Mirage_block.pp_error ppf e + | `Msg s -> + Fmt.string ppf s + + let pp_write_error ppf = function + | #Mirage_block.write_error as e -> + Mirage_block.pp_write_error ppf e + | `Msg s -> + Fmt.string ppf s + + module Config = Qcow_config + + (* Qemu-img will 'allocate' the last cluster by writing only the last sector. + Cope with this by assuming all later sectors are full of zeroes *) + module B = Qcow_padded.Make (Base) + + (* Run all threads in parallel, wait for all to complete, then iterate through + the results and return the first failure we discover. *) + let iter_p f xs = + let threads = List.map f xs in + Lwt_list.fold_left_s + (fun acc t -> + match acc with + | Error x -> + Lwt.return (Error x) (* first error wins *) + | Ok () -> + t + ) + (Ok ()) threads + + module Cache = Qcow_cache + module Recycler = Qcow_recycler.Make (B) (Time) + module Metadata = Qcow_metadata + + module Stats = struct + type t = {mutable nr_erased: int64; mutable nr_unmapped: int64} + + let zero = {nr_erased= 0L; nr_unmapped= 0L} + end + + type t = { + mutable h: Header.t + ; base: B.t + ; config: Config.t + ; info: Mirage_block.info + ; cache: Cache.t + ; locks: Locks.t + ; recycler: Recycler.t + ; metadata: Metadata.t + ; (* for convenience *) + cluster_bits: int + ; sector_size: int + ; mutable lazy_refcounts: bool + (* true if we are omitting refcounts right now *) + ; mutable stats: Stats.t + ; mutable cluster_map: Qcow_cluster_map.t + (* a live map of the allocated storage *) + ; cluster_map_m: Lwt_mutex.t + } + + let get_info t = Lwt.return t.info + + let to_config t = t.config + + let get_stats t = t.stats + + let malloc t = + let cluster_bits = Int32.to_int t.Header.cluster_bits in + let npages = max 1 (cluster_bits lsl (cluster_bits - 12)) in + let pages = Io_page.(to_cstruct (get npages)) in + Cstruct.sub pages 0 (1 lsl cluster_bits) + + (* Mmarshal a disk physical address written at a given offset within the disk. *) + let marshal_physical_address ?client t offset v = + let cluster = Physical.cluster ~cluster_bits:t.cluster_bits offset in + Metadata.update ?client t.metadata cluster (fun c -> + let addresses = Metadata.Physical.of_contents c in + let within = + Physical.within_cluster ~cluster_bits:t.cluster_bits offset + in + try + Metadata.Physical.set addresses within v ; + Lwt.return (Ok ()) + with e -> Lwt.fail e + ) + + (* Unmarshal a disk physical address written at a given offset within the disk. *) + let unmarshal_physical_address ?client t offset = + let cluster = Physical.cluster ~cluster_bits:t.cluster_bits offset in + let open Lwt_error.Infix in + Metadata.read_and_lock ?client t.metadata cluster >>= fun (c, lock) -> + let addresses = Metadata.Physical.of_contents c in + let within = Physical.within_cluster ~cluster_bits:t.cluster_bits offset in + Lwt.return (Ok (Metadata.Physical.get addresses within, lock)) + + let adapt_error : B.error -> error = function + | #Mirage_block.error as e -> + e + | _ -> + `Msg "Unknown error" + + let adapt_write_error : B.write_error -> write_error = function + | #Mirage_block.write_error as e -> + e + | _ -> + `Msg "Unknown error" + + let adapt_write_error_result = function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok x -> + Lwt.return_ok x + + let update_header t h = + let cluster = malloc t.h in + match Header.write h cluster with + | Result.Ok _ -> ( + let open Lwt.Infix in + B.write t.base 0L [cluster] >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> ( + Recycler.flush t.recycler >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> + Log.debug (fun f -> f "Written header") ; + t.h <- h ; + Lwt.return (Ok ()) + ) + ) + | Result.Error (`Msg m) -> + Lwt.return (Error (`Msg m)) + + let resize_base base sector_size cluster_map new_size = + let sector, within = Physical.to_sector ~sector_size new_size in + if within <> 0 then + Lwt.return + (Error + (`Msg + (Printf.sprintf + "Internal error: attempting to resize to a non-sector multiple \ + %s" + (Physical.to_string new_size) + ) + ) + ) + else + let open Lwt.Infix in + ( match cluster_map with + | Some (cluster_map, cluster_bits) -> + let cluster = Physical.cluster ~cluster_bits new_size in + Qcow_cluster_map.resize cluster_map cluster + | None -> + () + ) ; + B.resize base sector >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> + Log.debug (fun f -> + f "Resized device to %d bytes" (Qcow_physical.to_bytes new_size) + ) ; + Lwt.return (Ok ()) + + module ClusterIO = struct + (** Allocate [n] clusters and registers them as new roots in the cluster map + where [set] is a a set of possibly non-contiguous physical clusters which + are guaranteed to contain zeroes. + + This must be called via Locks.with_metadata_lock, to prevent + a parallel thread allocating another cluster for the same purpose. + This also prevents the recycling thread from resizing the file + concurrently. + *) + let allocate_clusters t n = + let sectors_per_cluster = (1 lsl t.cluster_bits) / t.sector_size in + + let open Lwt.Infix in + B.get_info t.base >>= fun base_info -> + let open Lwt_write_error.Infix in + (* If there is junk beyond the last block because someone just discarded + something then truncate the file to erase it. *) + let last_block = Qcow_cluster_map.get_last_block t.cluster_map in + let last_file_block = + Cluster.of_int + (Int64.to_int base_info.Mirage_block.size_sectors + / sectors_per_cluster + - 1 + ) + in + assert (last_block <= last_file_block) ; + let rest_of_file = + if last_block = last_file_block then + Cluster.IntervalSet.empty + else + Cluster.IntervalSet.( + add (Interval.make (Cluster.succ last_block) last_file_block) empty + ) + in + ( if + Cluster.IntervalSet.( + not + @@ is_empty + @@ inter rest_of_file + @@ Qcow_cluster_map.Junk.get t.cluster_map + ) + then ( + Log.debug (fun f -> + f + "Allocator: there is junk after the last block %s, shrinking \ + file" + (Cluster.to_string last_block) + ) ; + let size_clusters_should_be = Cluster.to_int last_block + 1 in + let p = Physical.make (size_clusters_should_be lsl t.cluster_bits) in + let size_sectors = Physical.sector ~sector_size:t.sector_size p in + resize_base t.base t.sector_size + (Some (t.cluster_map, t.cluster_bits)) + p + >>= fun () -> + Log.debug (fun f -> + f "Resized file to %d clusters (%Ld sectors)" + size_clusters_should_be size_sectors + ) ; + Lwt.return (Ok size_sectors) + ) else + Lwt.return (Ok base_info.Mirage_block.size_sectors) + ) + >>= fun size_sectors -> + let limit = 256 in + (* 16 MiB *) + let quantum = 512 in + + (* 32 MiB *) + let max_cluster_needed = Cluster.to_int last_block + n in + let len_cluster = Int64.to_int size_sectors / sectors_per_cluster in + let len_cluster_should_be = + if len_cluster - max_cluster_needed < limit then + len_cluster + quantum + else + len_cluster + in + (* keep it the same *) + ( if len_cluster_should_be <> len_cluster then ( + Log.info (fun f -> + f "Allocator: %s" + (Qcow_cluster_map.to_summary_string t.cluster_map) + ) ; + Log.info (fun f -> + f + "Allocator: file contains cluster 0 .. %d will enlarge file to \ + 0 .. %d" + (len_cluster - 1) + (len_cluster_should_be - 1) + ) ; + (* Resync the file size only *) + let p = Physical.make (len_cluster_should_be lsl t.cluster_bits) in + let size_sectors = Physical.sector ~sector_size:t.sector_size p in + resize_base t.base t.sector_size + (Some (t.cluster_map, t.cluster_bits)) + p + >>= fun () -> + Log.debug (fun f -> + f "Resized file to %d clusters (%Ld sectors)" + len_cluster_should_be size_sectors + ) ; + Lwt.return (Ok ()) + ) else + Lwt.return (Ok ()) + ) + >>= fun () -> + match Recycler.allocate t.recycler (Cluster.of_int n) with + | Some set -> + Log.debug (fun f -> f "Allocated %d clusters from free list" n) ; + Lwt.return (Ok set) + | None -> + assert false (* never happens because of the `resize_base` above *) + + module Refcount = struct + (* The refcount table contains pointers to clusters which themselves + contain the 2-byte refcounts *) + + let zero_all ?client t = + (* Zero all clusters allocated in the refcount table *) + let cluster = + Physical.cluster ~cluster_bits:t.cluster_bits + t.h.Header.refcount_table_offset + in + let refcount_table_clusters = + Int32.to_int t.h.Header.refcount_table_clusters + in + let rec loop i = + if i >= refcount_table_clusters then + Lwt.return (Ok ()) + else + (* `read` expects the function to be read-only, however we cheat and + perform write operations inside the read context *) + let open Lwt_error.Infix in + Metadata.read ?client t.metadata + Cluster.(add cluster (of_int i)) + (fun c -> + let addresses = Metadata.Physical.of_contents c in + let rec loop i = + if i >= Metadata.Physical.len addresses then + Lwt.return (Ok ()) + else + let open Lwt_write_error.Infix in + let addr = Metadata.Physical.get addresses i in + ( if Physical.to_bytes addr <> 0 then + let cluster = + Physical.cluster ~cluster_bits:t.cluster_bits addr + in + Metadata.update ?client t.metadata cluster (fun c -> + Metadata.erase c ; Lwt.return (Ok ()) + ) + >>= fun () -> + let open Lwt.Infix in + Recycler.flush t.recycler >>= adapt_write_error_result + else + Lwt.return (Ok ()) + ) + >>= fun () -> loop (i + 1) + in + let open Lwt.Infix in + loop 0 >>= function + | Error `Disconnected -> + Lwt.return (Error `Disconnected) + | Error `Is_read_only -> + Lwt.return (Error (`Msg "Device is read only")) + | Error (`Msg m) -> + Lwt.return (Error (`Msg m)) + | Ok () -> + Lwt.return (Ok ()) + ) + >>= fun () -> loop (i + 1) + in + loop 0 + + let read ?client t cluster = + let cluster = Cluster.to_int64 cluster in + let within_table = + Int64.(div cluster (Header.refcounts_per_cluster t.h)) + in + let within_cluster = + Int64.(to_int (rem cluster (Header.refcounts_per_cluster t.h))) + in + + let offset = + Physical.add t.h.Header.refcount_table_offset + (8 * Int64.to_int within_table) + in + let open Lwt_error.Infix in + unmarshal_physical_address ?client t offset >>= fun (offset, lock) -> + Lwt.finalize + (fun () -> + if Physical.to_bytes offset = 0 then + Lwt.return (Ok 0) + else + let cluster = + Physical.cluster ~cluster_bits:t.cluster_bits offset + in + Metadata.read ?client t.metadata cluster (fun c -> + let refcounts = Metadata.Refcounts.of_contents c in + Lwt.return + (Ok (Metadata.Refcounts.get refcounts within_cluster)) + ) + ) + (fun () -> Locks.unlock lock ; Lwt.return_unit) + + (** Decrement the refcount of a given cluster. This will never need to allocate. + We never bother to deallocate refcount clusters which are empty. *) + let really_decr ?client t cluster = + let cluster = Cluster.to_int64 cluster in + let within_table = + Int64.(div cluster (Header.refcounts_per_cluster t.h)) + in + let within_cluster = + Int64.(to_int (rem cluster (Header.refcounts_per_cluster t.h))) + in + + let offset = + Physical.add t.h.Header.refcount_table_offset + (8 * Int64.to_int within_table) + in + let open Lwt_write_error.Infix in + unmarshal_physical_address ?client t offset >>= fun (offset, lock) -> + Lwt.finalize + (fun () -> + if Physical.to_bytes offset = 0 then ( + Log.err (fun f -> + f + "Refcount.decr: cluster %Ld has no refcount cluster \ + allocated" + cluster + ) ; + Lwt.return + (Error + (`Msg + (Printf.sprintf + "Refcount.decr: cluster %Ld has no refcount cluster \ + allocated" + cluster + ) + ) + ) + ) else + let cluster = + Physical.cluster ~cluster_bits:t.cluster_bits offset + in + Metadata.update ?client t.metadata cluster (fun c -> + let refcounts = Metadata.Refcounts.of_contents c in + let current = + Metadata.Refcounts.get refcounts within_cluster + in + if current = 0 then ( + Log.err (fun f -> + f + "Refcount.decr: cluster %s already has a refcount of \ + 0" + (Cluster.to_string cluster) + ) ; + Lwt.return + (Error + (`Msg + (Printf.sprintf + "Refcount.decr: cluster %s already has a \ + refcount of 0" + (Cluster.to_string cluster) + ) + ) + ) + ) else ( + Metadata.Refcounts.set refcounts within_cluster (current - 1) ; + Lwt.return (Ok ()) + ) + ) + ) + (fun () -> Locks.unlock lock ; Lwt.return_unit) + + (** Increment the refcount of a given cluster. Note this might need + to allocate itself, to enlarge the refcount table. When this function + returns the refcount is guaranteed to have been persisted. *) + let rec really_incr ?client t cluster = + let open Lwt_write_error.Infix in + let cluster = Cluster.to_int64 cluster in + let within_table = + Int64.(div cluster (Header.refcounts_per_cluster t.h)) + in + let within_cluster = + Int64.(to_int (rem cluster (Header.refcounts_per_cluster t.h))) + in + + (* If the table (containing pointers to clusters which contain the refcounts) + is too small, then reallocate it now. *) + let cluster_containing_pointer = + let within_table_offset = Int64.mul within_table 8L in + within_table_offset |> t.cluster_bits + in + let current_size_clusters = + Int64.of_int32 t.h.Header.refcount_table_clusters + in + ( if cluster_containing_pointer >= current_size_clusters then + let needed = Header.max_refcount_table_size t.h in + (* Make sure this is actually an increase: make the table 2x larger if not *) + let needed = + if needed = current_size_clusters then + Int64.mul 2L current_size_clusters + else + needed + in + allocate_clusters t (Int64.to_int needed) >>= fun free -> + Lwt.finalize + (fun () -> + (* Erasing new blocks is handled after the copy *) + (* Copy any existing refcounts into new table *) + let buf = malloc t.h in + let rec loop free i = + if i >= Int32.to_int t.h.Header.refcount_table_clusters then + Lwt.return (Ok ()) + else + let physical = + Physical.add t.h.Header.refcount_table_offset + (i lsl t.cluster_bits) + in + let src = + Physical.cluster ~cluster_bits:t.cluster_bits physical + in + let first = + Cluster.IntervalSet.(Interval.x (min_elt free)) + in + let physical = + Physical.make (Cluster.to_int first lsl t.cluster_bits) + in + let dst = + Physical.cluster ~cluster_bits:t.cluster_bits physical + in + let open Lwt.Infix in + Recycler.copy t.recycler src dst >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> + let free = + Cluster.IntervalSet.( + remove (Interval.make first first) free + ) + in + loop free (i + 1) + in + loop free 0 >>= fun () -> + Log.debug (fun f -> f "Copied refcounts into new table") ; + (* Zero new blocks *) + Cstruct.memset buf 0 ; + let rec loop free i = + if i >= needed then + Lwt.return (Ok ()) + else + let first = + Cluster.IntervalSet.(Interval.x (min_elt free)) + in + let physical = + Physical.make (Cluster.to_int first lsl t.cluster_bits) + in + let sector, _ = + Physical.to_sector ~sector_size:t.sector_size physical + in + let open Lwt.Infix in + B.write t.base sector [buf] >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> + let free = + Cluster.IntervalSet.( + remove (Interval.make first first) free + ) + in + loop free (Int64.succ i) + in + loop free (Int64.of_int32 t.h.Header.refcount_table_clusters) + >>= fun () -> + let first = Cluster.IntervalSet.(Interval.x (min_elt free)) in + let refcount_table_offset = + Physical.make (Cluster.to_int first lsl t.cluster_bits) + in + let h' = + { + t.h with + Header.refcount_table_offset + ; refcount_table_clusters= Int64.to_int32 needed + } + in + update_header t h' >>= fun () -> + (* increase the refcount of the clusters we just allocated *) + let rec loop free i = + if i >= needed then + Lwt.return (Ok ()) + else + let first = + Cluster.IntervalSet.(Interval.x (min_elt free)) + in + really_incr ?client t first >>= fun () -> + let free = + Cluster.IntervalSet.( + remove (Interval.make first first) free + ) + in + loop free (Int64.succ i) + in + loop free 0L + ) + (fun () -> + Qcow_cluster_map.Roots.remove t.cluster_map free ; + Lwt.return_unit + ) + else + Lwt.return (Ok ()) + ) + >>= fun () -> + let offset = + Physical.add t.h.Header.refcount_table_offset + (8 * Int64.to_int within_table) + in + unmarshal_physical_address ?client t offset >>= fun (addr, lock) -> + Lwt.finalize + (fun () -> + ( if Physical.to_bytes addr = 0 then + allocate_clusters t 1 >>= fun free -> + Lwt.finalize + (fun () -> + let cluster = + Cluster.IntervalSet.(Interval.x (min_elt free)) + in + (* NB: the pointers in the refcount table are different from the pointers + in the cluster table: the high order bits are not used to encode extra + information and wil confuse qemu/qemu-img. *) + let addr = + Physical.make (Cluster.to_int cluster lsl t.cluster_bits) + in + (* zero the cluster *) + let buf = malloc t.h in + Cstruct.memset buf 0 ; + let sector, _ = + Physical.to_sector ~sector_size:t.sector_size addr + in + let open Lwt.Infix in + B.write t.base sector [buf] >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> ( + (* Ensure the new zeroed cluster has been persisted before we reference + it via `marshal_physical_address` *) + Recycler.flush t.recycler + >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> ( + Log.debug (fun f -> + f "Allocated new refcount cluster %s" + (Cluster.to_string cluster) + ) ; + let open Lwt_write_error.Infix in + marshal_physical_address ?client t offset addr + >>= fun () -> + let open Lwt.Infix in + Recycler.flush t.recycler >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> + let open Lwt_write_error.Infix in + really_incr ?client t cluster >>= fun () -> + Lwt.return (Ok addr) + ) + ) + ) + (fun () -> + Qcow_cluster_map.Roots.remove t.cluster_map free ; + Lwt.return_unit + ) + else + Lwt.return (Ok addr) + ) + >>= fun offset -> + let refcount_cluster = + Physical.cluster ~cluster_bits:t.cluster_bits offset + in + Metadata.update ?client t.metadata refcount_cluster (fun c -> + let refcounts = Metadata.Refcounts.of_contents c in + let current = Metadata.Refcounts.get refcounts within_cluster in + (* We don't support refcounts of more than 1 *) + assert (current == 0) ; + Metadata.Refcounts.set refcounts within_cluster (current + 1) ; + Lwt.return (Ok ()) + ) + ) + (fun () -> Locks.unlock lock ; Lwt.return_unit) + >>= fun () -> + let open Lwt.Infix in + Recycler.flush t.recycler >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> + Log.debug (fun f -> f "Incremented refcount of cluster %Ld" cluster) ; + Lwt.return (Ok ()) + + (* If the lazy refcounts feature is enabled then don't actually Increment + the refcounts. *) + let incr ?client t cluster = + if t.lazy_refcounts then + Lwt.return (Ok ()) + else + really_incr ?client t cluster + + let decr ?client t cluster = + if t.lazy_refcounts then + Lwt.return (Ok ()) + else + really_decr ?client t cluster + end + + let read_l1_table ?client t l1_index = + (* Read l1[l1_index] as a 64-bit offset *) + let l1_index_offset = + Physical.shift t.h.Header.l1_table_offset (8 * Int64.to_int l1_index) + in + unmarshal_physical_address ?client t l1_index_offset + + (* Find the first l1_index whose values satisfies [f] *) + let find_mapped_l1_table t l1_index = + let open Lwt_error.Infix in + (* Read l1[l1_index] as a 64-bit offset *) + let rec loop l1_index = + if l1_index >= Int64.of_int32 t.h.Header.l1_size then + Lwt.return (Ok None) + else + let l1_index_offset = + Physical.shift t.h.Header.l1_table_offset (8 * Int64.to_int l1_index) + in + + let cluster = + Physical.cluster ~cluster_bits:t.cluster_bits l1_index_offset + in + + Metadata.read t.metadata cluster (fun c -> + let addresses = Metadata.Physical.of_contents c in + let within = + Physical.within_cluster ~cluster_bits:t.cluster_bits + l1_index_offset + in + let rec loop l1_index i : [`Skip of int | `GotOne of int64] = + if i >= Metadata.Physical.len addresses then + `Skip i + else if Metadata.Physical.get addresses i <> Physical.unmapped + then + `GotOne l1_index + else + loop (Int64.succ l1_index) (i + 1) + in + Lwt.return (Ok (loop l1_index within)) + ) + >>= function + | `GotOne l1_index' -> + Lwt.return (Ok (Some l1_index')) + | `Skip n -> + loop Int64.(add l1_index (of_int n)) + in + loop l1_index + + let write_l1_table ?client t l1_index l2_table_offset = + let open Lwt_write_error.Infix in + (* Always set the mutable flag *) + let l2_table_offset = + if l2_table_offset = Physical.unmapped then + Physical.unmapped (* don't set metadata bits for unmapped clusters *) + else + Physical.make ~is_mutable:true (Physical.to_bytes l2_table_offset) + in + (* Write l1[l1_index] as a 64-bit offset *) + let l1_index_offset = + Physical.shift t.h.Header.l1_table_offset (8 * Int64.to_int l1_index) + in + marshal_physical_address ?client t l1_index_offset l2_table_offset + >>= fun () -> + Log.debug (fun f -> + f "Written l1_table[%Ld] <- %s" l1_index + (Cluster.to_string + @@ Physical.cluster ~cluster_bits:t.cluster_bits l2_table_offset + ) + ) ; + Lwt.return (Ok ()) + + let read_l2_table ?client t l2_table_offset l2_index = + let l2_index_offset = + Physical.shift l2_table_offset (8 * Int64.to_int l2_index) + in + unmarshal_physical_address ?client t l2_index_offset + + let write_l2_table ?client t l2_table_offset l2_index cluster = + let open Lwt_write_error.Infix in + (* Always set the mutable flag *) + let cluster = + if cluster = Physical.unmapped then + Physical.unmapped (* don't set metadata bits for unmapped clusters *) + else + Physical.make ~is_mutable:true (Physical.to_bytes cluster) + in + let l2_index_offset = + Physical.shift l2_table_offset (8 * Int64.to_int l2_index) + in + marshal_physical_address ?client t l2_index_offset cluster >>= fun _ -> + Log.debug (fun f -> + f "Written l2_table[%Ld] <- %s" l2_index + (Cluster.to_string + @@ Physical.cluster ~cluster_bits:t.cluster_bits cluster + ) + ) ; + Lwt.return (Ok ()) + + (* Walk the L1 and L2 tables to translate an address. If a table entry + is unallocated then return [None]. Note if a [walk_and_allocate] is + racing with us then we may or may not see the mapping. *) + let walk_readonly ?client t a = + let open Lwt_error.Infix in + Locks.with_metadata_lock t.locks (fun () -> + read_l1_table ?client t a.Virtual.l1_index + >>= fun (l2_table_offset, l1_lock) -> + let ( >>|= ) m f = + let open Lwt in + m >>= function + | Error x -> + Lwt.return (Error x) + | Ok None -> + Lwt.return (Ok None) + | Ok (Some x) -> + f x + in + + (* Look up an L2 table *) + ( if Physical.to_bytes l2_table_offset = 0 then ( + Locks.unlock l1_lock ; Lwt.return (Ok None) + ) else ( + if Physical.is_compressed l2_table_offset then + failwith "compressed" ; + Lwt.return (Ok (Some l2_table_offset)) + ) + ) + >>|= fun l2_table_offset -> + (* Look up a cluster *) + read_l2_table ?client t l2_table_offset a.Virtual.l2_index + >>= fun (cluster_offset, l2_lock) -> + ( if Physical.to_bytes cluster_offset = 0 then ( + Locks.unlock l1_lock ; Locks.unlock l2_lock ; Lwt.return (Ok None) + ) else ( + if Physical.is_compressed cluster_offset then + failwith "compressed" ; + Lwt.return (Ok (Some cluster_offset)) + ) + ) + >>|= fun cluster_offset -> + let p = + Physical.shift cluster_offset (Int64.to_int a.Virtual.cluster) + in + Lwt.return (Ok (Some (p, l1_lock, l2_lock))) + ) + + (* Walk the L1 and L2 tables to translate an address, allocating missing + entries as we go. *) + let walk_and_allocate ?client t a = + let open Lwt_write_error.Infix in + Locks.with_metadata_lock t.locks (fun () -> + read_l1_table ?client t a.Virtual.l1_index + >>= fun (l2_offset, l1_lock) -> + (* If there is no L2 table entry then allocate L2 and data clusters + at the same time to minimise I/O *) + ( if Physical.to_bytes l2_offset = 0 then + allocate_clusters t 2 >>= fun free -> + Lwt.finalize + (fun () -> + (* FIXME: it's unnecessary to write to the data cluster if we're + about to overwrite it with real data straight away *) + let open Lwt_write_error.Infix in + let l2_cluster = + Cluster.IntervalSet.(Interval.x (min_elt free)) + in + let free = + Cluster.IntervalSet.( + remove (Interval.make l2_cluster l2_cluster) free + ) + in + let data_cluster = + Cluster.IntervalSet.(Interval.x (min_elt free)) + in + Refcount.incr t l2_cluster >>= fun () -> + Refcount.incr t data_cluster >>= fun () -> + let l2_offset = + Physical.make (Cluster.to_int l2_cluster lsl t.cluster_bits) + in + let data_offset = + Physical.make + (Cluster.to_int data_cluster lsl t.cluster_bits) + in + write_l2_table ?client t l2_offset a.Virtual.l2_index + data_offset + >>= fun () -> + read_l2_table ?client t l2_offset a.Virtual.l2_index + >>= fun (data_offset', l2_lock) -> + (* NB the new blocks can't be moved within the `allocate_clusters` callback + since they are registered as global roots *) + assert ( + Physical.to_bytes data_offset + = Physical.to_bytes data_offset' + ) ; + write_l1_table ?client t a.Virtual.l1_index l2_offset + >>= fun () -> Lwt.return (Ok (data_offset, l1_lock, l2_lock)) + ) + (fun () -> + Qcow_cluster_map.Roots.remove t.cluster_map free ; + Lwt.return_unit + ) + else + read_l2_table ?client t l2_offset a.Virtual.l2_index + >>= fun (data_offset, l2_lock) -> + if Physical.to_bytes data_offset = 0 then + allocate_clusters t 1 >>= fun free -> + Lwt.finalize + (fun () -> + let open Lwt_write_error.Infix in + let data_cluster = + Cluster.IntervalSet.(Interval.x (min_elt free)) + in + Refcount.incr t data_cluster >>= fun () -> + let data_offset = + Physical.make + (Cluster.to_int data_cluster lsl t.cluster_bits) + in + let open Lwt_write_error.Infix in + write_l2_table ?client t l2_offset a.Virtual.l2_index + data_offset + >>= fun () -> Lwt.return (Ok (data_offset, l1_lock, l2_lock)) + ) + (fun () -> + Qcow_cluster_map.Roots.remove t.cluster_map free ; + Lwt.return_unit + ) + else ( + if Physical.is_compressed data_offset then failwith "compressed" ; + Lwt.return (Ok (data_offset, l1_lock, l2_lock)) + ) + ) + >>= fun (data_offset, l1_lock, l2_lock) -> + let p = Physical.shift data_offset (Int64.to_int a.Virtual.cluster) in + Lwt.return (Ok (p, l1_lock, l2_lock)) + ) + + let walk_and_deallocate ?client t sector n = + let open Lwt_write_error.Infix in + let sectors_per_cluster = + Int64.(div (1L <| t.cluster_bits) (of_int t.sector_size)) + in + Locks.with_metadata_lock t.locks (fun () -> + let get_l2 sector = + let byte = + Int64.(mul sector (of_int t.info.Mirage_block.sector_size)) + in + let a = Virtual.make ~cluster_bits:t.cluster_bits byte in + read_l1_table ?client t a.Virtual.l1_index + >>= fun (l2_offset, l1_lock) -> + if Physical.to_bytes l2_offset = 0 then ( + Locks.unlock l1_lock ; Lwt.return (Ok None) + ) else + let l2_index_offset = + Physical.shift l2_offset (8 * Int64.to_int a.Virtual.l2_index) + in + let cluster = + Physical.cluster ~cluster_bits:t.cluster_bits l2_index_offset + in + let within = + Physical.within_cluster ~cluster_bits:t.cluster_bits + l2_index_offset + in + Lwt.return (Ok (Some ((cluster, within), l1_lock))) + in + let rec loop sector n = + if n = 0L then + Lwt.return (Ok ()) + else + (get_l2 sector >>= function + | None -> + (* FIXME: we can almost certainly jump more than this *) + Lwt.return (Ok sectors_per_cluster) + | Some ((cluster, _), l1_lock) -> + Lwt.finalize + (fun () -> + Metadata.update ?client t.metadata cluster (fun c -> + let addresses = Metadata.Physical.of_contents c in + (* With the cluster write lock held, complete as many + writes to it as we need, unlocking and writing it out + once at the end. *) + let rec inner acc sector n = + if n = 0L then + Lwt.return (Ok acc) + else + get_l2 sector >>= function + | None -> + Lwt.return (Ok acc) + | Some ((cluster', _), l1_lock) + when cluster <> cluster' -> + Locks.unlock l1_lock ; Lwt.return (Ok acc) + | Some ((_, within), l1_lock) -> + Locks.unlock l1_lock ; + (* still locked above *) + let data_offset = + Metadata.Physical.get addresses within + in + if Physical.to_bytes data_offset = 0 then + inner + (Int64.add acc sectors_per_cluster) + (Int64.add sector sectors_per_cluster) + (Int64.sub n sectors_per_cluster) + else + (* The data at [data_offset] is about to become an unreferenced + hole in the file *) + let current = + Metadata.Physical.get addresses within + in + ( if current <> Physical.unmapped then + Locks.Write.with_lock t.locks ?client + (Physical.cluster + ~cluster_bits:t.cluster_bits + current + ) (fun () -> + (* It's important to hold the write lock because we might + be about to erase or copy this block *) + Metadata.Physical.set addresses + within Physical.unmapped ; + t.stats.Stats.nr_unmapped <- + Int64.add + t.stats.Stats.nr_unmapped + sectors_per_cluster ; + Lwt.return (Ok ()) + ) + else + Lwt.return (Ok ()) + ) + >>= fun () -> + Refcount.decr t + (Physical.cluster + ~cluster_bits:t.cluster_bits + data_offset + ) + >>= fun () -> + inner + (Int64.add acc sectors_per_cluster) + (Int64.add sector sectors_per_cluster) + (Int64.sub n sectors_per_cluster) + in + inner 0L sector n + ) + ) + (fun () -> Locks.unlock l1_lock ; Lwt.return_unit) + ) + >>= fun to_advance -> + loop (Int64.add sector to_advance) (Int64.sub n to_advance) + in + loop sector n + ) + end + + (* Starting at byte offset [ofs], map a list of buffers onto a list of + [byte offset, buffer] pairs, where + - no [byte offset, buffer] pair crosses an [alignment] boundary; + - each [buffer] is as large as possible (so for example if we supply + one large buffer it will only be fragmented to the minimum extent. *) + let rec chop_into_aligned alignment ofs = function + | [] -> + [] + | buf :: bufs -> + (* If we're not aligned, sync to the next boundary *) + let into = Int64.(to_int (sub alignment (rem ofs alignment))) in + if Cstruct.length buf > into then + let this = (ofs, Cstruct.sub buf 0 into) in + let rest = + chop_into_aligned alignment + Int64.(add ofs (of_int into)) + (Cstruct.shift buf into :: bufs) + in + this :: rest + else + (ofs, buf) + :: chop_into_aligned alignment + Int64.(add ofs (of_int (Cstruct.length buf))) + bufs + + type work = { + sector: int64 (* starting sector of the operaiton *) + ; bufs: Cstruct.t list + ; metadata_locks: Locks.lock list + (* read locks on the metadata pointing to the physical clusters: our guarantee + that the target physical clusters haven't been moved and the references + rewritten *) + } + + (* Given a list of offset, buffer pairs for reading or writing, coalesce + adjacent offsets for readv/writev *) + let coalesce_into_adjacent sector_size = + let rec loop sector bufs metadata_locks next_sector acc = function + | [] -> + List.rev ({sector; bufs= List.rev bufs; metadata_locks} :: acc) + | work :: rest -> + let next_sector' = + Int64.( + add work.sector (of_int (Cstructs.len work.bufs / sector_size)) + ) + in + if next_sector = work.sector then + loop sector (work.bufs @ bufs) + (work.metadata_locks @ metadata_locks) + next_sector' acc rest + else + loop work.sector work.bufs work.metadata_locks next_sector' + ({sector; bufs= List.rev bufs; metadata_locks} :: acc) + rest + in + function + | [] -> + [] + | work :: rest -> + let next_sector' = + Int64.(add work.sector (of_int (Cstructs.len work.bufs / sector_size))) + in + loop work.sector work.bufs work.metadata_locks next_sector' [] rest + + exception Reference_outside_file of int64 * int64 + + let make_cluster_map t ?id () = + let open Qcow_cluster_map in + let sectors_per_cluster = + Int64.(div (1L <| t.cluster_bits) (of_int t.sector_size)) + in + let open Lwt.Infix in + B.get_info t.base >>= fun base_info -> + let max_cluster = + Cluster.of_int64 + @@ Int64.div base_info.Mirage_block.size_sectors sectors_per_cluster + in + (* Iterate over the all clusters referenced from all the tables in the file + and (a) construct a set of free clusters; and (b) construct a map of + physical cluster back to virtual. The free set will show us the holes, + and the map will tell us where to get the data from to fill the holes in + with. *) + let refs = ref Cluster.Map.empty in + + let refcount_start_cluster = + Cluster.to_int64 + @@ Physical.cluster ~cluster_bits:t.cluster_bits + t.h.Header.refcount_table_offset + in + let int64s_per_cluster = 1L <| Int32.to_int t.h.Header.cluster_bits - 3 in + let l1_table_start_cluster = + Cluster.to_int64 + @@ Physical.cluster ~cluster_bits:t.cluster_bits + t.h.Header.l1_table_offset + in + let l1_table_clusters = + Int64.( + div + (round_up (of_int32 t.h.Header.l1_size) int64s_per_cluster) + int64s_per_cluster + ) + in + (* Assume all clusters are free. Note when the file is sparse we can exceed the max + possible cluster. This is only a sanity check to catch crazily-wrong inputs. *) + let cluster_size = 1L <| t.cluster_bits in + let max_possible_cluster = + Cluster.of_int64 + (Int64.round_up t.h.Header.size cluster_size |> t.cluster_bits) + in + let free = + Qcow_bitmap.make_full + ~initial_size:(Cluster.to_int max_cluster) + ~maximum_size:(Cluster.to_int max_possible_cluster * 50) + in + (* The header structures are untracked by the qcow_cluster_map and we assume + they don't move and we don't try to move them. We assume the structures + have no holes in them, otherwise we would miscompute the `first_movable_cluster` + and accidentally truncate the file. *) + Qcow_bitmap.( + remove + (Interval.make 0L + Int64.(pred @@ add l1_table_start_cluster l1_table_clusters) + ) + free + ) ; + Qcow_bitmap.( + remove + (Interval.make 0L + Int64.( + pred + @@ add refcount_start_cluster + (Int64.of_int32 t.h.Header.refcount_table_clusters) + ) + ) + free + ) ; + Qcow_bitmap.(remove (Interval.make 0L 0L) free) ; + let first_movable_cluster = + try Cluster.of_int64 @@ Qcow_bitmap.min_elt free + with Not_found -> max_cluster (* header takes up the whole file *) + in + let parse x = + if x = Physical.unmapped then + Cluster.zero + else + Physical.cluster ~cluster_bits:t.cluster_bits x + in + + let mark rf cluster = + let c, w = rf in + if cluster > max_cluster then ( + Log.err (fun f -> + f + "Found a reference to cluster %s outside the file (max cluster \ + %s) from cluster %s.%d" + (Cluster.to_string cluster) + (Cluster.to_string max_cluster) + (Cluster.to_string c) w + ) ; + let src = + Int64.add (Int64.of_int w) + (Cluster.to_int64 c <| Int32.to_int t.h.Header.cluster_bits) + in + let dst = + Cluster.to_int64 cluster <| Int32.to_int t.h.Header.cluster_bits + in + raise (Reference_outside_file (src, dst)) + ) ; + let c, w = rf in + if cluster = Cluster.zero then + () + else ( + if Cluster.Map.mem cluster !refs then ( + let c', w' = Cluster.Map.find cluster !refs in + Log.err (fun f -> + f "Found two references to cluster %s: %s.%d and %s.%d" + (Cluster.to_string cluster) + (Cluster.to_string c) w (Cluster.to_string c') w' + ) ; + raise + (Error.Duplicate_reference + ( (Cluster.to_int64 c, w) + , (Cluster.to_int64 c', w') + , Cluster.to_int64 cluster + ) + ) + ) ; + Qcow_bitmap.( + remove + (Interval.make (Cluster.to_int64 cluster) (Cluster.to_int64 cluster)) + free + ) ; + refs := Cluster.Map.add cluster rf !refs + ) + in + + (* scan the refcount table *) + let open Lwt_error.Infix in + let rec loop i = + if i >= Int64.of_int32 t.h.Header.refcount_table_clusters then + Lwt.return (Ok ()) + else + let refcount_cluster = + Cluster.of_int64 @@ Int64.(add refcount_start_cluster i) + in + Metadata.read t.metadata refcount_cluster (fun c -> + let addresses = Metadata.Physical.of_contents c in + let rec loop i = + if i >= Metadata.Physical.len addresses then + Lwt.return (Ok ()) + else + let cluster = parse (Metadata.Physical.get addresses i) in + mark (refcount_cluster, i) cluster ; + loop (i + 1) + in + loop 0 + ) + >>= fun () -> loop (Int64.succ i) + in + loop 0L >>= fun () -> + (* scan the L1 and L2 tables, marking the L2 and data clusters *) + let rec l1_iter i = + let l1_table_cluster = + Cluster.of_int64 @@ Int64.(add l1_table_start_cluster i) + in + if i >= l1_table_clusters then + Lwt.return (Ok ()) + else + Metadata.read t.metadata l1_table_cluster (fun c -> + let l1 = Metadata.Physical.of_contents c in + Lwt.return (Ok l1) + ) + >>= fun l1 -> + let rec l2_iter i = + if i >= Metadata.Physical.len l1 then + Lwt.return (Ok ()) + else + let l2_table_cluster = parse (Metadata.Physical.get l1 i) in + if l2_table_cluster <> Cluster.zero then ( + mark (l1_table_cluster, i) l2_table_cluster ; + Metadata.read t.metadata l2_table_cluster (fun c -> + let l2 = Metadata.Physical.of_contents c in + Lwt.return (Ok l2) + ) + >>= fun l2 -> + let rec data_iter i = + if i >= Metadata.Physical.len l2 then + Lwt.return (Ok ()) + else + let cluster = parse (Metadata.Physical.get l2 i) in + mark (l2_table_cluster, i) cluster ; + data_iter (i + 1) + in + data_iter 0 >>= fun () -> l2_iter (i + 1) + ) else + l2_iter (i + 1) + in + l2_iter 0 >>= fun () -> l1_iter (Int64.succ i) + in + l1_iter 0L >>= fun () -> + let map = + make ~free ~refs:!refs ~first_movable_cluster ~cache:t.cache + ~runtime_asserts:t.config.Config.runtime_asserts ~id + ~cluster_size:(Int64.to_int cluster_size) + in + + Lwt.return (Ok map) + + type check_result = {free: int64; used: int64} + + type compact_result = { + copied: int64 + ; refs_updated: int64 + ; old_size: int64 + ; new_size: int64 + } + + let compact t ?(progress_cb = fun ~percent:_ -> ()) () = + if t.config.Config.read_only then + Lwt.return (Error `Is_read_only) + else + (* We will return a cancellable task to the caller, and on cancel we will + set the cancel_requested flag. The main compact loop will detect this + and complete the moves already in progress before returning. *) + let cancel_requested = ref false in + + let th, u = Lwt.task () in + Lwt.on_cancel th (fun () -> + Log.info (fun f -> f "cancellation of compact requested") ; + cancel_requested := true + ) ; + (* Catch stray exceptions and return as unknown errors *) + let open Lwt.Infix in + Lwt.async (fun () -> + Lwt.catch + (fun () -> + let open Lwt_write_error.Infix in + let open Qcow_cluster_map in + let map = t.cluster_map in + Log.debug (fun f -> f "Disk clusters: %s" (to_summary_string map)) ; + let start_last_block = get_last_block map in + + let sector_size = Int64.of_int t.sector_size in + let cluster_bits = Int32.to_int t.h.Header.cluster_bits in + let sectors_per_cluster = + Int64.div (1L <| cluster_bits) sector_size + in + + let one_pass ?progress_cb () = + Qcow_cluster_map.Debug.assert_no_leaked_blocks map ; + + let moves = Qcow_cluster_map.start_moves map in + let open Lwt_write_error.Infix in + Recycler.move_all ?progress_cb t.recycler moves >>= fun () -> + (* Flush now so that if we crash after updating some of the references, the + destination blocks will contain the correct data. *) + let open Lwt.Infix in + Recycler.flush t.recycler >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> ( + let open Lwt_write_error.Infix in + Recycler.update_references t.recycler + >>= fun refs_updated -> + (* Flush now so that the pointers are persisted before we truncate the file *) + let open Lwt.Infix in + Recycler.flush t.recycler >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> + Lwt.return (Ok refs_updated) + ) + in + one_pass + ~progress_cb:(fun ~percent -> + progress_cb ~percent:(percent * 80 / 100) + ) + () + >>= fun refs_updated -> + if refs_updated <> 0L then + Log.info (fun f -> + f "Pass 1: %Ld references updated" refs_updated + ) ; + (* modifying a L2 metadata block will have cancelled the move, so + perform an additional pass. *) + one_pass + ~progress_cb:(fun ~percent -> + progress_cb ~percent:(80 + (percent * 4 / 100)) + ) + () + >>= fun refs_updated' -> + if refs_updated' <> 0L then + Log.info (fun f -> + f "Pass 2: %Ld references updated" refs_updated' + ) ; + one_pass () >>= fun refs_updated'' -> + if refs_updated'' <> 0L then + Log.err (fun f -> + f + "Failed to reach a fixed point after %Ld, %Ld and %Ld \ + block moves" + refs_updated refs_updated' refs_updated'' + ) ; + + let last_block = get_last_block map in + let open Lwt_write_error.Infix in + ( if last_block <> start_last_block then ( + Log.info (fun f -> + f "Shrink file so that last cluster was %s, now %s" + (Cluster.to_string start_last_block) + (Cluster.to_string last_block) + ) ; + + let p = + Physical.make + ((Cluster.to_int last_block + 1) lsl t.cluster_bits) + in + let size_sectors = + Physical.sector ~sector_size:t.sector_size p + in + resize_base t.base t.sector_size + (Some (t.cluster_map, t.cluster_bits)) + p + >>= fun () -> + Log.debug (fun f -> + f "Resized file to %s clusters (%Ld sectors)" + (Cluster.to_string last_block) + size_sectors + ) ; + Lwt.return (Ok ()) + ) else + Lwt.return (Ok ()) + ) + >>= fun () -> + progress_cb ~percent:100 ; + + let total_refs_updated = + Int64.(add (add refs_updated refs_updated') refs_updated'') + in + let copied = Int64.(mul total_refs_updated sectors_per_cluster) in + (* one ref per block *) + let old_size = + Int64.mul + (Cluster.to_int64 start_last_block) + sectors_per_cluster + in + let new_size = + Int64.mul (Cluster.to_int64 last_block) sectors_per_cluster + in + let report = {refs_updated; copied; old_size; new_size} in + if copied <> 0L || total_refs_updated <> 0L then + Log.info (fun f -> + f + "%Ld sectors copied, %Ld references updated, file shrunk \ + by %Ld sectors" + copied total_refs_updated + (Int64.sub old_size new_size) + ) ; + Lwt.return (Ok report) + ) + (fun e -> Lwt.return (Error (`Msg (Printexc.to_string e)))) + >>= fun result -> Lwt.wakeup u result ; Lwt.return_unit + ) ; + th + + (* If a request from the client takes more than ~30s then the client may + decide that the storage layer has failed. This could happen if a thread + was starved or if there's deadlock, so try to detect it and log something + useful. *) + let with_deadline t describe_fn nsec f = + let open Lwt.Infix in + let timeout = + Time.sleep_ns nsec >>= fun () -> Lwt.return (Error `Timeout) + in + let work = f () in + Lwt.choose [timeout; (work >>= fun x -> Lwt.return (Ok x))] >>= function + | Error `Timeout -> + Log.err (fun f -> f "%s: I/O deadline exceeded" (describe_fn ())) ; + Locks.Debug.dump_state t.locks ; + work (* return the answer anyway *) + | Ok x -> + Lwt.cancel timeout ; Lwt.return x + + let time_30s = 30_000_000_000L + + let read t sector bufs = + let describe_fn () = + Printf.sprintf "read sector = %Ld length = %d" sector (Cstructs.len bufs) + in + with_deadline t describe_fn time_30s (fun () -> + let open Lwt_error.Infix in + Counter.inc + (Metrics.reads t.config.Config.id) + (float_of_int + @@ List.fold_left ( + ) 0 + @@ List.map Cstruct.length bufs + ) ; + let sectors_per_cluster = (1 lsl t.cluster_bits) / t.sector_size in + let client = Locks.Client.make describe_fn in + let cluster_size = 1L <| t.cluster_bits in + let byte = + Int64.(mul sector (of_int t.info.Mirage_block.sector_size)) + in + Error.Lwt_error.List.map_p + (fun (byte, buf) -> + let vaddr = Virtual.make ~cluster_bits:t.cluster_bits byte in + ClusterIO.walk_readonly ~client t vaddr >>= function + | None -> + Cstruct.memset buf 0 ; + Lwt.return (Ok None) (* no work to do *) + | Some (offset', l1_lock, l2_lock) -> + let sector = + Physical.sector ~sector_size:t.sector_size offset' + in + Lwt.return + (Ok + (Some + { + sector + ; bufs= [buf] + ; metadata_locks= [l1_lock; l2_lock] + } + ) + ) + ) + (chop_into_aligned cluster_size byte bufs) + >>= fun work -> + let work' = + List.rev + @@ List.fold_left + (fun acc x -> match x with None -> acc | Some y -> y :: acc) + [] work + in + (* work may contain contiguous items *) + let work = coalesce_into_adjacent t.sector_size work' in + let open Lwt.Infix in + iter_p + (fun work -> + let first = + Cluster.of_int64 + Int64.(div work.sector (of_int sectors_per_cluster)) + in + let last_sector = + Int64.( + add work.sector (of_int (Cstructs.len work.bufs / t.sector_size)) + ) + in + let last_sector' = + Int64.(round_up last_sector (of_int sectors_per_cluster)) + in + let last = + Cluster.of_int64 + Int64.(div last_sector' (of_int sectors_per_cluster)) + in + Lwt.finalize + (fun () -> + Locks.Read.with_locks t.locks ~first ~last (fun () -> + Lwt.catch + (fun () -> B.read t.base work.sector work.bufs) + (fun e -> + Log.err (fun f -> + f "%s: low-level I/O exception %s" (describe_fn ()) + (Printexc.to_string e) + ) ; + Locks.Debug.dump_state t.locks ; + let cluster = + Cluster.of_int + (Int64.to_int work.sector / sectors_per_cluster) + in + Qcow_debug.check_references t.metadata t.cluster_map + ~cluster_bits:t.cluster_bits cluster + >>= fun _ -> + Cache.Debug.check_disk t.cache >>= fun _ -> Lwt.fail e + ) + ) + >>= function + | Error e -> + Lwt.return_error (adapt_error e) + | Ok () -> + Lwt.return (Ok ()) + ) + (fun () -> + List.iter Locks.unlock work.metadata_locks ; + Lwt.return_unit + ) + ) + work + >>= fun result -> + Locks.Debug.assert_no_locks_held client ; + Lwt.return result + ) + + let write t sector bufs = + let describe_fn () = + Printf.sprintf "write sector = %Ld length = %d" sector (Cstructs.len bufs) + in + if t.config.Config.read_only then + Lwt.return (Error `Is_read_only) + else + with_deadline t describe_fn time_30s (fun () -> + let open Lwt_write_error.Infix in + Counter.inc + (Metrics.writes t.config.Config.id) + (float_of_int + @@ List.fold_left ( + ) 0 + @@ List.map Cstruct.length bufs + ) ; + let cluster_size = 1L <| t.cluster_bits in + let client = Locks.Client.make describe_fn in + let sectors_per_cluster = (1 lsl t.cluster_bits) / t.sector_size in + let byte = + Int64.(mul sector (of_int t.info.Mirage_block.sector_size)) + in + Error.Lwt_error.List.map_p + (fun (byte, buf) -> + let vaddr = Virtual.make ~cluster_bits:t.cluster_bits byte in + ClusterIO.walk_readonly ~client t vaddr >>= function + | None -> + (* Only the first write to this area needs to allocate, so it's ok + to make this a little slower *) + Lwt.catch + (fun () -> + ClusterIO.walk_and_allocate ~client t vaddr + >>= fun (offset', l1_lock, l2_lock) -> + let sector = + Physical.sector ~sector_size:t.sector_size offset' + in + Lwt.return + (Ok + { + sector + ; bufs= [buf] + ; metadata_locks= [l1_lock; l2_lock] + } + ) + ) + (function + | Error.Duplicate_reference ((c, w), (c', w'), target) as + e -> + Log.err (fun f -> + f "Duplicate_reference during %s" (describe_fn ()) + ) ; + Qcow_debug.on_duplicate_reference t.metadata + t.cluster_map ~cluster_bits:t.cluster_bits (c, w) + (c', w') target + >>= fun () -> Lwt.fail e + | e -> + Lwt.fail e + ) + | Some (offset', l1_lock, l2_lock) -> + let sector = + Physical.sector ~sector_size:t.sector_size offset' + in + Lwt.return + (Ok + {sector; bufs= [buf]; metadata_locks= [l1_lock; l2_lock]} + ) + ) + (chop_into_aligned cluster_size byte bufs) + >>= fun work' -> + (let open Lwt.Infix in + if !DebugSetting.compact_mid_write then ( + Log.debug (fun f -> f "DebugSetting.compact_mid_write") ; + compact t () >>= fun _ -> Lwt.return (Ok ()) + ) else + Lwt.return (Ok ()) + ) + >>= fun () -> + (* work may contain contiguous items *) + let work = coalesce_into_adjacent t.sector_size work' in + let open Lwt.Infix in + iter_p + (fun work -> + let first = + Cluster.of_int64 + Int64.(div work.sector (of_int sectors_per_cluster)) + in + let last_sector = + Int64.( + add work.sector + (of_int (Cstructs.len work.bufs / t.sector_size)) + ) + in + let last_sector' = + Int64.(round_up last_sector (of_int sectors_per_cluster)) + in + let last = + Cluster.of_int64 + Int64.(div last_sector' (of_int sectors_per_cluster)) + in + Locks.Write.with_locks ~client t.locks ~first ~last (fun () -> + (* Cancel any in-progress move since the data will be stale *) + let rec loop n = + if n > last then + () + else ( + Qcow_cluster_map.cancel_move t.cluster_map n ; + loop (Cluster.succ n) + ) + in + loop first ; + Lwt.finalize + (fun () -> + Lwt.catch + (fun () -> + B.write t.base work.sector work.bufs >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> + Lwt.return (Ok ()) + ) + (fun e -> + Log.err (fun f -> + f "%s: low-level I/O exception %s" + (describe_fn ()) (Printexc.to_string e) + ) ; + Locks.Debug.dump_state t.locks ; + let cluster = + Cluster.of_int + (Int64.to_int work.sector / sectors_per_cluster) + in + Qcow_debug.check_references t.metadata t.cluster_map + ~cluster_bits:t.cluster_bits cluster + >>= fun _ -> + Cache.Debug.check_disk t.cache >>= fun _ -> Lwt.fail e + ) + ) + (fun () -> + List.iter Locks.unlock work.metadata_locks ; + Lwt.return_unit + ) + ) + ) + work + >>= fun result -> + Locks.Debug.assert_no_locks_held client ; + Lwt.return result + ) + + let seek_mapped t from = + let open Lwt_error.Infix in + let bytes = Int64.(mul from (of_int t.sector_size)) in + let int64s_per_cluster = 1L <| Int32.to_int t.h.Header.cluster_bits - 3 in + let rec scan_l1 a = + if a.Virtual.l1_index >= Int64.of_int32 t.h.Header.l1_size then + Lwt.return + (Ok Int64.(mul t.info.Mirage_block.size_sectors (of_int t.sector_size)) + ) + else + ClusterIO.find_mapped_l1_table t a.Virtual.l1_index >>= function + | None -> + Lwt.return + (Ok + Int64.( + mul t.info.Mirage_block.size_sectors (of_int t.sector_size) + ) + ) + | Some l1_index -> + let a = {a with Virtual.l1_index} in + ClusterIO.read_l1_table t a.Virtual.l1_index >>= fun (x, l1_lock) -> + Locks.unlock l1_lock ; + if Physical.to_bytes x = 0 then + scan_l1 + { + a with + Virtual.l1_index= Int64.succ a.Virtual.l1_index + ; l2_index= 0L + } + else + let rec scan_l2 a = + if a.Virtual.l2_index >= int64s_per_cluster then + scan_l1 + { + a with + Virtual.l1_index= Int64.succ a.Virtual.l1_index + ; l2_index= 0L + } + else + ClusterIO.read_l2_table t x a.Virtual.l2_index + >>= fun (x, l2_lock) -> + Locks.unlock l2_lock ; + if Physical.to_bytes x = 0 then + scan_l2 + {a with Virtual.l2_index= Int64.succ a.Virtual.l2_index} + else + Lwt.return + (Ok (Qcow_virtual.to_offset ~cluster_bits:t.cluster_bits a) + ) + in + scan_l2 a + in + scan_l1 (Virtual.make ~cluster_bits:t.cluster_bits bytes) >>= fun offset -> + let x = Int64.(div offset (of_int t.sector_size)) in + assert (x >= from) ; + Lwt.return (Ok x) + + let seek_unmapped t from = + let open Lwt_error.Infix in + let bytes = Int64.(mul from (of_int t.sector_size)) in + let int64s_per_cluster = 1L <| Int32.to_int t.h.Header.cluster_bits - 3 in + let rec scan_l1 a = + if a.Virtual.l1_index >= Int64.of_int32 t.h.Header.l1_size then + Lwt.return + (Ok Int64.(mul t.info.Mirage_block.size_sectors (of_int t.sector_size)) + ) + else + ClusterIO.read_l1_table t a.Virtual.l1_index >>= fun (x, l1_lock) -> + Locks.unlock l1_lock ; + if Physical.to_bytes x = 0 then + Lwt.return (Ok (Qcow_virtual.to_offset ~cluster_bits:t.cluster_bits a)) + else + let rec scan_l2 a = + if a.Virtual.l2_index >= int64s_per_cluster then + scan_l1 + { + a with + Virtual.l1_index= Int64.succ a.Virtual.l1_index + ; l2_index= 0L + } + else + ClusterIO.read_l2_table t x a.Virtual.l2_index + >>= fun (y, l2_lock) -> + Locks.unlock l2_lock ; + if Physical.to_bytes y = 0 then + Lwt.return + (Ok (Qcow_virtual.to_offset ~cluster_bits:t.cluster_bits a)) + else + scan_l2 {a with Virtual.l2_index= Int64.succ a.Virtual.l2_index} + in + scan_l2 a + in + scan_l1 (Virtual.make ~cluster_bits:t.cluster_bits bytes) >>= fun offset -> + let x = Int64.(div offset (of_int t.sector_size)) in + assert (x >= from) ; + Lwt.return (Ok x) + + let disconnect t = B.disconnect t.base + + let make config base h = + let open Lwt in + B.get_info base >>= fun base_info -> + (* The virtual disk has 512 byte sectors *) + let info' = + { + Mirage_block.read_write= false + ; sector_size= 512 + ; size_sectors= Int64.(div h.Header.size 512L) + } + in + (* We assume the backing device is resized dynamically so the + size is the address of the next cluster *) + let sector_size = base_info.Mirage_block.sector_size in + let cluster_bits = Int32.to_int h.Header.cluster_bits in + (* The first cluster is allocated after the L1 table *) + let cluster_size = 1L <| cluster_bits in + (* qemu-img will allocate a cluster by writing only a single sector to the end + of the file. We insist that the file is a whole number of clusters in size. *) + let sectors_per_cluster = + Int64.(div (1L <| cluster_bits) (of_int sector_size)) + in + let new_size_sectors = + Int64.round_up base_info.Mirage_block.size_sectors sectors_per_cluster + in + ( if new_size_sectors > base_info.Mirage_block.size_sectors then ( + Log.info (fun f -> + f "rounding up file to a whole number of clusters (= %Ld sectors)" + new_size_sectors + ) ; + B.resize base new_size_sectors >>= function + | Error _ -> + Lwt.fail_with "resizing file" + | Ok () -> + Lwt.return_unit + ) else + Lwt.return_unit + ) + >>= fun () -> + let locks = Locks.make () in + let read_cluster i = + let buf = malloc h in + let cluster = Cluster.to_int64 i in + let offset = cluster <| cluster_bits in + let sector = Int64.(div offset (of_int sector_size)) in + let open Lwt.Infix in + Lwt.catch + (fun () -> + B.read base sector [buf] >>= function + | Error _ -> + Lwt.fail_with "unknown error" + | Ok () -> + Lwt.return (Ok buf) + ) + (fun e -> + Log.err (fun f -> + f "read_cluster %Ld: low-level I/O exception %s" cluster + (Printexc.to_string e) + ) ; + Locks.Debug.dump_state locks ; + Lwt.fail e + ) + in + let write_cluster i buf = + if config.Config.read_only then + Lwt.return (Error `Is_read_only) + else + let cluster = Cluster.to_int64 i in + let offset = cluster <| cluster_bits in + let sector = Int64.(div offset (of_int sector_size)) in + Lwt.catch + (fun () -> + B.write base sector [buf] >>= function + | Error `Disconnected -> + Lwt.return (Error `Disconnected) + | Error `Is_read_only -> + Lwt.return (Error `Is_read_only) + | Error _ -> + Lwt.fail_with "unknown error" + | Ok () -> + Lwt.return (Ok ()) + ) + (fun e -> + Log.err (fun f -> + f "write_cluster %Ld: low-level I/O exception %s" cluster + (Printexc.to_string e) + ) ; + Locks.Debug.dump_state locks ; + Lwt.fail e + ) + in + let cache = Cache.create ~read_cluster ~write_cluster () in + let metadata = Metadata.make ~cache ~cluster_bits ~locks () in + let recycler = + Recycler.create ~base ~sector_size ~cluster_bits ~cache ~locks ~metadata + ~runtime_asserts:config.Config.runtime_asserts + in + let lazy_refcounts = + match h.Header.additional with + | Some {Header.lazy_refcounts= true; _} -> + true + | _ -> + false + in + let stats = Stats.zero in + let cluster_map = Qcow_cluster_map.zero in + let cluster_map_m = Lwt_mutex.create () in + let t' = + { + h + ; base + ; info= info' + ; config + ; locks + ; recycler + ; metadata + ; cache + ; sector_size + ; cluster_bits + ; lazy_refcounts + ; stats + ; cluster_map + ; cluster_map_m + } + in + Lwt_error.or_fail_with @@ make_cluster_map t' ~id:config.Config.id () + >>= fun cluster_map -> + if config.Config.runtime_asserts then + Qcow_cluster_map.Debug.assert_equal cluster_map cluster_map ; + (* An opened file may have junk at the end, which means that we would simultaneously + allocate from it (get_last_block + n) as well as erase and recycle it. + We should trim the file now so that it is safe to allocate from it as normal. + Normally when the file is expanded the blocks at the end are not considered to be + junk. *) + let last_block = Qcow_cluster_map.get_last_block cluster_map in + let size_clusters = Cluster.succ last_block in + let p = Physical.make (Cluster.to_int size_clusters lsl cluster_bits) in + let size_sectors = Physical.sector ~sector_size p in + ( if config.Config.read_only then + Lwt.return_unit + else + Lwt_write_error.or_fail_with @@ resize_base base sector_size None p + >>= fun () -> + Log.info (fun f -> + f "Resized file to %s clusters (%Ld sectors)" + (Cluster.to_string size_clusters) + size_sectors + ) ; + Qcow_cluster_map.resize cluster_map size_clusters ; + Lwt.return_unit + ) + >>= fun () -> + t'.cluster_map <- cluster_map ; + Metadata.set_cluster_map t'.metadata cluster_map ; + Recycler.set_cluster_map t'.recycler cluster_map ; + if config.Config.read_only then + Lwt.return t' + else ( + ( match config.Config.keep_erased with + | None -> + () + | Some sectors -> + let keep_erased = + Int64.(div (mul sectors (of_int sector_size)) cluster_size) + in + let compact_after_unmaps = + match config.Config.compact_after_unmaps with + | None -> + None + | Some sectors -> + Some Int64.(div (mul sectors (of_int sector_size)) cluster_size) + in + Recycler.start_background_thread t'.recycler ~keep_erased + ?compact_after_unmaps () + ) ; + ( if config.Config.discard && not lazy_refcounts then ( + Log.info (fun f -> + f + "discard requested and lazy_refcounts is disabled: erasing \ + refcount table and enabling lazy_refcounts" + ) ; + Lwt_error.or_fail_with @@ ClusterIO.Refcount.zero_all t' >>= fun () -> + let additional = + match h.Header.additional with + | Some h -> + {h with Header.lazy_refcounts= true} + | None -> + { + Header.dirty= true + ; corrupt= false + ; lazy_refcounts= true + ; autoclear_features= 0L + ; refcount_order= 4l + } + in + let extensions = [`Feature_name_table Header.Feature.understood] in + let h = {h with Header.additional= Some additional; extensions} in + Lwt_write_error.or_fail_with @@ update_header t' h >>= fun () -> + t'.lazy_refcounts <- true ; + Lwt.return_unit + ) else + Lwt.return_unit + ) + >>= fun () -> + Recycler.flush t'.recycler >>= function + | Error _ -> + Log.err (fun f -> f "initial flush failed") ; + Lwt.fail (Failure "initial flush failed") + | Ok () -> + Lwt.return t' + ) + + let connect ?(config = Config.default ()) base = + let open Lwt.Infix in + B.get_info base >>= fun base_info -> + let sector = + Cstruct.sub + Io_page.(to_cstruct (get 1)) + 0 base_info.Mirage_block.sector_size + in + B.read base 0L [sector] >>= function + | Error e -> + Format.kasprintf Lwt.fail_with "%a" B.pp_error e + | Ok () -> ( + match Header.read sector with + | Error (`Msg m) -> + Lwt.fail_with m + | Ok (h, _) -> + make config base h >>= fun t -> + let open Qcow_cluster_map in + let free = total_free t.cluster_map in + let used = total_used t.cluster_map in + Log.info (fun f -> + f "image has %Ld free sectors and %Ld used sectors" free used + ) ; + Lwt.return t + ) + + let check base = + let open Lwt.Infix in + let open Qcow_cluster_map in + Lwt.catch + (fun () -> + let config = Config.create ~read_only:true () in + connect ~config base >>= fun t -> + let free = total_free t.cluster_map in + let used = total_used t.cluster_map in + Lwt.return (Ok {free; used}) + ) + (function + | Reference_outside_file (src, dst) -> + Lwt.return (Error (`Reference_outside_file (src, dst))) + | Error.Duplicate_reference ((c, w), (c', w'), dst) -> + Lwt.return (Error (`Duplicate_reference ((c, w), (c', w'), dst))) + | e -> + Lwt.fail e + ) + + let resize t ~new_size:requested_size_bytes ?(ignore_data_loss = false) () = + if t.config.Config.read_only then + Lwt.return (Error `Is_read_only) + else + let existing_size = t.h.Header.size in + if existing_size > requested_size_bytes && not ignore_data_loss then + Lwt.return + (Error + (`Msg + (Printf.sprintf + "Requested resize would result in data loss: requested size \ + = %Ld but current size = %Ld" + requested_size_bytes existing_size + ) + ) + ) + else + let size = Int64.round_up requested_size_bytes 512L in + let l2_tables_required = + Header.l2_tables_required ~cluster_bits:t.cluster_bits size + in + (* Keep it simple for now by refusing resizes which would require us to + reallocate the L1 table. *) + let l2_entries_per_cluster = + 1L <| Int32.to_int t.h.Header.cluster_bits - 3 + in + let old_max_entries = + Int64.round_up + (Int64.of_int32 t.h.Header.l1_size) + l2_entries_per_cluster + in + let new_max_entries = + Int64.round_up l2_tables_required l2_entries_per_cluster + in + if new_max_entries > old_max_entries then + Lwt.return + (Error + (`Msg + "I don't know how to resize in the case where the L1 table \ + needs new clusters:" + ) + ) + else + update_header t + {t.h with Header.l1_size= Int64.to_int32 l2_tables_required; size} + + let zero = + let page = Io_page.(to_cstruct (get 1)) in + Cstruct.memset page 0 ; page + + let rec erase t ~sector ~n () = + let open Lwt_write_error.Infix in + if n <= 0L then + Lwt.return (Ok ()) + else + (* This could walk one cluster at a time instead of one sector at a time *) + let byte = Int64.(mul sector (of_int t.info.Mirage_block.sector_size)) in + let vaddr = Virtual.make ~cluster_bits:t.cluster_bits byte in + (ClusterIO.walk_readonly t vaddr >>= function + | None -> + (* Already zero, nothing to do *) + Lwt.return (Ok ()) + | Some (offset', l1_lock, l2_lock) -> + Lwt.finalize + (fun () -> + let base_sector, _ = + Physical.to_sector ~sector_size:t.sector_size offset' + in + t.stats.Stats.nr_erased <- Int64.succ t.stats.Stats.nr_erased ; + let open Lwt.Infix in + B.write t.base base_sector + [Cstruct.sub zero 0 t.info.Mirage_block.sector_size] + >>= adapt_write_error_result + ) + (fun () -> + Locks.unlock l1_lock ; Locks.unlock l2_lock ; Lwt.return_unit + ) + ) + >>= fun () -> erase t ~sector:(Int64.succ sector) ~n:(Int64.pred n) () + + let discard t ~sector ~n () = + let describe_fn () = Printf.sprintf "discard sector %Ld n %Ld" sector n in + with_deadline t describe_fn time_30s (fun () -> + let open Lwt_write_error.Infix in + ( if not t.config.Config.discard then ( + Log.err (fun f -> + f "discard called but feature not implemented in configuration" + ) ; + Lwt.fail (Failure "Unimplemented") + ) else + Lwt.return (Ok ()) + ) + >>= fun () -> + Counter.inc + (Metrics.discards t.config.Config.id) + Int64.(to_float @@ mul n @@ of_int t.sector_size) ; + let client = Locks.Client.make describe_fn in + + (* we can only discard whole clusters. We will explicitly zero non-cluster + aligned discards in order to satisfy RZAT *) + + (* round sector, n up to a cluster boundary *) + let sectors_per_cluster = + Int64.(div (1L <| t.cluster_bits) (of_int t.sector_size)) + in + let sector' = Int64.round_up sector sectors_per_cluster in + + (* we can only discard whole clusters. We will explicitly zero non-cluster + aligned discards in order to satisfy RZAT *) + let to_erase = min n (Int64.sub sector' sector) in + erase t ~sector ~n:to_erase () >>= fun () -> + let n' = Int64.sub n to_erase in + + let to_discard = Int64.round_down n' sectors_per_cluster in + ClusterIO.walk_and_deallocate ~client t sector' to_discard >>= fun () -> + erase t + ~sector:(Int64.add sector' to_discard) + ~n:(Int64.sub n' to_discard) () + ) + + let create base ~size ?(lazy_refcounts = true) ?(cluster_bits = 16) + ?(config = Config.default ()) () = + let version = `Three in + let backing_file_offset = 0L in + let backing_file_size = 0l in + let cluster_size = 1 lsl cluster_bits in + let crypt_method = `None in + (* qemu-img places the refcount table next in the file and only + qemu-img creates a tiny refcount table and grows it on demand *) + let refcount_table_offset = Physical.make cluster_size in + let refcount_table_clusters = 1 in + + (* qemu-img places the L1 table after the refcount table *) + let l1_table_offset = + Physical.make ((refcount_table_clusters + 1) lsl cluster_bits) + in + let l2_tables_required = Header.l2_tables_required ~cluster_bits size in + let nb_snapshots = 0l in + let snapshots_offset = 0L in + let additional = + Some + { + Header.dirty= lazy_refcounts + ; corrupt= false + ; lazy_refcounts + ; autoclear_features= 0L + ; refcount_order= 4l + } + in + let extensions = [`Feature_name_table Header.Feature.understood] in + let h = + { + Header.version + ; backing_file_offset + ; backing_file_size + ; cluster_bits= Int32.of_int cluster_bits + ; size + ; crypt_method + ; l1_size= Int64.to_int32 l2_tables_required + ; l1_table_offset + ; refcount_table_offset + ; refcount_table_clusters= Int32.of_int refcount_table_clusters + ; nb_snapshots + ; snapshots_offset + ; additional + ; extensions + } + in + (* Resize the underlying device to contain the header + refcount table + + l1 table. Future allocations will enlarge the file. *) + let l1_size_bytes = 8 * Int64.to_int l2_tables_required in + let next_free_byte = + Int.round_up + (Physical.to_bytes l1_table_offset + l1_size_bytes) + cluster_size + in + let open Lwt in + B.get_info base >>= fun base_info -> + (* Erase existing contents *) + let open Lwt_write_error.Infix in + resize_base base base_info.Mirage_block.sector_size None (Physical.make 0) + >>= fun () -> + let p = Physical.make next_free_byte in + resize_base base base_info.Mirage_block.sector_size None p >>= fun () -> + let open Lwt.Infix in + make config base h >>= fun t -> + let open Lwt_write_error.Infix in + update_header t h >>= fun () -> + (* Write an initial empty refcount table *) + let cluster = malloc t.h in + Cstruct.memset cluster 0 ; + let open Lwt.Infix in + B.write base + (Physical.sector ~sector_size:t.sector_size refcount_table_offset) + [cluster] + >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> ( + let open Lwt_write_error.Infix in + let next_cluster = next_free_byte / cluster_size in + let rec loop limit i = + if i = limit then + Lwt.return (Ok ()) + else + ClusterIO.Refcount.incr t (Cluster.of_int i) >>= fun () -> + loop limit (i + 1) + in + (* Increase the refcount of all header clusters i.e. those < next_free_cluster *) + loop next_cluster 0 >>= fun () -> + (* Write an initial empty L1 table *) + let open Lwt.Infix in + B.write base + (Physical.sector ~sector_size:t.sector_size l1_table_offset) + [cluster] + >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> ( + Recycler.flush t.recycler >>= function + | Error e -> + Lwt.return_error (adapt_write_error e) + | Ok () -> + Lwt.return (Ok t) + ) + ) + + let rebuild_refcount_table t = + let open Lwt_write_error.Infix in + let client = Locks.Client.make (fun () -> "rebuild_refcount_table") in + (* Disable lazy refcounts so we actually update the real refcounts *) + let lazy_refcounts = t.lazy_refcounts in + t.lazy_refcounts <- false ; + Log.info (fun f -> f "Zeroing existing refcount table") ; + ClusterIO.Refcount.zero_all ~client t >>= fun () -> + let cluster = + Physical.cluster ~cluster_bits:t.cluster_bits + t.h.Header.refcount_table_offset + in + let refcount_table_clusters = + Int32.to_int t.h.Header.refcount_table_clusters + in + let rec loop i = + if i >= refcount_table_clusters then + Lwt.return (Ok ()) + else + ClusterIO.Refcount.incr ~client t Cluster.(add cluster (of_int i)) + >>= fun () -> + (* If any of the table entries point to a block, increase its refcount too *) + Metadata.read ~client t.metadata + Cluster.(add cluster (of_int i)) + (fun c -> + let addresses = Metadata.Physical.of_contents c in + Lwt.return (Ok addresses) + ) + >>= fun addresses -> + let rec inner i = + if i >= Metadata.Physical.len addresses then + Lwt.return (Ok ()) + else + let addr = Metadata.Physical.get addresses i in + ( if addr <> Physical.unmapped then ( + let cluster' = + Physical.cluster ~cluster_bits:t.cluster_bits addr + in + Log.debug (fun f -> + f "Refcount cluster %s has reference to cluster %s" + (Cluster.to_string cluster) + (Cluster.to_string cluster') + ) ; + (* It might have been incremented already by a previous `incr` *) + ClusterIO.Refcount.read ~client t cluster' >>= function + | 0 -> + ClusterIO.Refcount.incr ~client t cluster' + | _ -> + Lwt.return (Ok ()) + ) else + Lwt.return (Ok ()) + ) + >>= fun () -> inner (i + 1) + in + inner 0 >>= fun () -> loop (i + 1) + in + Log.info (fun f -> f "Incrementing refcount of the refcount table clusters") ; + loop 0 >>= fun () -> + (* Increment the refcount of the header and L1 table *) + Log.info (fun f -> f "Incrementing refcount of the header") ; + ClusterIO.Refcount.incr ~client t Cluster.zero >>= fun () -> + let l1_table_clusters = + let refs_per_cluster = 1L <| t.cluster_bits - 3 in + Int64.( + to_int + @@ div + (round_up (of_int32 t.h.Header.l1_size) refs_per_cluster) + refs_per_cluster + ) + in + let l1_table_cluster = + Physical.cluster ~cluster_bits:t.cluster_bits t.h.Header.l1_table_offset + in + let rec loop i = + if i >= l1_table_clusters then + Lwt.return (Ok ()) + else + ClusterIO.Refcount.incr ~client t + Cluster.(add l1_table_cluster (of_int i)) + >>= fun () -> + (* Increment clusters of L1 tables *) + Metadata.read ~client t.metadata + Cluster.(add l1_table_cluster (of_int i)) + (fun c -> + let addresses = Metadata.Physical.of_contents c in + Lwt.return (Ok addresses) + ) + >>= fun addresses -> + let rec inner i = + if i >= Metadata.Physical.len addresses then + Lwt.return (Ok ()) + else + let addr = Metadata.Physical.get addresses i in + ( if addr <> Physical.unmapped then ( + let cluster' = + Physical.cluster ~cluster_bits:t.cluster_bits addr + in + Log.debug (fun f -> + f "L1 cluster %s has reference to L2 cluster %s" + (Cluster.to_string cluster) + (Cluster.to_string cluster') + ) ; + ClusterIO.Refcount.incr ~client t cluster' + ) else + Lwt.return (Ok ()) + ) + >>= fun () -> inner (i + 1) + in + inner 0 >>= fun () -> loop (i + 1) + in + Log.info (fun f -> + f "Incrementing refcount of the %Ls L1 table clusters starting at %s" + l1_table_clusters + (Cluster.to_string l1_table_cluster) + ) ; + loop 0 >>= fun () -> + (* Fold over the mapped data, incrementing refcounts along the way *) + let sectors_per_cluster = + Int64.(div (1L <| t.cluster_bits) (of_int t.sector_size)) + in + let rec loop sector = + if sector >= t.info.Mirage_block.size_sectors then + Lwt.return (Ok ()) + else + seek_mapped t sector >>= fun mapped_sector -> + if mapped_sector <> sector then + loop mapped_sector + else + ClusterIO.walk_readonly ~client t + (Virtual.make ~cluster_bits:t.cluster_bits + Int64.(mul (of_int t.info.Mirage_block.sector_size) mapped_sector) + ) + >>= function + | None -> + assert false + | Some (offset', l1_lock, l2_lock) -> + Locks.unlock l1_lock ; + Locks.unlock l2_lock ; + let cluster = + Physical.cluster ~cluster_bits:t.cluster_bits offset' + in + ClusterIO.Refcount.incr ~client t cluster >>= fun () -> + loop (Int64.add mapped_sector sectors_per_cluster) + in + Log.info (fun f -> f "Incrementing refcount of the data clusters") ; + loop 0L >>= fun () -> + (* Restore the original lazy_refcount setting *) + t.lazy_refcounts <- lazy_refcounts ; + Lwt.return (Ok ()) + + let flush t = + let open Lwt.Infix in + Recycler.flush t.recycler >>= adapt_write_error_result + + let header t = t.h + + module Debug = struct + let check_no_overlaps t = + let within = + Physical.within_cluster ~cluster_bits:t.cluster_bits + t.h.Header.l1_table_offset + in + assert (within = 0) ; + let within = + Physical.within_cluster ~cluster_bits:t.cluster_bits + t.h.Header.refcount_table_offset + in + assert (within = 0) ; + Lwt.return (Ok ()) + + let assert_no_leaked_blocks t = + Qcow_cluster_map.Debug.assert_no_leaked_blocks t.cluster_map + + let assert_cluster_map_in_sync t = + let open Lwt.Infix in + Lwt_error.or_fail_with @@ make_cluster_map t () >>= fun cluster_map -> + Qcow_cluster_map.Debug.assert_equal cluster_map t.cluster_map ; + Lwt.return_unit + + module Setting = DebugSetting + + let metadata_blocks t = + let clusters = Qcow_cluster_map.Debug.metadata_blocks t.cluster_map in + Qcow_types.Cluster.( + IntervalSet.( + fold (fun i acc -> + let x, y = + Interval. + ( to_int64 (x i) <| t.cluster_bits + , (* the last inclusive byte = next cluster start - 1 *) + Int64.pred (to_int64 (succ @@ y i) <| t.cluster_bits) + ) + in + Qcow_types.Int64.IntervalSet.(add (Interval.make x y) acc) + ) + ) + ) + clusters Qcow_types.Int64.IntervalSet.empty + end +end diff --git a/ocaml/qcow-tool/lib/qcow.mli b/ocaml/qcow-tool/lib/qcow.mli new file mode 100644 index 00000000000..1dac9453087 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow.mli @@ -0,0 +1,187 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +module Error = Qcow_error +module Header = Qcow_header +module Physical = Qcow_physical +module Int64 = Qcow_types.Int64 + +module Make (B : Qcow_s.RESIZABLE_BLOCK) (Time : Mirage_time.S) : sig + include Mirage_block.S + + module Config : sig + (** Runtime configuration of a device *) + type t = { + id: string (** unique name for prometheus metrics *) + ; discard: bool (** true if `discard` will be enabled at runtime *) + ; keep_erased: int64 option (** size of erased free pool in sectors *) + ; compact_after_unmaps: int64 option + (** automatically compact after n sectors are unmapped *) + ; check_on_connect: bool (** perform an integrity check on connect *) + ; runtime_asserts: bool (** check cluster invariants at runtime *) + ; read_only: bool (** guarantee to not modify the file *) + } + + val create : + ?id:string + -> ?discard:bool + -> ?keep_erased:int64 + -> ?compact_after_unmaps:int64 + -> ?check_on_connect:bool + -> ?runtime_asserts:bool + -> ?read_only:bool + -> unit + -> t + (** Customise the runtime behaviour, see [connect] or [create] *) + + val to_string : t -> string + (** Marshal a config into a string suitable for a command-line argument *) + + val of_string : string -> (t, [`Msg of string]) result + (** Parse the result of a previous [to_string] invocation *) + end + + module Stats : sig + (** Runtime statistics on a device *) + type t = { + mutable nr_erased: int64 (** number of sectors erased during discard *) + ; mutable nr_unmapped: int64 + (** number of sectors unmapped during discard *) + } + end + + val create : + B.t + -> size:int64 + -> ?lazy_refcounts:bool + -> ?cluster_bits:int + -> ?config:Config.t + -> unit + -> (t, write_error) result Lwt.t + (** [create block ~size ?lazy_refcounts ?cluster_bits ?config ()] initialises + a qcow-formatted image on [block] with virtual size [size] in bytes. + + By default the file will use lazy refcounts, but this can be overriden by supplying + [~lazy_refcounts:false]. By default the file will use 64KiB clusters (= 16 bits) + but this can be overridden by supplying [?cluster_bits]. Note the cluster size + must be greater than the sector size on the underlying block device. + + The [?config] argument does not affect the on-disk format but rather the + behaviour as seen from this client. *) + + val connect : ?config:Config.t -> B.t -> t Lwt.t + (** [connect ?config block] connects to an existing qcow-formatted image on + [block]. *) + + val resize : + t + -> new_size:int64 + -> ?ignore_data_loss:bool + -> unit + -> (unit, write_error) result Lwt.t + (** [resize block new_size_bytes ?ignore_data_loss] changes the size of the + qcow-formatted image to [new_size_bytes], rounded up to the next allocation + unit. This function will fail with an error if the new size would be + smaller than the old size as this would cause data loss, unless the argument + [?ignore_data_loss] is set to true. *) + + (** Summary of the compaction run *) + type compact_result = { + copied: int64 (** number of sectors copied *) + ; refs_updated: int64 (** number of cluster references updated *) + ; old_size: int64 (** previous size in sectors *) + ; new_size: int64 (** new size in sectors *) + } + + val compact : + t + -> ?progress_cb:(percent:int -> unit) + -> unit + -> (compact_result, write_error) result Lwt.t + (** [compact t ()] scans the disk for unused space and attempts to fill it + and shrink the file. This is useful if the underlying block device doesn't + support discard and we must emulate it. *) + + val discard : + t -> sector:int64 -> n:int64 -> unit -> (unit, write_error) result Lwt.t + (** [discard sector n] signals that the [n] sectors starting at [sector] + are no longer needed and the contents may be discarded. Note the contents + may not actually be deleted: this is not a "secure erase". *) + + val seek_unmapped : t -> int64 -> (int64, error) result Lwt.t + (** [seek_unmapped t start] returns the offset of the next "hole": a region + of the device which is guaranteed to be full of zeroes (typically + guaranteed because it is unmapped) *) + + val seek_mapped : t -> int64 -> (int64, error) result Lwt.t + (** [seek_mapped t start] returns the offset of the next region of the + device which may have data in it (typically this is the next mapped + region) *) + + val rebuild_refcount_table : t -> (unit, write_error) result Lwt.t + (** [rebuild_refcount_table t] rebuilds the refcount table from scratch. + Normally we won't update the refcount table live, for performance. *) + + type check_result = { + free: int64 (** unused sectors *) + ; used: int64 (** used sectors *) + } + + val check : + B.t + -> ( check_result + , [ Mirage_block.error + | `Reference_outside_file of int64 * int64 + | `Duplicate_reference of (int64 * int) * (int64 * int) * int64 + | `Msg of string ] + ) + result + Lwt.t + (** [check t] performs sanity checks of the file, looking for errors. + The error [`Reference_outside_file (src, dst)] means that at offset [src] + there is a reference to offset [dst] which is outside the file. + The error [`Duplicate_reference (ref1, ref2, target) means that references + at both [ref1] and [ref2] both point to the same [target] offset. *) + + val flush : t -> (unit, write_error) result Lwt.t + (** [flush t] flushes any outstanding buffered writes *) + + val header : t -> Header.t + (** Return a snapshot of the current header *) + + val to_config : t -> Config.t + (** [to_config t] returns the configuration of a device *) + + val get_stats : t -> Stats.t + (** [get_stats t] returns the runtime statistics of a device *) + + module Debug : sig + val check_no_overlaps : t -> (unit, write_error) result Lwt.t + + val assert_no_leaked_blocks : t -> unit + + val assert_cluster_map_in_sync : t -> unit Lwt.t + + module Setting : sig + val compact_mid_write : bool ref + (** true means to trigger a compact part-way through a write to check that + the write completes properly after the compact *) + end + + val metadata_blocks : t -> Int64.IntervalSet.t + (** Return the set of physical disk offsets containing metadata *) + end +end diff --git a/ocaml/qcow-tool/lib/qcow.mllib b/ocaml/qcow-tool/lib/qcow.mllib new file mode 100644 index 00000000000..111c463217c --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow.mllib @@ -0,0 +1,23 @@ +Qcow_config +Qcow_header +Qcow_error +Qcow_types +Qcow_int +Qcow_int64 +Qcow_word_size +Qcow_virtual +Qcow_physical +Qcow_s +Qcow_diet +Qcow_bitmap +Qcow_rwlock +Qcow_cache +Qcow_locks +Qcow_cluster_map +Qcow_padded +Qcow_cstructs +Qcow_recycler +Qcow_metadata +Qcow_block_cache +Qcow_debug +Qcow diff --git a/ocaml/qcow-tool/lib/qcow_bitmap.ml b/ocaml/qcow-tool/lib/qcow_bitmap.ml new file mode 100644 index 00000000000..33eab2b1310 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_bitmap.ml @@ -0,0 +1,226 @@ +(* + * Copyright (C) 2016 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +type t = {mutable buf: Cstruct.t; mutable len: int; max_len: int} + +type elt = int64 + +type interval = elt * elt + +let make_empty ~initial_size:len ~maximum_size:max_len = + let bytes_required = (len + 7) / 8 in + let buf = Cstruct.create bytes_required in + Cstruct.memset buf 0 ; {buf; len; max_len} + +let make_full ~initial_size:len ~maximum_size:max_len = + let bytes_required = (len + 7) / 8 in + let buf = Cstruct.create bytes_required in + Cstruct.memset buf 0xff ; {buf; len; max_len} + +let copy t = + let bytes_required = Cstruct.length t.buf in + let buf = Cstruct.create bytes_required in + Cstruct.blit t.buf 0 buf 0 bytes_required ; + let len = t.len in + let max_len = t.max_len in + {buf; len; max_len} + +let increase t n = + assert (n < t.max_len) ; + let rec double len = + if n >= len then double (min t.max_len (len * 2)) else len + in + let len = double t.len in + assert (len <= t.max_len) ; + assert (len > n) ; + let bytes_required = (len + 7) / 8 in + let buf = Cstruct.create bytes_required in + Cstruct.memset buf 0 ; + Cstruct.blit t.buf 0 buf 0 (Cstruct.length t.buf) ; + t.buf <- buf ; + t.len <- len + +let set' t n v = + if n >= t.max_len then + invalid_arg + (Printf.sprintf "Qcow_bitmap.set %d >= maximum_size %d" n t.max_len) ; + if n >= t.len then increase t n ; + let i = n / 8 in + let byte = Cstruct.get_uint8 t.buf i in + let byte' = + if v then + byte lor (1 lsl (n mod 8)) + else + byte land lnot (1 lsl (n mod 8)) + in + Cstruct.set_uint8 t.buf i byte' + +let get' t n = + if n >= t.len then + invalid_arg (Printf.sprintf "Qcow_bitmap.get %d >= %d" n t.len) ; + let i = n / 8 in + let byte = Cstruct.get_uint8 t.buf i in + byte land (1 lsl (n mod 8)) <> 0 + +module Interval = struct + let make x y = + if x > y then invalid_arg "Interval.make" ; + (x, y) + + let x = fst + + let y = snd +end + +let add (a, b) t = + for i = Int64.to_int a to Int64.to_int b do + set' t i true + done + +let remove (a, b) t = + for i = Int64.to_int a to Int64.to_int b do + set' t i false + done + +let min_elt t = + let rec loop from = if get' t from then from else loop (from + 1) in + try Int64.of_int @@ loop 0 with _ -> raise Not_found + +(* fold over the maximal contiguous intervals *) +let fold f t acc = + let rec loop acc from = + (* find a true element *) + let rec find from v = if get' t from = v then from else find (from + 1) v in + match find from true with + | exception Invalid_argument _ -> + (* there are no more *) + acc + | a -> + (* find a false element, up to the end of the set *) + let b = + match find a false with + | b -> + b + | exception Invalid_argument _ -> + t.len + in + let acc = f (Int64.of_int a, Int64.of_int (b - 1)) acc in + loop acc b + in + loop acc 0 + +(* fold over the maximal contiguous intervals *) +let fold_s f t acc = + let open Lwt.Infix in + let rec loop acc from = + (* find a true element *) + let rec find from v = if get' t from = v then from else find (from + 1) v in + match find from true with + | exception Invalid_argument _ -> + (* there are no more *) + Lwt.return acc + | a -> + (* find a false element, up to the end of the set *) + let b = + match find a false with + | b -> + b + | exception Invalid_argument _ -> + t.len + in + f (Int64.of_int a, Int64.of_int (b - 1)) acc >>= fun acc -> loop acc b + in + loop acc 0 + +(* fold over individual elements *) +let fold_individual f t acc = + let range (from, upto) acc = + let rec loop acc x = + if x = Int64.succ upto then acc else loop (f x acc) (Int64.succ x) + in + loop acc from + in + fold range t acc + +let elements t = fold_individual (fun x acc -> x :: acc) t [] |> List.rev + +let to_string t = + fold (fun (a, b) acc -> Printf.sprintf "%Ld - %Ld\n" a b :: acc) t [] + |> String.concat ", " + +module Int = struct + type t = int + + let compare (x : t) (y : t) = Stdlib.compare x y +end + +module IntSet = Set.Make (Int) + +module Test = struct + let make_random n m = + let diet = make_empty ~initial_size:n ~maximum_size:n in + let rec loop set = function + | 0 -> + (set, diet) + | m -> + let r = Random.int n in + let i = Interval.make (Int64.of_int r) (Int64.of_int r) in + let set, () = + if Random.bool () then + (IntSet.add r set, add i diet) + else + (IntSet.remove r set, remove i diet) + in + loop set (m - 1) + in + loop IntSet.empty m + + let check_equals set diet = + let set' = IntSet.elements set |> List.map Int64.of_int in + let diet' = elements diet in + if set' <> diet' then + (* + Printf.fprintf stderr "Set contains: [ %s ]\n" @@ set_to_string set; + Printf.fprintf stderr "Diet contains: [ %s ]\n" @@ diet_to_string diet; + *) + failwith "check_equals" + + let test_adds () = + for _ = 1 to 1000 do + let set, diet = make_random 1000 1000 in + check_equals set diet + done + + let test_add_1 () = + let t = make_empty ~initial_size:10 ~maximum_size:10 in + add (3L, 3L) t ; + add (3L, 4L) t ; + assert (elements t = [3L; 4L]) + + let test_remove_1 () = + let t = make_empty ~initial_size:10 ~maximum_size:10 in + add (7L, 8L) t ; + remove (6L, 7L) t ; + assert (elements t = [8L]) + + let all = + [ + ("adding an element to the right", test_add_1) + ; ("removing an element on the left", test_remove_1) + ; ("adding and removing elements acts like a Set", test_adds) + ] +end diff --git a/ocaml/qcow-tool/lib/qcow_bitmap.mli b/ocaml/qcow-tool/lib/qcow_bitmap.mli new file mode 100644 index 00000000000..8902f9f974c --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_bitmap.mli @@ -0,0 +1,71 @@ +(* + * Copyright (C) 2016 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +(** The type of the set elements *) +type elt = int64 + +(** An interval: a range (x, y) of set values where all the elements from + x to y inclusive are in the set *) +type interval + +module Interval : sig + val make : elt -> elt -> interval + (** [make first last] construct an interval describing all the elements from + [first] to [last] inclusive. *) + + val x : interval -> elt + (** the starting element of the interval *) + + val y : interval -> elt + (** the ending element of the interval *) +end + +(** The type of sets *) +type t + +val make_empty : initial_size:int -> maximum_size:int -> t +(** [make_empty n] creates a set of [initial_size] which can be resized up to + [maximum size], initially empty *) + +val make_full : initial_size:int -> maximum_size:int -> t +(** [make_full n] creates a set of [initial_size] which can be resized up to + [maximum size], initially full *) + +val copy : t -> t +(** [copy t] returns a duplicate of [t] *) + +val fold : (interval -> 'a -> 'a) -> t -> 'a -> 'a +(** [fold f t acc] folds [f] across all the intervals in [t] *) + +val fold_s : (interval -> 'a -> 'a Lwt.t) -> t -> 'a -> 'a Lwt.t +(** [fold_s f t acc] folds [f] across all the intervals in [t] *) + +val add : interval -> t -> unit +(** [add interval t] adds the [interval] to [t] in-place *) + +val remove : interval -> t -> unit +(** [remove interval t] removes the [interval] from [t] in-place *) + +val min_elt : t -> elt +(** [min_elt t] returns the smallest element, or raises [Not_found] if the set + is empty. *) + +val to_string : t -> string + +module Test : sig + val all : (string * (unit -> unit)) list +end diff --git a/ocaml/qcow-tool/lib/qcow_block_cache.ml b/ocaml/qcow-tool/lib/qcow_block_cache.ml new file mode 100644 index 00000000000..c0e2a01983f --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_block_cache.ml @@ -0,0 +1,310 @@ +(* + * Copyright (C) 2017 Docker Inc + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +let src = + let src = Logs.Src.create "qcow" ~doc:"qcow2-formatted BLOCK device" in + Logs.Src.set_level src (Some Logs.Info) ; + src + +module Log = (val Logs.src_log src : Logs.LOG) + +let kib = 1024L + +let mib = Int64.mul kib 1024L + +open Qcow_types +module Cstructs = Qcow_cstructs + +module RangeLocks = struct + (** A set of exclusively locked intervals *) + type t = {mutable locked: Int64.IntervalSet.t; c: unit Lwt_condition.t} + + let create () = + let locked = Int64.IntervalSet.empty in + let c = Lwt_condition.create () in + {locked; c} + + let with_lock t i f = + let open Lwt.Infix in + let set = Int64.IntervalSet.(add i empty) in + let rec get_lock () = + if Int64.IntervalSet.(is_empty @@ inter t.locked set) then ( + t.locked <- Int64.IntervalSet.(union t.locked set) ; + Lwt.return_unit + ) else + Lwt_condition.wait t.c >>= fun () -> get_lock () + in + let put_lock () = + t.locked <- Int64.IntervalSet.(diff t.locked set) ; + Lwt.return_unit + in + get_lock () >>= fun () -> Lwt.finalize f put_lock +end + +module Make (B : Qcow_s.RESIZABLE_BLOCK) = struct + type error = B.error + + type write_error = B.write_error + + let pp_error = B.pp_error + + let pp_write_error = B.pp_write_error + + type t = { + base: B.t + ; mutable info: Mirage_block.info + ; sector_size: int + ; max_size_bytes: int64 + ; mutable in_cache: Int64.IntervalSet.t + ; mutable zeros: Int64.IntervalSet.t + ; mutable cache: Cstruct.t Int64.Map.t + ; locks: RangeLocks.t + ; mutable disconnect_request: bool + ; disconnect_m: Lwt_mutex.t + ; write_back_m: Lwt_mutex.t + ; zero: Cstruct.t + } + + let get_info t = Lwt.return t.info + + let lazy_write_back t = + let open Lwt.Infix in + Lwt_mutex.with_lock t.write_back_m (fun () -> + Log.debug (fun f -> + f "lazy_write_back cached sectors = %Ld zeros = %Ld" + (Int64.IntervalSet.cardinal t.in_cache) + (Int64.IntervalSet.cardinal t.zeros) + ) ; + assert (Int64.IntervalSet.(is_empty @@ inter t.in_cache t.zeros)) ; + (* coalesce known-zeros together with data blocks *) + let all = Int64.IntervalSet.union t.in_cache t.zeros in + Int64.IntervalSet.fold_s + (fun i err -> + match err with + | Error e -> + Lwt.return (Error e) + | Ok () -> + RangeLocks.with_lock t.locks i (fun () -> + let x, y = Int64.IntervalSet.Interval.(x i, y i) in + let mib = Int64.(div 1048576L (of_int t.sector_size)) in + (* split the interval into 1MiB chunks *) + let rec loop x y = + if x > y then + Lwt.return (Ok ()) + else + let y' = min (Int64.add x mib) y in + let rec bufs acc sector last = + if sector > last then + List.rev acc + else + let buf = + if Int64.Map.mem sector t.cache then ( + let buf = Int64.Map.find sector t.cache in + t.in_cache <- + Int64.IntervalSet.remove i t.in_cache ; + t.zeros <- Int64.IntervalSet.remove i t.zeros ; + t.cache <- Int64.Map.remove sector t.cache ; + buf + ) else + t.zero + in + bufs (buf :: acc) (Int64.succ sector) last + in + let bufs = bufs [] x y' in + B.write t.base x bufs >>= function + | Error e -> + Lwt.return (Error e) + | Ok () -> + loop (Int64.succ y') y + in + loop x y + ) + ) + all (Ok ()) + ) + + let flush t = + let open Lwt.Infix in + lazy_write_back t >>= function + | Error e -> + Lwt.return (Error e) + | Ok () -> + B.flush t.base + + let connect ?(max_size_bytes = Int64.mul 100L mib) base = + let open Lwt.Infix in + B.get_info base >>= fun info -> + let sector_size = info.Mirage_block.sector_size in + let in_cache = Int64.IntervalSet.empty in + let zeros = Int64.IntervalSet.empty in + let cache = Int64.Map.empty in + let locks = RangeLocks.create () in + let disconnect_request = false in + let disconnect_m = Lwt_mutex.create () in + let write_back_m = Lwt_mutex.create () in + let zero = Cstruct.create sector_size in + Cstruct.memset zero 0 ; + let t = + { + base + ; info + ; sector_size + ; max_size_bytes + ; in_cache + ; cache + ; zeros + ; locks + ; disconnect_request + ; disconnect_m + ; write_back_m + ; zero + } + in + Lwt.return t + + let disconnect t = + let open Lwt.Infix in + Lwt_mutex.with_lock t.disconnect_m (fun () -> + t.disconnect_request <- true ; + Lwt.return_unit + ) + >>= fun () -> + (* There can be no more in-progress writes *) + flush t >>= fun _ -> B.disconnect t.base + + (* Call [f sector buf] for every sector from [start] up to the length of [bufs] *) + let rec per_sector sector_size start bufs f = + match bufs with + | [] -> + Lwt.return (Ok ()) + | b :: bs -> ( + let open Lwt.Infix in + let rec loop sector remaining = + if Cstruct.length remaining = 0 then + Lwt.return (Ok sector) + else ( + assert (Cstruct.length remaining >= sector_size) ; + let first = Cstruct.sub remaining 0 sector_size in + f sector first >>= function + | Error e -> + Lwt.return (Error e) + | Ok () -> + loop (Int64.succ sector) (Cstruct.shift remaining sector_size) + ) + in + loop start b >>= function + | Error e -> + Lwt.return (Error e) + | Ok start' -> + per_sector sector_size start' bs f + ) + + let read t start bufs = + let len = Int64.of_int @@ Cstructs.len bufs in + let i = + Int64.IntervalSet.Interval.make start + Int64.(pred @@ add start (div len (of_int t.sector_size))) + in + let set = Int64.IntervalSet.(add i empty) in + if t.disconnect_request then + Lwt.return (Error `Disconnected) + else + RangeLocks.with_lock t.locks i (fun () -> + if Int64.IntervalSet.(is_empty @@ inter t.in_cache set) then + B.read t.base start bufs (* consider adding it to cache *) + else + per_sector t.sector_size start bufs (fun sector buf -> + if Int64.Map.mem sector t.cache then ( + let from_cache = Int64.Map.find sector t.cache in + Cstruct.blit from_cache 0 buf 0 t.sector_size ; + Lwt.return (Ok ()) + ) else + B.read t.base sector [buf] + ) + ) + + let write t start bufs = + let open Lwt.Infix in + let len = Int64.of_int @@ Cstructs.len bufs in + let current_size_bytes = + Int64.(mul (IntervalSet.cardinal t.in_cache) (of_int t.sector_size)) + in + ( if Int64.(add current_size_bytes len) > t.max_size_bytes then + lazy_write_back t + else + Lwt.return (Ok ()) + ) + >>= function + | Error e -> + Lwt.return (Error e) + | Ok () -> + let i = + Int64.IntervalSet.Interval.make start + Int64.(pred @@ add start (div len (of_int t.sector_size))) + in + (* Prevent new writes entering the cache after the disconnect has started *) + Lwt_mutex.with_lock t.disconnect_m (fun () -> + if t.disconnect_request then + Lwt.return (Error `Disconnected) + else + RangeLocks.with_lock t.locks i (fun () -> + per_sector t.sector_size start bufs (fun sector buf -> + assert (Cstruct.length buf = t.sector_size) ; + if not (Int64.Map.mem sector t.cache) then ( + t.in_cache <- Int64.IntervalSet.(add i t.in_cache) ; + t.zeros <- Int64.IntervalSet.(remove i t.zeros) + ) ; + t.cache <- Int64.Map.add sector buf t.cache ; + Lwt.return (Ok ()) + ) + ) + ) + + let resize t new_size = + let open Lwt.Infix in + B.resize t.base new_size >>= function + | Error e -> + Lwt.return (Error e) + | Ok () -> + (* If the file has become smaller, drop cached blocks beyond the new file + size *) + if new_size < t.info.Mirage_block.size_sectors then ( + let still_ok, to_drop = + Int64.Map.partition (fun sector _ -> sector < new_size) t.cache + in + let to_drop' = + Int64.Map.fold + (fun sector _ set -> + let i = Int64.IntervalSet.Interval.make sector sector in + Int64.IntervalSet.(add i set) + ) + to_drop Int64.IntervalSet.empty + in + t.cache <- still_ok ; + t.in_cache <- Int64.IntervalSet.diff t.in_cache to_drop' + ) ; + (* If the file has become bigger, we know the new blocks contain zeroes *) + ( if new_size > t.info.Mirage_block.size_sectors then + let i = + Int64.IntervalSet.Interval.make t.info.Mirage_block.size_sectors + (Int64.pred new_size) + in + t.zeros <- Int64.IntervalSet.add i t.zeros + ) ; + t.info <- {t.info with Mirage_block.size_sectors= new_size} ; + Lwt.return (Ok ()) +end diff --git a/ocaml/qcow-tool/lib/qcow_block_cache.mli b/ocaml/qcow-tool/lib/qcow_block_cache.mli new file mode 100644 index 00000000000..9f4ea899b72 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_block_cache.mli @@ -0,0 +1,25 @@ +(* + * Copyright (C) 2017 Docker Inc + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +module Make (B : Qcow_s.RESIZABLE_BLOCK) : sig + include Qcow_s.RESIZABLE_BLOCK + + val connect : ?max_size_bytes:int64 -> B.t -> t Lwt.t + (** [connect ?max_size_bytes b] constructs a cache over [b] with a maximum + memory footprint of [max_size_bytes]. Writes are heavily cached and only + written to disk on a flush, disconnect or when out of space. *) +end diff --git a/ocaml/qcow-tool/lib/qcow_cache.ml b/ocaml/qcow-tool/lib/qcow_cache.ml new file mode 100644 index 00000000000..29be21ca7bd --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_cache.ml @@ -0,0 +1,136 @@ +(* + * Copyright (C) 2017 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Qcow_types + +let src = + let src = Logs.Src.create "qcow" ~doc:"qcow2-formatted BLOCK device" in + Logs.Src.set_level src (Some Logs.Info) ; + src + +module Log = (val Logs.src_log src : Logs.LOG) + +type t = { + read_cluster: Cluster.t -> (Cstruct.t, Mirage_block.error) result Lwt.t + ; write_cluster: + Cluster.t -> Cstruct.t -> (unit, Mirage_block.write_error) result Lwt.t + ; mutable clusters: Cstruct.t Cluster.Map.t +} + +let create ~read_cluster ~write_cluster () = + let clusters = Cluster.Map.empty in + {read_cluster; write_cluster; clusters} + +let read t cluster = + if Cluster.Map.mem cluster t.clusters then + let data = Cluster.Map.find cluster t.clusters in + Lwt.return (Ok data) + else + let open Lwt.Infix in + t.read_cluster cluster >>= function + | Error e -> + Lwt.return (Error e) + | Ok data -> + t.clusters <- Cluster.Map.add cluster data t.clusters ; + Lwt.return (Ok data) + +let write t cluster data = + if not (Cluster.Map.mem cluster t.clusters) then ( + Log.err (fun f -> + f + "Cache.write %s: cluster is nolonger in cache, so update will be \ + dropped" + (Cluster.to_string cluster) + ) ; + assert false + ) ; + t.clusters <- Cluster.Map.add cluster data t.clusters ; + t.write_cluster cluster data + +let remove t cluster = + if Cluster.Map.mem cluster t.clusters then + Printf.fprintf stderr "Dropping cache for cluster %s\n" + (Cluster.to_string cluster) ; + t.clusters <- Cluster.Map.remove cluster t.clusters + +let resize t new_size_clusters = + let to_keep, to_drop = + Cluster.Map.partition + (fun cluster _ -> cluster < new_size_clusters) + t.clusters + in + t.clusters <- to_keep ; + if not (Cluster.Map.is_empty to_drop) then + Log.info (fun f -> + f "After file resize dropping cached clusters: %s" + (String.concat ", " + @@ List.map Cluster.to_string + @@ List.map fst + @@ Cluster.Map.bindings to_drop + ) + ) + +module Debug = struct + let assert_not_cached t cluster = + if Cluster.Map.mem cluster t.clusters then ( + Printf.fprintf stderr "Cluster %s still in the metadata cache\n" + (Cluster.to_string cluster) ; + assert false + ) + + let all_cached_clusters t = + Cluster.Map.fold + (fun cluster _ set -> + Cluster.IntervalSet.(add (Interval.make cluster cluster) set) + ) + t.clusters Cluster.IntervalSet.empty + + let check_disk t = + let open Lwt.Infix in + let rec loop = function + | [] -> + Lwt.return (Ok ()) + | (cluster, expected) :: rest -> ( + (t.read_cluster cluster >>= function + | Error e -> + Lwt.return (Error e) + | Ok data -> + if not (Cstruct.equal expected data) then ( + Log.err (fun f -> + f "Cache for cluster %s disagrees with disk" + (Cluster.to_string cluster) + ) ; + Log.err (fun f -> f "Cached:") ; + let buffer = Buffer.create 65536 in + Cstruct.hexdump_to_buffer buffer expected ; + Log.err (fun f -> f "%s" (Buffer.contents buffer)) ; + let buffer = Buffer.create 65536 in + Cstruct.hexdump_to_buffer buffer data ; + Log.err (fun f -> f "On disk:") ; + Log.err (fun f -> f "%s" (Buffer.contents buffer)) ; + Lwt.return (Ok ()) + ) else + Lwt.return (Ok ()) + ) + >>= function + | Error e -> + Lwt.return (Error e) + | Ok () -> + loop rest + ) + in + loop (Cluster.Map.bindings t.clusters) +end diff --git a/ocaml/qcow-tool/lib/qcow_cache.mli b/ocaml/qcow-tool/lib/qcow_cache.mli new file mode 100644 index 00000000000..51a56c573fb --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_cache.mli @@ -0,0 +1,50 @@ +(* + * Copyright (C) 2017 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Qcow_types + +(** A cache of clusters *) +type t + +val create : + read_cluster:(Cluster.t -> (Cstruct.t, Mirage_block.error) result Lwt.t) + -> write_cluster: + (Cluster.t -> Cstruct.t -> (unit, Mirage_block.write_error) result Lwt.t) + -> unit + -> t +(** Create a cache of clusters, given the read/write functions *) + +val read : t -> Cluster.t -> (Cstruct.t, Mirage_block.error) result Lwt.t +(** [read t cluster] returns the data in [cluster] *) + +val write : + t -> Cluster.t -> Cstruct.t -> (unit, Mirage_block.write_error) result Lwt.t +(** [write t cluster data] writes [data] to [cluster] *) + +val remove : t -> Cluster.t -> unit +(** [remove t cluster] drops any cache associated with [cluster] *) + +val resize : t -> Cluster.t -> unit +(** [resize t new_size_clusters] drops any cache entries which are beyond the new + file size. *) + +module Debug : sig + val assert_not_cached : t -> Cluster.t -> unit + + val all_cached_clusters : t -> Cluster.IntervalSet.t + + val check_disk : t -> (unit, Mirage_block.error) result Lwt.t +end diff --git a/ocaml/qcow-tool/lib/qcow_cluster_map.ml b/ocaml/qcow-tool/lib/qcow_cluster_map.ml new file mode 100644 index 00000000000..47f92923cb7 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_cluster_map.ml @@ -0,0 +1,1036 @@ +(* + * Copyright (C) 2016 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +let src = + let src = Logs.Src.create "qcow" ~doc:"qcow2-formatted BLOCK device" in + Logs.Src.set_level src (Some Logs.Info) ; + src + +module Log = (val Logs.src_log src : Logs.LOG) + +open Qcow_types +open Prometheus + +module Metrics = struct + let namespace = "Mirage" + + let subsystem = "qcow" + + let label_name = "id" + + let used = + let help = "Number of clusters containing user data" in + Gauge.v_label ~label_name ~help ~namespace ~subsystem "used" + + let junk = + let help = "Number of junk clusters created" in + Counter.v_label ~label_name ~help ~namespace ~subsystem "junk" + + let erased = + let help = "Number of clusters erased" in + Counter.v_label ~label_name ~help ~namespace ~subsystem "erased" + + let available = + let help = "Number of clusters made available for reallocation" in + Counter.v_label ~label_name ~help ~namespace ~subsystem "available" + + let roots = + let help = "Number of GC root clusters registered" in + Counter.v_label ~label_name ~help ~namespace ~subsystem "roots" + + let copied = + let help = "Number of cluster copies completed" in + Counter.v_label ~label_name ~help ~namespace ~subsystem "copied" + + let flushed = + let help = "Number of cluster copies flushed" in + Counter.v_label ~label_name ~help ~namespace ~subsystem "flushed" + + let referenced = + let help = "Number of references updated" in + Counter.v_label ~label_name ~help ~namespace ~subsystem "referenced" + + let size = + let help = "File size in clusters" in + Gauge.v_label ~label_name ~help ~namespace ~subsystem "size" +end + +module Cache = Qcow_cache +module Error = Qcow_error + +type reference = Cluster.t * int + +let string_of_reference (c, w) = Cluster.to_string c ^ ":" ^ string_of_int w + +type move_state = Copying | Copied | Flushed | Referenced + +let string_of_move_state = function + | Copying -> + "Copying" + | Copied -> + "Copied" + | Flushed -> + "Flushed" + | Referenced -> + "Referenced" + +module Move = struct + type t = {src: Cluster.t; dst: Cluster.t} + + let to_string t = + Printf.sprintf "%s -> %s" (Cluster.to_string t.src) (Cluster.to_string t.dst) +end + +type move = {move: Move.t; state: move_state} + +let string_of_move m = + let state = string_of_move_state m.state in + Printf.sprintf "%s %s" (Move.to_string m.move) state + +type t = { + mutable all: Cluster.IntervalSet.t (** Represents the whole file *) + ; mutable junk: Cluster.IntervalSet.t + (** These are unused clusters containing arbitrary data. They must be erased + or fully overwritten and then flushed in order to be safely reused. *) + ; mutable erased: Cluster.IntervalSet.t + ; (* These are clusters which have been erased, but not flushed. They will become + available for reallocation on the next flush. *) + mutable available: Cluster.IntervalSet.t + (** These clusters are available for immediate reuse; after a crash they are + guaranteed to be full of zeroes. *) + ; mutable roots: Cluster.IntervalSet.t + ; (* map from physical cluster to the physical cluster + offset of the reference. + When a block is moved, this reference must be updated. *) + mutable copies: Cluster.IntervalSet.t + (** Clusters which contain clusters as part of a move *) + ; mutable moves: move Cluster.Map.t + (** The state of in-progress block moves, indexed by the source cluster *) + ; mutable refs: reference Cluster.Map.t + ; first_movable_cluster: Cluster.t + ; cache: Cache.t + ; c: unit Lwt_condition.t + (** Signalled when any of the junk/erased sets change or when references need + to be rewritten to kick the background recycling thread. *) + ; runtime_asserts: bool + (** Check leak and sharing invariants on every update *) + ; id: string option (** value of the `id` label used in Metrics above *) + ; cluster_size: int + (** the number of bytes in a cluster, for computing Metrics in bytes *) +} + +let get_last_block t = + let max_ref = + try fst @@ Cluster.Map.max_binding t.refs with Not_found -> Cluster.zero + in + let max_root = + try Cluster.IntervalSet.Interval.y @@ Cluster.IntervalSet.max_elt t.roots + with Not_found -> Cluster.zero + in + let max_copies = + try Cluster.IntervalSet.Interval.y @@ Cluster.IntervalSet.max_elt t.copies + with Not_found -> Cluster.zero + in + let max_move = + try fst @@ Cluster.Map.max_binding t.moves with Not_found -> Cluster.zero + in + max (Cluster.pred t.first_movable_cluster) + @@ max max_ref + @@ max max_root + @@ max max_move max_copies + +let total_used t = Int64.of_int @@ Cluster.Map.cardinal t.refs + +let total_free t = Cluster.to_int64 @@ Cluster.IntervalSet.cardinal t.junk + +let total_erased t = Cluster.to_int64 @@ Cluster.IntervalSet.cardinal t.erased + +let total_available t = + Cluster.to_int64 @@ Cluster.IntervalSet.cardinal t.available + +let total_moves t = + Cluster.Map.fold + (fun _ m (copying, copied, flushed, referenced) -> + match m.state with + | Copying -> + (copying + 1, copied, flushed, referenced) + | Copied -> + (copying, copied + 1, flushed, referenced) + | Flushed -> + (copying, copied, flushed + 1, referenced) + | Referenced -> + (copying, copied, flushed, referenced + 1) + ) + t.moves (0, 0, 0, 0) + +let total_copies t = Cluster.to_int64 @@ Cluster.IntervalSet.cardinal t.copies + +let total_roots t = Cluster.to_int64 @@ Cluster.IntervalSet.cardinal t.roots + +let to_summary_string t = + let copying, copied, flushed, referenced = total_moves t in + Printf.sprintf + "%Ld used; %Ld junk; %Ld erased; %Ld available; %Ld copies; %Ld roots; %d \ + Copying; %d Copied; %d Flushed; %d Referenced; max_cluster = %s" + (total_used t) (total_free t) (total_erased t) (total_available t) + (total_copies t) (total_roots t) copying copied flushed referenced + (Cluster.to_string @@ get_last_block t) + +module Debug = struct + let check ?(leaks = true) ?(sharing = true) t = + let open Cluster.IntervalSet in + let last = get_last_block t in + if last >= t.first_movable_cluster then ( + let whole_file = add (Interval.make t.first_movable_cluster last) empty in + let refs = + Cluster.Map.fold + (fun cluster _ set -> add (Interval.make cluster cluster) set) + t.refs empty + in + let moves = + Cluster.Map.fold + (fun _ m set -> + let dst = m.move.Move.dst in + add (Interval.make dst dst) set + ) + t.moves empty + in + let junk = ("junk", t.junk) in + let erased = ("erased", t.erased) in + let available = ("available", t.available) in + let refs = ("refs", refs) in + let moves = ("moves", moves) in + let copies = ("copies", t.copies) in + let roots = ("roots", t.roots) in + let cached = ("cached", Cache.Debug.all_cached_clusters t.cache) in + let all = [junk; erased; available; refs; moves; copies; roots] in + let leaked = List.fold_left diff whole_file (List.map snd all) in + if leaks && cardinal leaked <> Cluster.zero then ( + Log.err (fun f -> f "%s" (to_summary_string t)) ; + Log.err (fun f -> + f "%s clusters leaked: %s" + (Cluster.to_string @@ cardinal leaked) + (Sexplib.Sexp.to_string_hum (sexp_of_t leaked)) + ) ; + assert false + ) ; + let rec cross xs = function + | [] -> + [] + | y :: ys -> + List.map (fun x -> (x, y)) xs @ cross xs ys + in + let check zs = + List.iter + (fun ((x_name, x), (y_name, y)) -> + if x_name <> y_name then + let i = inter x y in + if cardinal i <> Cluster.zero then ( + Log.err (fun f -> f "%s" (to_summary_string t)) ; + Log.err (fun f -> f "%s and %s are not disjoint" x_name y_name) ; + Log.err (fun f -> + f "%s = %s" x_name (Sexplib.Sexp.to_string_hum (sexp_of_t x)) + ) ; + Log.err (fun f -> + f "%s = %s" y_name (Sexplib.Sexp.to_string_hum (sexp_of_t y)) + ) ; + Log.err (fun f -> + f "intersection = %s" + (Sexplib.Sexp.to_string_hum (sexp_of_t i)) + ) ; + assert false + ) + ) + zs + in + (* These must be disjoint *) + if sharing then ( + check + @@ cross [junk; erased; available; refs] [junk; erased; available; refs] ; + check + @@ cross + [junk; copies; erased; available] + [junk; copies; erased; available] ; + check @@ cross [cached] [junk; erased; available] + ) ; + (* moves and copies should be the same *) + let d = + union (diff (snd copies) (snd moves)) (diff (snd moves) (snd copies)) + in + if not (is_empty d) then ( + Log.err (fun f -> f "%s" (to_summary_string t)) ; + Log.err (fun f -> f "moves and refs are not the same") ; + Log.err (fun f -> + f "moves = %s" (Sexplib.Sexp.to_string_hum (sexp_of_t (snd moves))) + ) ; + Log.err (fun f -> + f "refs = %s" (Sexplib.Sexp.to_string_hum (sexp_of_t (snd refs))) + ) ; + Log.err (fun f -> + f "diff = %s" (Sexplib.Sexp.to_string_hum (sexp_of_t d)) + ) + ) + ) + + let assert_no_leaked_blocks t = check t + + let assert_equal a b = + let map_equals name pp a b = + Cluster.Map.fold + (fun k v acc -> + let v' = try Some (Cluster.Map.find k b) with Not_found -> None in + if Some v <> v' then ( + Log.err (fun f -> + f "%s: a has cluster %s -> %s but b has cluster %s -> %s" name + (Cluster.to_string k) (pp v) (Cluster.to_string k) + (match v' with None -> "None" | Some v -> pp v) + ) ; + false + ) else + acc + ) + a true + in + let moves = map_equals "moves" string_of_move a.moves b.moves in + let refs = map_equals "refs" string_of_reference a.refs b.refs in + let first_movable_cluster = + if a.first_movable_cluster <> b.first_movable_cluster then ( + Log.err (fun f -> + f + "a has first_movable_cluster = %s but b has \ + first_movable_cluster = %s" + (Cluster.to_string a.first_movable_cluster) + (Cluster.to_string b.first_movable_cluster) + ) ; + false + ) else + true + in + if not (moves && refs && first_movable_cluster) then + failwith "cluster maps are different" + + let metadata_blocks t = + let open Cluster.IntervalSet in + let header = + add + (Interval.make Cluster.zero (Cluster.pred t.first_movable_cluster)) + empty + in + (* All clusters which reference other clusters must be metadata *) + Cluster.Map.fold + (fun _ (cluster, _) set -> add (Interval.make cluster cluster) set) + t.refs header +end + +module type MutableSet = sig + val get : t -> Cluster.IntervalSet.t + + val remove : t -> Cluster.IntervalSet.t -> unit + + val mem : t -> Cluster.t -> bool +end + +let make ~free ~refs ~cache ~first_movable_cluster ~runtime_asserts ~id + ~cluster_size = + let junk = + Qcow_bitmap.fold + (fun i acc -> + let x, y = Qcow_bitmap.Interval.(x i, y i) in + let x = Cluster.of_int64 x and y = Cluster.of_int64 y in + Cluster.IntervalSet.(add (Interval.make x y) acc) + ) + free Cluster.IntervalSet.empty + in + let copies = Cluster.IntervalSet.empty in + let roots = Cluster.IntervalSet.empty in + let available = Cluster.IntervalSet.empty in + let erased = Cluster.IntervalSet.empty in + let moves = Cluster.Map.empty in + let c = Lwt_condition.create () in + let last = + let last_header = Cluster.pred first_movable_cluster in + let last_ref = + try fst @@ Cluster.Map.max_binding refs with Not_found -> Cluster.zero + in + let last_free = + try Cluster.IntervalSet.Interval.y @@ Cluster.IntervalSet.max_elt junk + with Not_found -> Cluster.zero + in + max last_header (max last_ref last_free) + in + let all = Cluster.IntervalSet.(add (Interval.make Cluster.zero last) empty) in + ( match id with + | Some id -> + Counter.inc (Metrics.junk id) + (float_of_int cluster_size + *. (Cluster.to_float @@ Cluster.IntervalSet.cardinal junk) + ) ; + Gauge.set (Metrics.used id) + (float_of_int cluster_size + *. (float_of_int @@ Cluster.Map.cardinal refs) + ) + | None -> + () + ) ; + { + all + ; junk + ; available + ; erased + ; copies + ; roots + ; moves + ; refs + ; first_movable_cluster + ; cache + ; c + ; runtime_asserts + ; id + ; cluster_size + } + +let zero = + let free = Qcow_bitmap.make_empty ~initial_size:0 ~maximum_size:0 in + let refs = Cluster.Map.empty in + let cache = + Cache.create + ~read_cluster:(fun _ -> Lwt.fail (Failure "Unimplemented")) + ~write_cluster:(fun _ _ -> Lwt.fail (Failure "Unimplemented")) + () + in + make ~free ~refs ~first_movable_cluster:Cluster.zero ~cache + ~runtime_asserts:false ~id:None ~cluster_size:0 + +let resize t new_size_clusters = + let open Cluster.IntervalSet in + let file = + add (Interval.make Cluster.zero (Cluster.pred new_size_clusters)) empty + in + Cache.resize t.cache new_size_clusters ; + t.junk <- inter t.junk file ; + t.erased <- inter t.erased file ; + t.available <- inter t.available file ; + (* New blocks on the end of the file are assumed to be zeroed and therefore available *) + let zeroed = diff file t.all in + t.available <- union t.available zeroed ; + ( match t.id with + | None -> + () + | Some id -> + Counter.inc (Metrics.available id) + (float_of_int t.cluster_size + *. (Cluster.to_float @@ Cluster.IntervalSet.cardinal zeroed) + ) + ) ; + if cardinal zeroed > Cluster.zero then + Log.info (fun f -> + f "resize: adding available clusters %s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t zeroed) + ) ; + t.all <- file ; + match t.id with + | None -> + () + | Some id -> + Gauge.set (Metrics.size id) + (Cluster.to_float new_size_clusters *. float_of_int t.cluster_size) + +module Junk = struct + let get t = t.junk + + let add t more = + Log.debug (fun f -> + f "Junk.add %s" + (Sexplib.Sexp.to_string (Cluster.IntervalSet.sexp_of_t more)) + ) ; + t.junk <- Cluster.IntervalSet.union t.junk more ; + ( match t.id with + | None -> + () + | Some id -> + Counter.inc (Metrics.junk id) + (float_of_int t.cluster_size + *. (Cluster.to_float @@ Cluster.IntervalSet.cardinal more) + ) + ) ; + (* Ensure all cached copies of junk blocks are dropped *) + Cluster.IntervalSet.( + fold + (fun i () -> + let x, y = Interval.(x i, y i) in + let rec loop n = + if n <= y then ( + Cache.remove t.cache n ; + loop (Cluster.succ n) + ) + in + loop x + ) + more () + ) ; + if t.runtime_asserts then Debug.check ~leaks:false t ; + Lwt_condition.signal t.c () + + let remove t less = + let open Cluster.IntervalSet in + let old_junk = t.junk in + t.junk <- Cluster.IntervalSet.diff t.junk less ; + if Cluster.sub (cardinal old_junk) (cardinal less) <> cardinal t.junk then ( + Log.err (fun f -> f "Junk.remove: clusters were not in junk") ; + Log.err (fun f -> + f "Junk = %s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t old_junk) + ) ; + Log.err (fun f -> + f "To remove = %s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t less) + ) ; + failwith "Junk.remove: clusters were not in junk" + ) ; + Lwt_condition.signal t.c () + + let mem t elt = Cluster.IntervalSet.mem elt t.junk +end + +module Available = struct + let get t = t.available + + let add t more = + let open Cluster.IntervalSet in + Log.debug (fun f -> + f "Available.add %s" (Sexplib.Sexp.to_string (sexp_of_t more)) + ) ; + t.available <- union t.available more ; + ( match t.id with + | None -> + () + | Some id -> + Counter.inc (Metrics.available id) + (float_of_int t.cluster_size + *. (Cluster.to_float @@ Cluster.IntervalSet.cardinal more) + ) + ) ; + if t.runtime_asserts then Debug.check ~leaks:false t ; + Lwt_condition.signal t.c () + + let remove t less = + let open Cluster.IntervalSet in + Log.debug (fun f -> + f "Available.remove %s" (Sexplib.Sexp.to_string @@ sexp_of_t less) + ) ; + let old_available = t.available in + t.available <- Cluster.IntervalSet.diff t.available less ; + if + Cluster.sub (cardinal old_available) (cardinal less) + <> cardinal t.available + then ( + Log.err (fun f -> f "Available.remove: clusters were not in junk") ; + Log.err (fun f -> + f "Available = %s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t old_available) + ) ; + Log.err (fun f -> + f "To remove = %s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t less) + ) ; + failwith "Available.remove: clusters were not in available" + ) ; + Lwt_condition.signal t.c () + + let mem t elt = Cluster.IntervalSet.mem elt t.available +end + +module Erased = struct + let get t = t.erased + + let add t more = + Log.debug (fun f -> + f "Erased.add %s" + (Sexplib.Sexp.to_string (Cluster.IntervalSet.sexp_of_t more)) + ) ; + t.erased <- Cluster.IntervalSet.union t.erased more ; + ( match t.id with + | None -> + () + | Some id -> + Counter.inc (Metrics.erased id) + (float_of_int t.cluster_size + *. (Cluster.to_float @@ Cluster.IntervalSet.cardinal more) + ) + ) ; + if t.runtime_asserts then Debug.check ~leaks:false t ; + Lwt_condition.signal t.c () + + let remove t less = + let open Cluster.IntervalSet in + let old_erased = t.erased in + t.erased <- diff t.erased less ; + if Cluster.sub (cardinal old_erased) (cardinal less) <> cardinal t.erased + then ( + Log.err (fun f -> f "Erased.remove: clusters were not in erased") ; + Log.err (fun f -> + f "Erased = %s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t old_erased) + ) ; + Log.err (fun f -> + f "To remove = %s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t less) + ) ; + failwith "Erased.remove: clusters were not in erased" + ) ; + Lwt_condition.signal t.c () + + let mem t elt = Cluster.IntervalSet.mem elt t.erased +end + +module Copies = struct + let get t = t.erased + + let add t more = + Log.debug (fun f -> + f "Copies.add %s" + (Sexplib.Sexp.to_string (Cluster.IntervalSet.sexp_of_t more)) + ) ; + t.copies <- Cluster.IntervalSet.union t.copies more ; + if t.runtime_asserts then Debug.check ~leaks:false t ; + Lwt_condition.signal t.c () + + let remove t less = + let open Cluster.IntervalSet in + let old_copies = t.copies in + t.copies <- diff t.copies less ; + if Cluster.sub (cardinal old_copies) (cardinal less) <> cardinal t.copies + then ( + Log.err (fun f -> f "Copies.remove: clusters were not in copies") ; + Log.err (fun f -> + f "Copies = %s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t old_copies) + ) ; + Log.err (fun f -> + f "To remove = %s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t less) + ) ; + failwith "Copies.remove: clusters were not in copies" + ) ; + Lwt_condition.signal t.c () + + let mem t elt = Cluster.IntervalSet.mem elt t.copies +end + +module Roots = struct + let get t = t.roots + + let add t more = + let open Cluster.IntervalSet in + Log.debug (fun f -> + f "Roots.add %s" (Sexplib.Sexp.to_string (sexp_of_t more)) + ) ; + let intersection = inter more t.roots in + if not @@ is_empty @@ intersection then ( + Log.err (fun f -> + f "Clusters are already registered as roots: %s" + (Sexplib.Sexp.to_string @@ sexp_of_t more) + ) ; + Log.err (fun f -> + f "Intersection: %s" (Sexplib.Sexp.to_string @@ sexp_of_t intersection) + ) ; + assert false + ) ; + t.roots <- union t.roots more ; + ( match t.id with + | None -> + () + | Some id -> + Counter.inc (Metrics.roots id) + (float_of_int t.cluster_size + *. (Cluster.to_float @@ Cluster.IntervalSet.cardinal more) + ) + ) ; + if t.runtime_asserts then Debug.check ~leaks:false t ; + Lwt_condition.signal t.c () + + let remove t less = + let open Cluster.IntervalSet in + let old_roots = t.roots in + t.roots <- diff t.roots less ; + if Cluster.sub (cardinal old_roots) (cardinal less) <> cardinal t.roots then ( + Log.err (fun f -> f "Roots.remove: clusters were not in roots") ; + Log.err (fun f -> + f "Roots = %s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t old_roots) + ) ; + Log.err (fun f -> + f "To remove = %s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t less) + ) ; + failwith "Roots.remove: clusters were not in roots" + ) ; + Lwt_condition.signal t.c () + + let mem t elt = Cluster.IntervalSet.mem elt t.roots +end + +type cluster_state = Junk | Erased | Available | Copies | Roots + +let set_cluster_state t set src dst = + ( match src with + | Junk -> + Junk.remove t set + | Erased -> + Erased.remove t set + | Available -> + Available.remove t set + | Copies -> + Copies.remove t set + | Roots -> + Roots.remove t set + ) ; + match dst with + | Junk -> + Junk.add t set + | Erased -> + Erased.add t set + | Available -> + Available.add t set + | Copies -> + Copies.add t set + | Roots -> + Roots.add t set + +let wait t = Lwt_condition.wait t.c + +let find t cluster = Cluster.Map.find cluster t.refs + +let moves t = t.moves + +let set_move_state t move state = + let m = {move; state} in + let old_state = + if Cluster.Map.mem move.Move.src t.moves then + Some (Cluster.Map.find move.Move.src t.moves).state + else + None + in + let src = move.Move.src in + let dst = move.Move.dst in + if Cluster.Map.mem dst t.moves then ( + let {move= dst_move; state= dst_state} = Cluster.Map.find dst t.moves in + Log.err (fun f -> + f + "Illegal cluster move: %s -> %s but destination is already moving \ + from %s -> %s and in state %s" + (Cluster.to_string move.Move.src) + (Cluster.to_string move.Move.dst) + (Cluster.to_string dst_move.Move.src) + (Cluster.to_string dst_move.Move.dst) + (string_of_move_state dst_state) + ) ; + assert false + ) ; + match (old_state, state) with + | None, Copying -> + let open Cluster.IntervalSet in + let dst' = add (Interval.make dst dst) empty in + (* We always move into junk blocks *) + if not @@ mem dst t.junk then ( + Log.err (fun f -> + f + "Copying cluster from %s -> %s: destination is not in the Junk \ + set" + (Cluster.to_string move.Move.src) + (Cluster.to_string move.Move.dst) + ) ; + Log.err (fun f -> + f "Junk = %s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t t.junk) + ) ; + assert false + ) ; + if mem dst t.copies then ( + Log.err (fun f -> + f + "Copying cluster from %s -> %s: destination is already in the \ + Copies set" + (Cluster.to_string move.Move.src) + (Cluster.to_string move.Move.dst) + ) ; + Log.err (fun f -> + f "Copies = %s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t t.copies) + ) ; + assert false + ) ; + if Cluster.Map.mem dst t.moves then ( + Log.err (fun f -> + f + "Copying cluster from %s -> %s: destination is already in the \ + moves list" + (Cluster.to_string move.Move.src) + (Cluster.to_string move.Move.dst) + ) ; + assert false + ) ; + if Cluster.Map.mem src t.moves then ( + Log.err (fun f -> + f + "Copying cluster from %s -> %s: source is already in the moves \ + list" + (Cluster.to_string move.Move.src) + (Cluster.to_string move.Move.dst) + ) ; + assert false + ) ; + Log.debug (fun f -> + f "Cluster %s None -> Copying" (Cluster.to_string move.Move.src) + ) ; + Junk.remove t dst' ; + t.moves <- Cluster.Map.add move.Move.src m t.moves ; + Copies.add t dst' + | Some Copying, Copied -> ( + Log.debug (fun f -> + f "Cluster %s Copying -> Copied" (Cluster.to_string move.Move.src) + ) ; + t.moves <- Cluster.Map.add move.Move.src m t.moves ; + match t.id with + | None -> + () + | Some id -> + Counter.inc (Metrics.copied id) (float_of_int t.cluster_size) + ) + | Some Copied, Flushed -> + Log.debug (fun f -> + f "Cluster %s Copied -> Flushed" (Cluster.to_string move.Move.src) + ) ; + t.moves <- Cluster.Map.add move.Move.src m t.moves ; + ( match t.id with + | None -> + () + | Some id -> + Counter.inc (Metrics.flushed id) (float_of_int t.cluster_size) + ) ; + (* References now need to be rewritten *) + Lwt_condition.signal t.c () + | Some Flushed, Referenced -> ( + Log.debug (fun f -> + f "Cluster %s Flushed -> Referenced" (Cluster.to_string move.Move.src) + ) ; + t.moves <- Cluster.Map.add move.Move.src m t.moves ; + match t.id with + | None -> + () + | Some id -> + Counter.inc (Metrics.referenced id) (float_of_int t.cluster_size) + ) + | Some old, _ -> + Log.err (fun f -> + f "Illegal cluster move state transition: %s %s -> %s" + (Cluster.to_string move.Move.src) + (string_of_move_state old) + (string_of_move_state state) + ) ; + assert false + | None, _ -> + Log.warn (fun f -> + f "Not updating move state of cluster %s: operation cancelled" + (Cluster.to_string move.Move.src) + ) + +let cancel_move t cluster = + match Cluster.Map.find cluster t.moves with + | {state= Referenced; _} -> + (* The write will have followed the reference to the destination block. + There are 2 interesting possibilities if we crash without flushing: + - neither the write nor the reference are committed: this behaves as if + the write wasn't committed which is valid + - the write is committed but the reference isn't: this also behaves + as if the write wasn't committed which is valid + The only reason we still track this move is because when the next flush + happens it is safe to add the src cluster to the set of junk blocks. *) + Log.debug (fun f -> + f "Not cancelling in-progress move of cluster %s: already Referenced" + (Cluster.to_string cluster) + ) + | {move= {Move.dst; _}; _} -> + Log.debug (fun f -> + f "Cancelling in-progress move of cluster %s to %s" + (Cluster.to_string cluster) + (Cluster.to_string dst) + ) ; + t.moves <- Cluster.Map.remove cluster t.moves ; + let dst' = Cluster.IntervalSet.(add (Interval.make dst dst) empty) in + (* The destination block can now be recycled *) + Copies.remove t dst' ; Junk.add t dst' + | exception Not_found -> + () + +let complete_move t move = + let old_state = + if Cluster.Map.mem move.Move.src t.moves then + Some (Cluster.Map.find move.Move.src t.moves).state + else + None + in + match old_state with + | None -> + Log.warn (fun f -> + f "Not completing move state of cluster %s: operation cancelled" + (Cluster.to_string move.Move.src) + ) + | Some Referenced -> + t.moves <- Cluster.Map.remove move.Move.src t.moves ; + let dst = + Cluster.IntervalSet.( + add (Interval.make move.Move.dst move.Move.dst) empty + ) + in + Copies.remove t dst + (* The source block will have already been added to Junk by the Metadata.Physical.set *) + | Some old -> + Log.err (fun f -> + f "Illegal cluster move state transition: %s %s -> Completed" + (Cluster.to_string move.Move.src) + (string_of_move_state old) + ) ; + failwith "Attempt to complete an incomplete cluster move" + +let is_moving t src = Cluster.Map.mem src t.moves + +let add t rf cluster = + let c, w = rf in + if cluster = Cluster.zero then + () + else ( + if Cluster.Map.mem cluster t.refs then ( + let c', w' = Cluster.Map.find cluster t.refs in + Log.err (fun f -> + f "Found two references to cluster %s: %s.%d and %s.%d" + (Cluster.to_string cluster) + (Cluster.to_string c) w (Cluster.to_string c') w' + ) ; + raise + (Error.Duplicate_reference + ( (Cluster.to_int64 c, w) + , (Cluster.to_int64 c', w') + , Cluster.to_int64 cluster + ) + ) + ) ; + let junk = Junk.mem t cluster in + let erased = Erased.mem t cluster in + let available = Available.mem t cluster in + let roots = Roots.mem t cluster in + let copies = Copies.mem t cluster in + if (not (roots || copies)) || junk || erased || available then ( + Log.err (fun f -> + f + "Adding a reference to cluster %s in %s.%d, cluster in state: %s \ + %s %s %s %s" + (Cluster.to_string cluster) + (Cluster.to_string c) w + (if junk then "Junk " else "") + (if erased then "Erased " else "") + (if available then "Available " else "") + (if roots then "Roots " else "") + (if copies then "Copies " else "") + ) ; + failwith + (Printf.sprintf "Adding a reference to unsuitable cluster %s in %s.%d" + (Cluster.to_string cluster) + (Cluster.to_string c) w + ) + ) ; + t.refs <- Cluster.Map.add cluster rf t.refs ; + ( match t.id with + | None -> + () + | Some id -> + Gauge.inc (Metrics.used id) (float_of_int t.cluster_size) + ) ; + () + ) + +let remove t cluster = + t.refs <- Cluster.Map.remove cluster t.refs ; + ( match t.id with + | None -> + () + | Some id -> + Gauge.dec (Metrics.used id) (float_of_int t.cluster_size) + ) ; + Junk.add t Cluster.IntervalSet.(add (Interval.make cluster cluster) empty) ; + cancel_move t cluster ; + Lwt_condition.signal t.c () + +let start_moves t = + (* The last allocated block. Note if there are no data blocks this will + point to the last header block even though it is immovable. *) + let max_cluster = get_last_block t in + let refs = ref t.refs in + let moves = + fst + @@ Cluster.IntervalSet.fold_individual + (fun cluster (moves, max_cluster) -> + (* A free block after the last allocated block will not be filled. + It will be erased from existence when the file is truncated at the + end. *) + if cluster >= max_cluster then + (moves, max_cluster) + else (* find the last physical block *) + let last_block, rf = Cluster.Map.max_binding !refs in + + if cluster >= last_block then + (moves, last_block) + else + let src = last_block and dst = cluster in + if Cluster.Map.mem src t.moves then + ( moves + , last_block + (* move already in progress, don't move it again *) + ) + else (* copy last_block into cluster and update rf *) + let move = {Move.src; dst} in + refs := + Cluster.Map.remove last_block + @@ Cluster.Map.add cluster rf !refs ; + (move :: moves, last_block) + ) + t.junk ([], max_cluster) + in + List.iter (fun move -> set_move_state t move Copying) moves ; + moves + +let is_immovable t cluster = cluster < t.first_movable_cluster + +let update_references t substitutions = + let refs = + Cluster.Map.fold + (fun to_c (from_c, from_w) acc -> + (* Has the cluster [from_c] been moved? *) + let from_c' = + try Cluster.Map.find from_c substitutions with Not_found -> from_c + in + if from_c <> from_c' then + Log.debug (fun f -> + f "Updating reference %s:%d -> %s to %s:%d -> %s" + (Cluster.to_string from_c) from_w (Cluster.to_string to_c) + (Cluster.to_string from_c') + from_w (Cluster.to_string to_c) + ) ; + Cluster.Map.add to_c (from_c', from_w) acc + ) + t.refs Cluster.Map.empty + in + t.refs <- refs diff --git a/ocaml/qcow-tool/lib/qcow_cluster_map.mli b/ocaml/qcow-tool/lib/qcow_cluster_map.mli new file mode 100644 index 00000000000..24af46361fb --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_cluster_map.mli @@ -0,0 +1,186 @@ +(* + * Copyright (C) 2016 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Qcow_types + +(** A cluster map which describes cluster usage in the file. The cluster map + tracks which clusters are free, and which are used, and where the references + are. *) +type t + +(** Describes the state of a block move *) +type move_state = + | Copying + (** a background copy is in progress. If this cluster is modified then + the copy should be aborted. *) + | Copied + (** contents of this cluster have been copied once to another cluster. + If this cluster is modified then the copy should be aborted. *) + | Flushed + (** contents of this cluster have been copied and flushed to disk: it + is now safe to rewrite the pointer. If this cluster is modified then + the copy should be aborted. *) + | Referenced + (** the reference has been rewritten; it is now safe to write to this + cluster again. On the next flush, the copy is complete and the original + block can be recycled. *) + +type reference = Cluster.t * int (* cluster * index within cluster *) + +module Move : sig + (** An instruction to move the contents from cluster [src] to cluster [dst] *) + type t = {src: Cluster.t; dst: Cluster.t} + + val to_string : t -> string +end + +(** describes the state of an in-progress block move *) +type move = {move: Move.t; state: move_state} + +val string_of_move : move -> string + +type cluster_state = + | Junk + | Erased + | Available + | Copies + | Roots (** The state of a cluster *) + +val set_cluster_state : + t -> Cluster.IntervalSet.t -> cluster_state -> cluster_state -> unit +(** Update the state of a cluster *) + +module type MutableSet = sig + val get : t -> Cluster.IntervalSet.t + (** [get t] query the current contents of the set *) + + val remove : t -> Cluster.IntervalSet.t -> unit + (** [remove t less] removes [less] from the set *) + + val mem : t -> Cluster.t -> bool + (** [mem t cluster] is true if [cluster] is in [t] *) +end + +val zero : t +(** A cluster map for a zero-length disk *) + +val make : + free:Qcow_bitmap.t + -> refs:reference Cluster.Map.t + -> cache:Qcow_cache.t + -> first_movable_cluster:Cluster.t + -> runtime_asserts:bool + -> id:string option + -> cluster_size:int + -> t +(** Given a set of free clusters, and the first cluster which can be moved + (i.e. that isn't fixed header), construct an empty cluster map. *) + +val total_used : t -> int64 +(** Return the number of tracked used clusters *) + +val total_free : t -> int64 +(** Return the number of tracked free clusters *) + +val resize : t -> Cluster.t -> unit +(** [resize t new_size_clusters] is called when the file is to be resized. *) + +val add : t -> reference -> Cluster.t -> unit +(** [add t ref cluster] marks [cluster] as in-use and notes the reference from + [reference]. *) + +val remove : t -> Cluster.t -> unit +(** [remove t cluster] marks [cluster] as free and invalidates any reference + to it (e.g. in response to a discard) *) + +(** Clusters which contain arbitrary data *) +module Junk : MutableSet + +(** Clusters which have been erased but haven't been flushed yet so can't be + safely reallocated. *) +module Erased : MutableSet + +(** Clusters which are available for reallocation *) +module Available : MutableSet + +(** Clusters which contain copies, as part of a compact *) +module Copies : MutableSet + +(** Clusters which have been allocated but not yet placed somewhere reachable + from the GC *) +module Roots : MutableSet + +val wait : t -> unit Lwt.t +(** [wait t] wait for some amount of recycling work to become available, e.g. + - junk could be created + - available could be used + - a move might require a reference update *) + +val start_moves : t -> Move.t list +(** [start_moves t] calculates the block moves required to compact [t] and + marks the clusters as moving *) + +val moves : t -> move Cluster.Map.t +(** [moves t] returns the state of the current active moves *) + +val set_move_state : t -> Move.t -> move_state -> unit +(** Update the state of the given move operation *) + +val is_moving : t -> Cluster.t -> bool +(** [is_moving t cluster] returns true if [cluster] is still moving *) + +val cancel_move : t -> Cluster.t -> unit +(** [cancel_move cluster] cancels any in-progress move of cluster [cluster]. + This should be called with the cluster write lock held whenever there has + been a change in the contents of [cluster] *) + +val complete_move : t -> Move.t -> unit +(** [complete_move t move] marks the move as complete. *) + +val find : t -> Cluster.t -> reference +(** [find t cluster] returns the reference to [cluster], or raises [Not_found] *) + +val get_last_block : t -> Cluster.t +(** [get_last_block t] is the last allocated block in [t]. Note if there are no + data blocks this will point to the last header block even though it is + immovable. *) + +val is_immovable : t -> Cluster.t -> bool +(** [is_immovable t cluster] is true if [cluster] is fixed and cannot be moved + i.e. it is before the first_movable_cluster i.e. it is part of the fixed + (L1) header structure. *) + +val update_references : t -> Cluster.t Cluster.Map.t -> unit +(** [update_references t subst] updates the reference table following the given set + of substitutions. Any reference to a source block must be updated to the + destination block otherwise it will be left pointing to junk. Normally this + is guaranteed by the Metadata.Physical.set function, but when compacting we + split the operation into phases and copy the block first at the byte level, + leaving the map out-of-sync *) + +val to_summary_string : t -> string +(** [to_summary_string t] returns a terse printable summary of [t] *) + +module Debug : sig + val assert_no_leaked_blocks : t -> unit + (** Check no blocks have gone missing *) + + val assert_equal : t -> t -> unit + (** Check that 2 maps have equivalent contents *) + + val metadata_blocks : t -> Cluster.IntervalSet.t + (** Return the set of blocks containing metadata *) +end diff --git a/ocaml/qcow-tool/lib/qcow_config.ml b/ocaml/qcow-tool/lib/qcow_config.ml new file mode 100644 index 00000000000..a31af4fb5c6 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_config.ml @@ -0,0 +1,109 @@ +(* + * Copyright (C) 2017 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +type t = { + id: string + ; discard: bool + ; keep_erased: int64 option + ; compact_after_unmaps: int64 option + ; check_on_connect: bool + ; runtime_asserts: bool + ; read_only: bool +} + +let fresh_id = + let id = ref 0 in + fun () -> + let result = "unknown_" ^ string_of_int !id in + incr id ; result + +let create ?(id = fresh_id ()) ?(discard = false) ?keep_erased + ?compact_after_unmaps ?(check_on_connect = true) ?(runtime_asserts = false) + ?(read_only = false) () = + { + id + ; discard + ; keep_erased + ; compact_after_unmaps + ; check_on_connect + ; runtime_asserts + ; read_only + } + +let to_string t = + Printf.sprintf + "id=%s;discard=%b;keep_erased=%scompact_after_unmaps=%s;check_on_connect=%b;runtime_asserts=%b;read_only=%b" + t.id t.discard + (match t.keep_erased with None -> "0" | Some x -> Int64.to_string x) + ( match t.compact_after_unmaps with + | None -> + "0" + | Some x -> + Int64.to_string x + ) + t.check_on_connect t.runtime_asserts t.read_only + +let default () = + { + id= fresh_id () + ; discard= false + ; keep_erased= None + ; compact_after_unmaps= None + ; check_on_connect= true + ; runtime_asserts= false + ; read_only= false + } + +let of_string txt = + let open Astring in + try + let strings = String.cuts ~sep:";" txt in + Ok + (List.fold_left + (fun t line -> + match String.cut ~sep:"=" line with + | None -> + t + | Some (k, v) -> ( + match String.Ascii.lowercase k with + | "id" -> + {t with id= v} + | "discard" -> + {t with discard= bool_of_string v} + | "keep_erased" -> + let keep_erased = + if v = "0" then None else Some (Int64.of_string v) + in + {t with keep_erased} + | "compact_after_unmaps" -> + let compact_after_unmaps = + if v = "0" then None else Some (Int64.of_string v) + in + {t with compact_after_unmaps} + | "check_on_connect" -> + {t with check_on_connect= bool_of_string v} + | "runtime_asserts" -> + {t with runtime_asserts= bool_of_string v} + | "read_only" -> + {t with read_only= bool_of_string v} + | x -> + failwith ("Unknown qcow configuration key: " ^ x) + ) + ) + (default ()) strings + ) + with e -> Error (`Msg (Printexc.to_string e)) diff --git a/ocaml/qcow-tool/lib/qcow_config.mli b/ocaml/qcow-tool/lib/qcow_config.mli new file mode 100644 index 00000000000..f192e2ff802 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_config.mli @@ -0,0 +1,48 @@ +(* + * Copyright (C) 2017 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +type t = { + id: string (** unique name for the prometheus metrics *) + ; discard: bool (** discard (aka TRIM) is enabled *) + ; keep_erased: int64 option + (** maintain a free pool of this many erased sectors *) + ; compact_after_unmaps: int64 option + (** once more than this many sectors are free, perform a compact *) + ; check_on_connect: bool (** perform an integrity check on connect *) + ; runtime_asserts: bool (** constantly verify GC invariants are held *) + ; read_only: bool (** guarantee to not modify the file *) +} + +val create : + ?id:string + -> ?discard:bool + -> ?keep_erased:int64 + -> ?compact_after_unmaps:int64 + -> ?check_on_connect:bool + -> ?runtime_asserts:bool + -> ?read_only:bool + -> unit + -> t + +val default : unit -> t +(** default configuration values *) + +val to_string : t -> string +(** convert the configuration to a string *) + +val of_string : string -> (t, [> `Msg of string]) result +(** parse the output of [to_string t] *) diff --git a/ocaml/qcow-tool/lib/qcow_cstructs.ml b/ocaml/qcow-tool/lib/qcow_cstructs.ml new file mode 100644 index 00000000000..572ffe43a8d --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_cstructs.ml @@ -0,0 +1,123 @@ +(* + * Copyright (C) 2017 Docker Inc + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +type t = Cstruct.t list + +let pp_t ppf t = + List.iter + (fun t -> + Format.fprintf ppf "[%d,%d](%d)" t.Cstruct.off t.Cstruct.len + (Bigarray.Array1.dim t.Cstruct.buffer) + ) + t + +let len = List.fold_left (fun acc c -> Cstruct.length c + acc) 0 + +let err fmt = + let b = Buffer.create 20 in + (* for thread safety. *) + let ppf = Format.formatter_of_buffer b in + let k ppf = + Format.pp_print_flush ppf () ; + invalid_arg (Buffer.contents b) + in + Format.kfprintf k ppf fmt + +let rec shift t x = + if x = 0 then + t + else + match t with + | [] -> + err "Cstructs.shift %a %d" pp_t t x + | y :: ys -> + let y' = Cstruct.length y in + if y' > x then + Cstruct.shift y x :: ys + else + shift ys (x - y') + +let to_string t = + let b = Buffer.create 20 in + List.iter (fun x -> Buffer.add_string b @@ Cstruct.to_string x) t ; + Buffer.contents b + +let sub t off len = + let t' = shift t off in + (* trim the length *) + let rec trim acc ts remaining = + match (remaining, ts) with + | 0, _ -> + List.rev acc + | _, [] -> + err "invalid bounds in Cstructs.sub %a off=%d len=%d" pp_t t off len + | n, t :: ts -> + let to_take = min (Cstruct.length t) n in + (* either t is consumed and we only need ts, or t has data remaining in which + case we're finished *) + trim (Cstruct.sub t 0 to_take :: acc) ts (remaining - to_take) + in + trim [] t' len + +let to_cstruct = function + | [common_case] -> + common_case + | uncommon_case -> + Cstruct.concat uncommon_case + +(* Return a Cstruct.t representing (off, len) by either returning a reference + or making a copy if the value is split across two fragments. Ideally this + would return a string rather than a Cstruct.t for efficiency *) +let get f t off len = + let t' = shift t off in + match t' with + | x :: xs -> + (* Return a reference to the existing buffer *) + if Cstruct.length x >= len then + Cstruct.sub x 0 len + else (* Copy into a fresh buffer *) + let rec copy remaining frags = + if Cstruct.length remaining > 0 then + match frags with + | [] -> + err "invalid bounds in Cstructs.%s %a off=%d len=%d" f pp_t t + off len + | x :: xs -> + let to_copy = + min (Cstruct.length x) (Cstruct.length remaining) + in + Cstruct.blit x 0 remaining 0 to_copy ; + (* either we've copied all of x, or we've filled the remaining buffer *) + copy (Cstruct.shift remaining to_copy) xs + in + let result = Cstruct.create len in + copy result (x :: xs) ; + result + | [] -> + err "invalid bounds in Cstructs.%s %a off=%d len=%d" f pp_t t off len + +let get_uint8 t off = Cstruct.get_uint8 (get "get_uint8" t off 1) 0 + +let memset ts x = List.iter (fun t -> Cstruct.memset t x) ts + +module BE = struct + open Cstruct.BE + + let get_uint16 t off = get_uint16 (get "get_uint16" t off 2) 0 + + let get_uint32 t off = get_uint32 (get "get_uint32" t off 4) 0 +end diff --git a/ocaml/qcow-tool/lib/qcow_cstructs.mli b/ocaml/qcow-tool/lib/qcow_cstructs.mli new file mode 100644 index 00000000000..76fb6301364 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_cstructs.mli @@ -0,0 +1,44 @@ +(* + * Copyright (C) 2017 Docker Inc + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +(** A subset of the Cstruct signature with type t = Cstruct.t list + + This should be replaced with another parser, perhaps angstrom? *) + +(** Data stored as a list of fragments *) +type t = Cstruct.t list + +val to_string : t -> string + +val shift : t -> int -> t + +val len : t -> int + +val sub : t -> int -> int -> t + +val get_uint8 : t -> int -> int + +val to_cstruct : t -> Cstruct.t +(** Returns a contiguous Cstruct.t, which may or may not involve a copy. *) + +val memset : t -> int -> unit + +module BE : sig + val get_uint16 : t -> int -> int + + val get_uint32 : t -> int -> int32 +end diff --git a/ocaml/qcow-tool/lib/qcow_debug.ml b/ocaml/qcow-tool/lib/qcow_debug.ml new file mode 100644 index 00000000000..33c951655db --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_debug.ml @@ -0,0 +1,89 @@ +(* + * Copyright (C) 2017 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +let src = + let src = Logs.Src.create "qcow" ~doc:"qcow2-formatted BLOCK device" in + Logs.Src.set_level src (Some Logs.Info) ; + src + +module Log = (val Logs.src_log src : Logs.LOG) + +module Error = Qcow_error +module Physical = Qcow_physical +module Metadata = Qcow_metadata +open Qcow_types + +let check_on_disk_reference metadata ~cluster_bits (c, w) target = + Metadata.read metadata c (fun contents -> + let p = Metadata.Physical.of_contents contents in + let target' = Metadata.Physical.get p w in + let target_cluster = Physical.cluster ~cluster_bits target in + let target'_cluster = Physical.cluster ~cluster_bits target' in + let descr = + Printf.sprintf "Physical.get %s:%d = %s (%s %s)" (Cluster.to_string c) w + (Cluster.to_string target'_cluster) + (if target = target' then "=" else "<>") + (Cluster.to_string target_cluster) + in + if target <> target' then + Log.err (fun f -> f "%s" descr) + else + Log.info (fun f -> f "%s" descr) ; + Lwt.return (Ok ()) + ) + +let rec check_references metadata cluster_map ~cluster_bits (cluster : Cluster.t) + = + let open Error.Lwt_write_error.Infix in + match Qcow_cluster_map.find cluster_map cluster with + | exception Not_found -> + if Qcow_cluster_map.is_immovable cluster_map cluster then + Log.info (fun f -> + f "Cluster %s is an L1 cluster" (Cluster.to_string cluster) + ) + else + Log.err (fun f -> + f "No reference to cluster %s" (Cluster.to_string cluster) + ) ; + Lwt.return (Ok ()) + | c', w' -> + let target = + Physical.make ~is_mutable:true ~is_compressed:false + (Cluster.to_int cluster lsl cluster_bits) + in + check_on_disk_reference metadata ~cluster_bits (c', w') target + >>= fun () -> check_references metadata cluster_map ~cluster_bits c' + +let on_duplicate_reference metadata cluster_map ~cluster_bits (c, w) (c', w') + cluster = + let open Error.Lwt_write_error.Infix in + let cluster = Cluster.of_int64 cluster in + let rec follow (c, w) (cluster : Cluster.t) = + let target = + Physical.make ~is_mutable:true ~is_compressed:true + (Cluster.to_int cluster lsl cluster_bits) + in + check_on_disk_reference metadata ~cluster_bits (c, w) target >>= fun () -> + match Qcow_cluster_map.find cluster_map c with + | exception Not_found -> + Log.err (fun f -> f "No reference to cluster %s" (Cluster.to_string c)) ; + Lwt.return (Ok ()) + | c', w' -> + follow (c', w') c + in + follow (Cluster.of_int64 c', w') cluster >>= fun () -> + follow (Cluster.of_int64 c, w) cluster diff --git a/ocaml/qcow-tool/lib/qcow_debug.mli b/ocaml/qcow-tool/lib/qcow_debug.mli new file mode 100644 index 00000000000..3f011c73a4e --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_debug.mli @@ -0,0 +1,35 @@ +(* + * Copyright (C) 2017 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Qcow_types + +val on_duplicate_reference : + Qcow_metadata.t + -> Qcow_cluster_map.t + -> cluster_bits:int + -> int64 * int + -> int64 * int + -> int64 + -> (unit, [> `Disconnected | `Is_read_only | `Msg of string]) result Lwt.t + +val check_references : + Qcow_metadata.t + -> Qcow_cluster_map.t + -> cluster_bits:int + -> Cluster.t + -> (unit, [> `Disconnected | `Is_read_only | `Msg of string]) result Lwt.t +(** [check_references metadata map cluster_bits target] follows the back references + from physical offset [target], verifying the references on disk as it goes *) diff --git a/ocaml/qcow-tool/lib/qcow_diet.ml b/ocaml/qcow-tool/lib/qcow_diet.ml new file mode 100644 index 00000000000..60bc42083bf --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_diet.ml @@ -0,0 +1,629 @@ +(* + * Copyright (C) 2016 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +(* +#require "ppx_sexp_conv";; +#require "lwt";; +*) +open Sexplib.Std + +module type ELT = sig + type t [@@deriving sexp] + + val compare : t -> t -> int + + val zero : t + + val pred : t -> t + + val succ : t -> t + + val sub : t -> t -> t + + val add : t -> t -> t +end + +exception Interval_pairs_should_be_ordered of string + +exception Intervals_should_not_overlap of string + +exception Intervals_should_not_be_adjacent of string + +exception Height_not_equals_depth of string + +exception Unbalanced of string + +exception Cardinal of string + +let _ = + Printexc.register_printer (function + | Interval_pairs_should_be_ordered txt -> + Some ("Pairs within each interval should be ordered: " ^ txt) + | Intervals_should_not_overlap txt -> + Some ("Intervals should be ordered without overlap: " ^ txt) + | Intervals_should_not_be_adjacent txt -> + Some ("Intervals should not be adjacent: " ^ txt) + | Height_not_equals_depth txt -> + Some ("The height is not being maintained correctly: " ^ txt) + | Unbalanced txt -> + Some ("The tree has become imbalanced: " ^ txt) + | Cardinal txt -> + Some ("The cardinal value stored in the node is wrong: " ^ txt) + | _ -> + None + ) + +module Make (Elt : ELT) = struct + type elt = Elt.t [@@deriving sexp] + + module Elt = struct + include Elt + + let ( - ) = sub + + let ( + ) = add + end + + type interval = elt * elt + + module Interval = struct + let make x y = + if x > y then invalid_arg "Interval.make" ; + (x, y) + + let x = fst + + let y = snd + end + + let ( > ) x y = Elt.compare x y > 0 + + let ( >= ) x y = Elt.compare x y >= 0 + + let ( < ) x y = Elt.compare x y < 0 + + let ( <= ) x y = Elt.compare x y <= 0 + + let eq x y = Elt.compare x y = 0 + + let succ, pred = (Elt.succ, Elt.pred) + + type t = Empty | Node : node -> t + + and node = {x: elt; y: elt; l: t; r: t; h: int; cardinal: elt} + [@@deriving sexp] + + let height = function Empty -> 0 | Node n -> n.h + + let cardinal = function Empty -> Elt.zero | Node n -> n.cardinal + + let create x y l r = + let h = max (height l) (height r) + 1 in + let cardinal = Elt.(succ (y - x) + cardinal l + cardinal r) in + Node {x; y; l; r; h; cardinal} + + let rec node x y l r = + let hl = height l and hr = height r in + let open Stdlib in + if hl > hr + 2 then + match l with + | Empty -> + assert false + | Node {x= lx; y= ly; l= ll; r= lr; _} -> ( + if height ll >= height lr then + node lx ly ll (node x y lr r) + else + match lr with + | Empty -> + assert false + | Node {x= lrx; y= lry; l= lrl; r= lrr; _} -> + node lrx lry (node lx ly ll lrl) (node x y lrr r) + ) + else if hr > hl + 2 then + match r with + | Empty -> + assert false + | Node {x= rx; y= ry; l= rl; r= rr; _} -> ( + if height rr >= height rl then + node rx ry (node x y l rl) rr + else + match rl with + | Empty -> + assert false + | Node {x= rlx; y= rly; l= rll; r= rlr; _} -> + node rlx rly (node x y l rll) (node rx ry rlr rr) + ) + else + create x y l r + + let depth tree = + let rec depth tree k = + match tree with + | Empty -> + k 0 + | Node n -> + depth n.l (fun dl -> depth n.r (fun dr -> k (1 + max dl dr))) + in + depth tree (fun d -> d) + + let to_string_internal t = Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_t t + + module Invariant = struct + (* The pairs (x, y) in each interval are ordered such that x <= y *) + let rec ordered t = + match t with + | Empty -> + () + | Node {x; y; l; r; _} -> + if x > y then + raise (Interval_pairs_should_be_ordered (to_string_internal t)) ; + ordered l ; + ordered r + + (* The intervals don't overlap *) + let rec no_overlap t = + match t with + | Empty -> + () + | Node {x; y; l; r; _} -> + ( match l with + | Empty -> + () + | Node left -> + if left.y >= x then + raise (Intervals_should_not_overlap (to_string_internal t)) + ) ; + ( match r with + | Empty -> + () + | Node right -> + if right.x <= y then + raise (Intervals_should_not_overlap (to_string_internal t)) + ) ; + no_overlap l ; no_overlap r + + let rec no_adjacent t = + let biggest = function Empty -> None | Node {y; _} -> Some y in + let smallest = function Empty -> None | Node {x; _} -> Some x in + match t with + | Empty -> + () + | Node {x; y; l; r; _} -> + ( match biggest l with + | Some ly when Elt.succ ly >= x -> + raise (Intervals_should_not_be_adjacent (to_string_internal t)) + | _ -> + () + ) ; + ( match smallest r with + | Some rx when Elt.pred rx <= y -> + raise (Intervals_should_not_be_adjacent (to_string_internal t)) + | _ -> + () + ) ; + no_adjacent l ; no_adjacent r + + (* The height is being stored correctly *) + let rec height_equals_depth t = + if height t <> depth t then + raise (Height_not_equals_depth (to_string_internal t)) ; + match t with + | Empty -> + () + | Node {l; r; _} -> + height_equals_depth l ; height_equals_depth r + + let rec balanced = function + | Empty -> + () + | Node {l; r; _} as t -> + let diff = height l - height r in + let open Stdlib in + if diff > 2 || diff < -2 then ( + Printf.fprintf stdout "height l = %d = %s\n" (height l) + (to_string_internal l) ; + Printf.fprintf stdout "height r = %d = %s\n" (height r) + (to_string_internal r) ; + raise (Unbalanced (to_string_internal t)) + ) ; + balanced l ; + balanced r + + let rec check_cardinal = function + | Empty -> + () + | Node {x; y; l; r; cardinal= c; _} as t -> + check_cardinal l ; + check_cardinal r ; + if Elt.(c - cardinal l - cardinal r - y + x) <> Elt.(succ zero) then + raise (Cardinal (to_string_internal t)) + + let check t = + ordered t ; + no_overlap t ; + height_equals_depth t ; + balanced t ; + check_cardinal t ; + no_adjacent t + end + + let empty = Empty + + let is_empty = function Empty -> true | _ -> false + + let rec mem elt = function + | Empty -> + false + | Node n -> + (* consider this interval *) + (elt >= n.x && elt <= n.y) + || + (* or search left or search right *) + if elt < n.x then mem elt n.l else mem elt n.r + + let rec min_elt = function + | Empty -> + raise Not_found + | Node {x; y; l= Empty; _} -> + (x, y) + | Node {l; _} -> + min_elt l + + let rec max_elt = function + | Empty -> + raise Not_found + | Node {x; y; r= Empty; _} -> + (x, y) + | Node {r; _} -> + max_elt r + + let choose = function Empty -> raise Not_found | Node {x; y; _} -> (x, y) + + (* fold over the maximal contiguous intervals *) + let rec fold f t acc = + match t with + | Empty -> + acc + | Node n -> + let acc = fold f n.l acc in + let acc = f (n.x, n.y) acc in + fold f n.r acc + + let rec fold_s f t acc = + match t with + | Empty -> + Lwt.return acc + | Node n -> + let open Lwt.Infix in + fold_s f n.l acc >>= fun acc -> + f (n.x, n.y) acc >>= fun acc -> fold_s f n.r acc + + (* fold over individual elements *) + let fold_individual f t acc = + let range (from, upto) acc = + let rec loop acc x = + if eq x (succ upto) then acc else loop (f x acc) (succ x) + in + loop acc from + in + fold range t acc + + let elements t = fold_individual (fun x acc -> x :: acc) t [] |> List.rev + + (* return (x, y, l) where (x, y) is the maximal interval and [l] is + the rest of the tree on the left (whose intervals are all smaller). *) + let rec splitMax = function + | {x; y; l; r= Empty; _} -> + (x, y, l) + | {r= Node r; _} as n -> + let u, v, r' = splitMax r in + (u, v, node n.x n.y n.l r') + + (* return (x, y, r) where (x, y) is the minimal interval and [r] is + the rest of the tree on the right (whose intervals are all larger) *) + let rec splitMin = function + | {x; y; l= Empty; r; _} -> + (x, y, r) + | {l= Node l; _} as n -> + let u, v, l' = splitMin l in + (u, v, node n.x n.y l' n.r) + + let addL = function + | {l= Empty; _} as n -> + n + | {l= Node l; _} as n -> + (* we might have to merge the new element with the maximal interval from + the left *) + let x', y', l' = splitMax l in + if eq (succ y') n.x then {n with x= x'; l= l'} else n + + let addR = function + | {r= Empty; _} as n -> + n + | {r= Node r; _} as n -> + (* we might have to merge the new element with the minimal interval on + the right *) + let x', y', r' = splitMin r in + if eq (succ n.y) x' then {n with y= y'; r= r'} else n + + let rec add (x, y) t = + if y < x then invalid_arg "interval reversed" ; + match t with + | Empty -> + node x y Empty Empty + (* completely to the left *) + | Node n when y < Elt.pred n.x -> + let l = add (x, y) n.l in + node n.x n.y l n.r + (* completely to the right *) + | Node n when Elt.succ n.y < x -> + let r = add (x, y) n.r in + node n.x n.y n.l r + (* overlap on the left only *) + | Node n when x < n.x && y <= n.y -> + let l = add (x, pred n.x) n.l in + let n = addL {n with l} in + node n.x n.y n.l n.r + (* overlap on the right only *) + | Node n when y > n.y && x >= n.x -> + let r = add (succ n.y, y) n.r in + let n = addR {n with r} in + node n.x n.y n.l n.r + (* overlap on both sides *) + | Node n when x < n.x && y > n.y -> + let l = add (x, pred n.x) n.l in + let r = add (succ n.y, y) n.r in + let n = addL {(addR {n with r}) with l} in + node n.x n.y n.l n.r + (* completely within *) + | Node n -> + Node n + + let union a b = + let a' = cardinal a and b' = cardinal b in + if a' > b' then + fold add b a + else + fold add a b + + let merge l r = + match (l, r) with + | l, Empty -> + l + | Empty, r -> + r + | Node l, r -> + let x, y, l' = splitMax l in + node x y l' r + + let rec remove (x, y) t = + if y < x then invalid_arg "interval reversed" ; + match t with + | Empty -> + Empty + (* completely to the left *) + | Node n when y < n.x -> + let l = remove (x, y) n.l in + node n.x n.y l n.r + (* completely to the right *) + | Node n when n.y < x -> + let r = remove (x, y) n.r in + node n.x n.y n.l r + (* overlap on the left only *) + | Node n when x < n.x && y < n.y -> + let n' = node (succ y) n.y n.l n.r in + remove (x, pred n.x) n' + (* overlap on the right only *) + | Node n when y > n.y && x > n.x -> + let n' = node n.x (pred x) n.l n.r in + remove (succ n.y, y) n' + (* overlap on both sides *) + | Node n when x <= n.x && y >= n.y -> + let l = remove (x, n.x) n.l in + let r = remove (n.y, y) n.r in + merge l r + (* completely within *) + | Node n when eq y n.y -> + node n.x (pred x) n.l n.r + | Node n when eq x n.x -> + node (succ y) n.y n.l n.r + | Node n -> + assert (n.x <= pred x) ; + assert (succ y <= n.y) ; + let r = node (succ y) n.y Empty n.r in + node n.x (pred x) n.l r + + let diff a b = fold remove b a + + let inter a b = diff a (diff a b) + + let take t n = + let rec loop acc free n = + if n = Elt.zero then + Some (acc, free) + else + match + try + let i = choose free in + let x, y = Interval.(x i, y i) in + let len = Elt.(succ @@ (y - x)) in + let will_use = if Stdlib.(Elt.compare n len < 0) then n else len in + let i' = Interval.make x Elt.(pred @@ (x + will_use)) in + Some (add i' acc, remove i' free, Elt.(n - will_use)) + with Not_found -> None + with + | Some (acc', free', n') -> + loop acc' free' n' + | None -> + None + in + loop empty t n +end + +module Int = struct + type t = int [@@deriving sexp] + + let compare (x : t) (y : t) = Stdlib.compare x y + + let zero = 0 + + let succ x = x + 1 + + let pred x = x - 1 + + let add x y = x + y + + let sub x y = x - y +end + +module IntDiet = Make (Int) +module IntSet = Set.Make (Int) + +module Test = struct + let check_depth n = + let init = IntDiet.add (IntDiet.Interval.make 0 n) IntDiet.empty in + (* take away every other block *) + let rec sub m acc = + (* Printf.printf "acc = %s\n%!" (IntDiet.to_string_internal acc); *) + if m <= 0 then + acc + else + sub (m - 2) IntDiet.(remove (Interval.make m m) acc) + in + let set = sub n init in + let d = IntDiet.height set in + if d > int_of_float (log (float_of_int n) /. log 2.) + 1 then + failwith "Depth larger than expected" ; + let set = sub (n - 1) set in + let d = IntDiet.height set in + assert (d == 1) + + let make_random n m = + let rec loop set diet = function + | 0 -> + (set, diet) + | m -> + let r = Random.int n in + let r' = Random.int (n - r) + r in + let add = Random.bool () in + let rec range from upto = + if from > upto then [] else from :: range (from + 1) upto + in + let set = + List.fold_left + (fun set elt -> + (if add then IntSet.add else IntSet.remove) elt set + ) + set (range r r') + in + let diet' = + (if add then IntDiet.add else IntDiet.remove) + (IntDiet.Interval.make r r') + diet + in + ( try IntDiet.Invariant.check diet' + with e -> + Printf.fprintf stderr "%s %d\nBefore: %s\nAfter: %s\n" + (if add then "Add" else "Remove") + r + (IntDiet.to_string_internal diet) + (IntDiet.to_string_internal diet') ; + raise e + ) ; + loop set diet' (m - 1) + in + loop IntSet.empty IntDiet.empty m + (* + let set_to_string set = + String.concat "; " @@ List.map string_of_int @@ IntSet.elements set + let diet_to_string diet = + String.concat "; " @@ List.map string_of_int @@ IntDiet.elements diet + *) + + let check_equals set diet = + let set' = IntSet.elements set in + let diet' = IntDiet.elements diet in + if set' <> diet' then + (* + Printf.fprintf stderr "Set contains: [ %s ]\n" @@ set_to_string set; + Printf.fprintf stderr "Diet contains: [ %s ]\n" @@ diet_to_string diet; + *) + failwith "check_equals" + + let test_adds () = + for _ = 1 to 100 do + let set, diet = make_random 1000 1000 in + ( try IntDiet.Invariant.check diet + with e -> + (* + Printf.fprintf stderr "Diet contains: [ %s ]\n" @@ IntDiet.to_string_internal diet; + *) + raise e + ) ; + check_equals set diet + done + + let test_operator set_op diet_op () = + for _ = 1 to 100 do + let set1, diet1 = make_random 1000 1000 in + let set2, diet2 = make_random 1000 1000 in + check_equals set1 diet1 ; + check_equals set2 diet2 ; + let set3 = set_op set1 set2 in + let diet3 = diet_op diet1 diet2 in + (* + Printf.fprintf stderr "diet1 = %s\n" (IntDiet.to_string_internal diet1); + Printf.fprintf stderr "diet3 = %s\n" (IntDiet.to_string_internal diet2); + Printf.fprintf stderr "diet2 = %s\n" (IntDiet.to_string_internal diet3); + *) + check_equals set3 diet3 + done + + let test_add_1 () = + let open IntDiet in + assert (elements @@ add (3, 4) @@ add (3, 3) empty = [3; 4]) + + let test_remove_1 () = + let open IntDiet in + assert (elements @@ remove (6, 7) @@ add (7, 8) empty = [8]) + + let test_remove_2 () = + let open IntDiet in + assert ( + elements @@ diff (add (9, 9) @@ add (5, 7) empty) (add (7, 9) empty) + = [5; 6] + ) + + let test_adjacent_1 () = + let open IntDiet in + let set = add (9, 9) @@ add (8, 8) empty in + IntDiet.Invariant.check set + + let test_depth () = check_depth 1048576 + + let all = + [ + ("adding an element to the right", test_add_1) + ; ("removing an element on the left", test_remove_1) + ; ("removing an elements from two intervals", test_remove_2) + ; ("test adjacent intervals are coalesced", test_adjacent_1) + ; ("logarithmic depth", test_depth) + ; ("adding and removing elements acts like a Set", test_adds) + ; ("union", test_operator IntSet.union IntDiet.union) + ; ("diff", test_operator IntSet.diff IntDiet.diff) + ; ("intersection", test_operator IntSet.inter IntDiet.inter) + ] +end diff --git a/ocaml/qcow-tool/lib/qcow_diet.mli b/ocaml/qcow-tool/lib/qcow_diet.mli new file mode 100644 index 00000000000..9ad96b1831f --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_diet.mli @@ -0,0 +1,44 @@ +(* + * Copyright (C) 2016 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +module type ELT = sig + (** The type of the set elements. *) + type t [@@deriving sexp] + + include Set.OrderedType with type t := t + + val zero : t + (** The zeroth element *) + + val pred : t -> t + (** Predecessor of an element *) + + val succ : t -> t + (** Successor of an element *) + + val sub : t -> t -> t + (** [sub a b] returns [a] - [b] *) + + val add : t -> t -> t + (** [add a b] returns [a] + [b] *) +end + +module Make (Elt : ELT) : Qcow_s.INTERVAL_SET with type elt = Elt.t + +module Test : sig + val all : (string * (unit -> unit)) list +end diff --git a/ocaml/qcow-tool/lib/qcow_error.ml b/ocaml/qcow-tool/lib/qcow_error.ml new file mode 100644 index 00000000000..a1752aa773e --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_error.ml @@ -0,0 +1,113 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Result + +type error = [`Msg of string] + +type 'a t = ('a, error) result + +let return x = Ok x + +let error_msg fmt = Printf.ksprintf (fun s -> Error (`Msg s)) fmt + +let ( >>= ) m f = match m with Error x -> Error x | Ok x -> f x + +let rec any = function + | [] -> + Ok () + | Error e :: _ -> + Error e + | _ :: rest -> + any rest + +module Lwt_error = struct + open Lwt.Infix + + module Infix = struct + let ( >>= ) m f = + m >>= function + | Ok x -> + f x + | Error (`Msg s) -> + Lwt.return (Error (`Msg s)) + | Error `Disconnected -> + Lwt.return (Error `Disconnected) + end + + let or_fail_with m = + let open Lwt in + m >>= function + | Error (`Msg s) -> + Lwt.fail_with s + | Error `Disconnected -> + Lwt.fail_with "disconnected" + | Ok x -> + Lwt.return x + + module List = struct + let map_p f xs = + let threads = List.map f xs in + Lwt_list.fold_left_s + (fun acc t -> + t >>= fun x -> + match (acc, x) with + | Error e, _ -> + Lwt.return (Error e) + | _, Error e -> + Lwt.return (Error e) + | Ok acc, Ok x -> + Lwt.return (Ok (x :: acc)) + ) + (Ok []) threads + >>= function + | Error e -> + Lwt.return (Error e) + | Ok xs -> + Lwt.return (Ok (List.rev xs)) + end +end + +module Lwt_write_error = struct + module Infix = struct + open Lwt.Infix + + let ( >>= ) m f = + m >>= function + | Ok x -> + f x + | Error (`Msg s) -> + Lwt.return (Error (`Msg s)) + | Error `Is_read_only -> + Lwt.return (Error `Is_read_only) + | Error `Disconnected -> + Lwt.return (Error `Disconnected) + end + + let or_fail_with m = + let open Lwt in + m >>= function + | Error (`Msg s) -> + Lwt.fail_with s + | Error `Is_read_only -> + Lwt.fail_with "is read only" + | Error `Disconnected -> + Lwt.fail_with "disconnected" + | Ok x -> + Lwt.return x +end + +exception Duplicate_reference of (int64 * int) * (int64 * int) * int64 diff --git a/ocaml/qcow-tool/lib/qcow_error.mli b/ocaml/qcow-tool/lib/qcow_error.mli new file mode 100644 index 00000000000..013166725aa --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_error.mli @@ -0,0 +1,74 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +(** Common error reporting functions *) + +open Result + +type error = + [`Msg of string (** A fatal error condition; the string should be logged *)] + +type 'a t = ('a, error) result + +val return : 'a -> ('a, error) result + +val error_msg : + ('a, unit, string, ('b, [> `Msg of string]) result) format4 -> 'a + +val ( >>= ) : ('a, 'b) result -> ('a -> ('c, 'b) result) -> ('c, 'b) result + +val any : (unit, 'b) result list -> (unit, 'b) result + +module Lwt_error : sig + module Infix : sig + val ( >>= ) : + ('a, [< `Disconnected | `Msg of 'b]) result Lwt.t + -> ('a -> ('c, ([> `Disconnected | `Msg of 'b] as 'd)) result Lwt.t) + -> ('c, 'd) result Lwt.t + end + + val or_fail_with : + ('a, [< `Disconnected | `Msg of string]) result Lwt.t -> 'a Lwt.t + + module List : sig + val map_p : + ('a -> ('b, 'error) result Lwt.t) + -> 'a list + -> ('b list, 'error) result Lwt.t + (** [map_p f xs] computes [f x] where [x \in xs] concurrently and returns + a list of successful results or the first error encountered. All threads + will have terminated by the time the function returns. *) + end +end + +module Lwt_write_error : sig + module Infix : sig + val ( >>= ) : + ('a, [< `Disconnected | `Is_read_only | `Msg of 'b]) result Lwt.t + -> ( 'a + -> ('c, ([> `Disconnected | `Is_read_only | `Msg of 'b] as 'd)) result + Lwt.t + ) + -> ('c, 'd) result Lwt.t + end + + val or_fail_with : + ('a, [< `Disconnected | `Is_read_only | `Msg of string]) result Lwt.t + -> 'a Lwt.t +end + +exception Duplicate_reference of (int64 * int) * (int64 * int) * int64 diff --git a/ocaml/qcow-tool/lib/qcow_header.ml b/ocaml/qcow-tool/lib/qcow_header.ml new file mode 100644 index 00000000000..e58f497c9d6 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_header.ml @@ -0,0 +1,463 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Sexplib.Std +open Astring +open Result +open Qcow_error +module OldInt64 = Int64 +open Qcow_types +module Physical = Qcow_physical + +let ( <| ) = OldInt64.shift_left + +let ( |> ) = OldInt64.shift_right_logical + +module Version = struct + type t = [`One | `Two | `Three] [@@deriving sexp] + + let sizeof _ = 4 + + let write t rest = + Int32.write (match t with `One -> 1l | `Two -> 2l | `Three -> 3l) rest + + let read rest = + Int32.read rest >>= fun (version, rest) -> + match version with + | 1l -> + return (`One, rest) + | 2l -> + return (`Two, rest) + | 3l -> + return (`Three, rest) + | _ -> + error_msg "Unknown version: %ld" version + + let compare (a : t) (b : t) = Stdlib.compare a b +end + +module CryptMethod = struct + type t = [`Aes | `None] [@@deriving sexp] + + let sizeof _ = 4 + + let write t rest = Int32.write (match t with `Aes -> 1l | `None -> 0l) rest + + let read rest = + Int32.read rest >>= fun (m, rest) -> + match m with + | 0l -> + return (`None, rest) + | 1l -> + return (`Aes, rest) + | _ -> + error_msg "Unknown crypt_method: %ld" m + + let compare (a : t) (b : t) = Stdlib.compare a b +end + +module Feature = struct + type ty = [`Incompatible | `Compatible | `Autoclear] [@@deriving sexp] + + type feature = [`Corrupt | `Dirty | `Lazy_refcounts | `Unknown of string] + [@@deriving sexp] + + type t = {ty: ty; bit: int; feature: feature} [@@deriving sexp] + + let understood = + [ + {ty= `Incompatible; bit= 0; feature= `Dirty} + ; {ty= `Incompatible; bit= 1; feature= `Corrupt} + ; {ty= `Compatible; bit= 0; feature= `Lazy_refcounts} + ] + + let sizeof _ = 48 + + let write t rest = + Int8.write + (match t.ty with `Incompatible -> 0 | `Compatible -> 1 | `Autoclear -> 2) + rest + >>= fun rest -> + Int8.write t.bit rest >>= fun rest -> + let str = + match t.feature with + | `Corrupt -> + "corrupt bit" + | `Dirty -> + "dirty bit" + | `Lazy_refcounts -> + "lazy refcounts" + | `Unknown x -> + x + in + Cstruct.(memset (sub rest 0 46) 0) ; + Cstruct.blit_from_string str 0 rest 0 (String.length str) ; + Result.Ok (Cstruct.shift rest 46) + + let read rest = + Int8.read rest >>= fun (ty, rest) -> + ( match ty with + | 0 -> + Ok `Incompatible + | 1 -> + Ok `Compatible + | 2 -> + Ok `Autoclear + | n -> + error_msg "Unknown header extension type %d" n + ) + >>= fun ty -> + Int8.read rest >>= fun (bit, rest) -> + let feature = + String.trim ~drop:(fun c -> c = '\000') Cstruct.(to_string (sub rest 0 46)) + in + let feature = + match feature with + | "corrupt bit" -> + `Corrupt + | "dirty bit" -> + `Dirty + | "lazy refcounts" -> + `Lazy_refcounts + | x -> + `Unknown x + in + Ok ({ty; bit; feature}, Cstruct.shift rest 46) + + let read_all rest = + let rec loop acc rest = + if Cstruct.length rest = 0 then + Ok (List.rev acc) + else if Cstruct.length rest < 48 then + error_msg "Trailing garbage in feature area: %s" + (String.Ascii.escape (Cstruct.to_string rest)) + else + read rest >>= fun (first, rest) -> loop (first :: acc) rest + in + loop [] rest + + let write_all ts rest = + let rec loop rest = function + | [] -> + Ok rest + | t :: ts -> + write t rest >>= fun rest -> loop rest ts + in + loop rest ts +end + +type offset = int64 [@@deriving sexp] + +type extension = + [ `Unknown of int32 * string + | `Backing_file of string + | `Feature_name_table of Feature.t list ] +[@@deriving sexp] + +type additional = { + dirty: bool + ; corrupt: bool + ; lazy_refcounts: bool + ; autoclear_features: int64 + ; refcount_order: int32 +} +[@@deriving sexp] + +type t = { + version: Version.t + ; backing_file_offset: offset + ; backing_file_size: int32 + ; cluster_bits: int32 + ; size: int64 + ; crypt_method: CryptMethod.t + ; l1_size: int32 + ; l1_table_offset: Physical.t + ; refcount_table_offset: Physical.t + ; refcount_table_clusters: int32 + ; nb_snapshots: int32 + ; snapshots_offset: offset + ; additional: additional option + ; extensions: extension list +} +[@@deriving sexp] + +let compare (a : t) (b : t) = Stdlib.compare a b + +let to_string t = Sexplib.Sexp.to_string_hum (sexp_of_t t) + +let sizeof t = + let base = 4 + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 8 + 8 + 4 + 4 + 8 in + let additional = + match t.additional with None -> 0 | Some _ -> 8 + 8 + 8 + 4 + 4 + in + let unpadded_sizeof_extension = function + | `Unknown (_, data) -> + String.length data + | `Backing_file data -> + String.length data + | `Feature_name_table features -> + List.fold_left ( + ) (4 + 4) (List.map Feature.sizeof features) + in + let pad_to_8 x = if x mod 8 = 0 then x else x + (8 - (x mod 8)) in + let extensions = + List.( + fold_left ( + ) 0 + (map (fun x -> pad_to_8 @@ unpadded_sizeof_extension x) t.extensions) + ) + in + base + additional + extensions + +let write t rest = + let initial_buffer_length = Cstruct.length rest in + big_enough_for "Header" rest (sizeof t) >>= fun () -> + Int8.write (int_of_char 'Q') rest >>= fun rest -> + Int8.write (int_of_char 'F') rest >>= fun rest -> + Int8.write (int_of_char 'I') rest >>= fun rest -> + Int8.write 0xfb rest >>= fun rest -> + Version.write t.version rest >>= fun rest -> + Int64.write t.backing_file_offset rest >>= fun rest -> + Int32.write t.backing_file_size rest >>= fun rest -> + Int32.write t.cluster_bits rest >>= fun rest -> + Int64.write t.size rest >>= fun rest -> + CryptMethod.write t.crypt_method rest >>= fun rest -> + Int32.write t.l1_size rest >>= fun rest -> + Int64.write (Int64.of_int @@ Physical.to_bytes t.l1_table_offset) rest + >>= fun rest -> + Int64.write (Int64.of_int @@ Physical.to_bytes t.refcount_table_offset) rest + >>= fun rest -> + Int32.write t.refcount_table_clusters rest >>= fun rest -> + Int32.write t.nb_snapshots rest >>= fun rest -> + Int64.write t.snapshots_offset rest >>= fun rest -> + match t.additional with + | None -> + return rest + | Some e -> + let incompatible_features = + let bits = + [ + (if e.dirty then 1L <| 0 else 0L) + ; (if e.corrupt then 1L <| 1 else 0L) + ] + in + List.fold_left Int64.logor 0L bits + in + Int64.write incompatible_features rest >>= fun rest -> + let compatible_features = + let bits = [(if e.lazy_refcounts then 1L <| 0 else 0L)] in + List.fold_left Int64.logor 0L bits + in + Int64.write compatible_features rest >>= fun rest -> + Int64.write e.autoclear_features rest >>= fun rest -> + Int32.write e.refcount_order rest >>= fun rest -> + (* The extensions are not counted in the header_length *) + let header_length = + Int32.of_int (4 + initial_buffer_length - Cstruct.length rest) + in + Int32.write header_length rest >>= fun rest -> + let write_extension rest = function + | `Unknown (kind, data) -> + Int32.write kind rest >>= fun rest -> + let length = String.length data in + Int32.write (Int32.of_int length) rest >>= fun rest -> + Cstruct.blit_from_string data 0 rest 0 length ; + Ok (Cstruct.shift rest (String.length data)) + | `Backing_file filename -> + Int32.write 0xE2792ACAl rest >>= fun rest -> + let length = String.length filename in + Int32.write (Int32.of_int length) rest >>= fun rest -> + Cstruct.blit_from_string filename 0 rest 0 length ; + Ok (Cstruct.shift rest (String.length filename)) + | `Feature_name_table fs -> + let length = List.fold_left ( + ) 0 (List.map Feature.sizeof fs) in + Int32.write 0x6803f857l rest >>= fun rest -> + Int32.write (Int32.of_int length) rest >>= fun rest -> + Feature.write_all fs rest + in + let rec loop rest = function + | [] -> + Int32.write 0l rest + | e :: es -> + write_extension rest e >>= fun rest -> loop rest es + in + loop rest t.extensions + +let read rest = + Int8.read rest >>= fun (x, rest) -> + ( if char_of_int x = 'Q' then + return rest + else + error_msg "Expected magic: got %02x" x + ) + >>= fun rest -> + Int8.read rest >>= fun (x, rest) -> + ( if char_of_int x = 'F' then + return rest + else + error_msg "Expected magic: got %02x" x + ) + >>= fun rest -> + Int8.read rest >>= fun (x, rest) -> + ( if char_of_int x = 'I' then + return rest + else + error_msg "Expected magic: got %02x" x + ) + >>= fun rest -> + Int8.read rest >>= fun (x, rest) -> + ( if x = 0xfb then + return rest + else + error_msg "Expected magic: got %02x" x + ) + >>= fun rest -> + Version.read rest >>= fun (version, rest) -> + Int64.read rest >>= fun (backing_file_offset, rest) -> + Int32.read rest >>= fun (backing_file_size, rest) -> + Int32.read rest >>= fun (cluster_bits, rest) -> + Int64.read rest >>= fun (size, rest) -> + CryptMethod.read rest >>= fun (crypt_method, rest) -> + Int32.read rest >>= fun (l1_size, rest) -> + let l1_table_offset = Physical.read rest in + let rest = Cstruct.shift rest 8 in + let refcount_table_offset = Physical.read rest in + let rest = Cstruct.shift rest 8 in + Int32.read rest >>= fun (refcount_table_clusters, rest) -> + Int32.read rest >>= fun (nb_snapshots, rest) -> + Int64.read rest >>= fun (snapshots_offset, rest) -> + ( match version with + | `One | `Two -> + return (None, [], 72, rest) + | _ -> + Int64.read rest >>= fun (incompatible_features, rest) -> + let dirty = Int64.logand 1L (incompatible_features |> 0) = 1L in + let corrupt = Int64.logand 1L (incompatible_features |> 1) = 1L in + ( if incompatible_features |> 2 <> 0L then + error_msg "unknown incompatible_features set: 0x%Lx" + incompatible_features + else + return () + ) + >>= fun () -> + Int64.read rest >>= fun (compatible_features, rest) -> + let lazy_refcounts = Int64.logand 1L (compatible_features |> 0) = 1L in + Int64.read rest >>= fun (autoclear_features, rest) -> + ( if autoclear_features <> 0L then + error_msg "dealing with autoclear_features not implemented" + else + return () + ) + >>= fun () -> + Int32.read rest >>= fun (refcount_order, rest) -> + Int32.read rest >>= fun (header_length, rest) -> + let rec read_lowlevel rest = + Int32.read rest >>= fun (kind, rest) -> + if kind = 0l then + return ([], rest) + else + Int32.read rest >>= fun (len, rest) -> + let len = Int32.to_int len in + let payload = Cstruct.sub rest 0 len in + let rest = Cstruct.shift rest len in + let padding_length = if len mod 8 = 0 then 0 else 8 - (len mod 8) in + let rest = Cstruct.shift rest padding_length in + read_lowlevel rest >>= fun (extensions, rest) -> + return ((kind, payload) :: extensions, rest) + in + let parse_extension (kind, payload) = + match kind with + | 0xE2792ACAl -> + Ok (`Backing_file (Cstruct.to_string payload)) + | 0x6803f857l -> + Feature.read_all payload >>= fun features -> + Ok (`Feature_name_table features) + | _ -> + Ok (`Unknown (kind, Cstruct.to_string payload)) + in + read_lowlevel rest >>= fun (e, rest) -> + List.fold_left + (fun acc x -> + acc >>= fun acc -> + parse_extension x >>= fun extension -> Ok (extension :: acc) + ) + (Ok []) e + >>= fun extensions -> + let header_length = Int32.to_int header_length in + return + ( Some + {dirty; corrupt; lazy_refcounts; autoclear_features; refcount_order} + , extensions + , header_length + , rest + ) + ) + >>= fun (additional, extensions, header_length, rest) -> + let t = + { + version + ; backing_file_offset + ; backing_file_size + ; cluster_bits + ; size + ; crypt_method + ; l1_size + ; l1_table_offset + ; refcount_table_offset + ; refcount_table_clusters + ; nb_snapshots + ; snapshots_offset + ; additional + ; extensions + } + in + (* qemu excludes extensions from the header_length *) + if sizeof {t with extensions= []} <> header_length then + error_msg "Read a header_length of %d but we computed %d" header_length + (sizeof t) + else + return (t, rest) + +let refcounts_per_cluster t = + let cluster_bits = Int32.to_int t.cluster_bits in + let cluster_size = 1L <| cluster_bits in + (* Each reference count is 2 bytes long *) + OldInt64.div cluster_size 2L + +let max_refcount_table_size t = + let cluster_bits = Int32.to_int t.cluster_bits in + let size = t.size in + let cluster_size = 1L <| cluster_bits in + let refs_per_cluster = refcounts_per_cluster t in + let size_in_clusters = + OldInt64.div (Int64.round_up size cluster_size) cluster_size + in + let refs_clusters_required = + OldInt64.div + (Int64.round_up size_in_clusters refs_per_cluster) + refs_per_cluster + in + (* Each cluster containing references consumes 8 bytes in the + refcount_table. How much space is that? *) + let refcount_table_bytes = OldInt64.mul refs_clusters_required 8L in + OldInt64.div (Int64.round_up refcount_table_bytes cluster_size) cluster_size + +let l2_tables_required ~cluster_bits size = + (* The L2 table is of size (1L <| cluster_bits) bytes + and contains (1L <| (cluster_bits - 3)) 8-byte pointers. + A single L2 table therefore manages + (1L <| (cluster_bits - 3)) * (1L <| cluster_bits) bytes + = (1L <| (2 * cluster_bits - 3)) bytes. *) + let bytes_per_l2 = 1L <| (2 * cluster_bits) - 3 in + Int64.div (Int64.round_up size bytes_per_l2) bytes_per_l2 diff --git a/ocaml/qcow-tool/lib/qcow_header.mli b/ocaml/qcow-tool/lib/qcow_header.mli new file mode 100644 index 00000000000..e22fc816873 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_header.mli @@ -0,0 +1,99 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +module Version : sig + type t = [`One | `Two | `Three] [@@deriving sexp] + + include Qcow_s.SERIALISABLE with type t := t + + val compare : t -> t -> int +end + +module CryptMethod : sig + type t = [`Aes | `None] [@@deriving sexp] + + include Qcow_s.SERIALISABLE with type t := t + + val compare : t -> t -> int +end + +module Feature : sig + type ty = [`Incompatible | `Compatible | `Autoclear] + + type feature = [`Corrupt | `Dirty | `Lazy_refcounts | `Unknown of string] + + type t = {ty: ty; bit: int; feature: feature} + + val understood : t list + (** The features understood by this implementation *) + + include Qcow_s.SERIALISABLE with type t := t +end + +(** Offset within the image *) +type offset = int64 + +type extension = + [ `Unknown of int32 * string + | `Backing_file of string + | `Feature_name_table of Feature.t list ] +[@@deriving sexp] + +(** Version 3 and above have additional header fields *) +type additional = { + dirty: bool + ; corrupt: bool + ; lazy_refcounts: bool + ; autoclear_features: int64 + ; refcount_order: int32 +} +[@@deriving sexp] + +(** The qcow2 header *) +type t = { + version: Version.t + ; backing_file_offset: offset (** offset of the backing file path *) + ; backing_file_size: int32 (** length of the backing file path *) + ; cluster_bits: int32 (** a cluster is 2 ** cluster_bits in size *) + ; size: int64 (** virtual size of the image *) + ; crypt_method: CryptMethod.t + ; l1_size: int32 (** number of 8-byte entries in the L1 table *) + ; l1_table_offset: Qcow_physical.t (** offset of the L1 table *) + ; refcount_table_offset: Qcow_physical.t (** offset of the refcount table *) + ; refcount_table_clusters: int32 + (** size of the refcount table in clusters *) + ; nb_snapshots: int32 (** the number of internal snapshots *) + ; snapshots_offset: offset (** offset of the snapshot header *) + ; additional: additional option (** for version 3 or higher *) + ; extensions: extension list (** for version 3 or higher *) +} +[@@deriving sexp] + +val refcounts_per_cluster : t -> int64 +(** The number of 16-bit reference counts per cluster *) + +val max_refcount_table_size : t -> int64 +(** Compute the maximum size of the refcount table *) + +val l2_tables_required : cluster_bits:int -> int64 -> int64 +(** Compute the number of L2 tables required for this size of image *) + +include Qcow_s.SERIALISABLE with type t := t + +include Qcow_s.PRINTABLE with type t := t + +include Set.OrderedType with type t := t diff --git a/ocaml/qcow-tool/lib/qcow_int.ml b/ocaml/qcow-tool/lib/qcow_int.ml new file mode 100644 index 00000000000..733169623bf --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_int.ml @@ -0,0 +1,61 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Sexplib.Std + +module M = struct + type t = int [@@deriving sexp] + + let zero = 0 + + let succ x = x + 1 + + let pred x = x - 1 + + let add x y = x + y + + let sub x y = x - y + + let compare (x : t) (y : t) = Stdlib.compare x y + + let mul x y = x * y + + let div x y = x / y + + let to_int64 = Int64.of_int + + let of_int64 = Int64.to_int + + let to_int x = x + + let of_int x = x + + let to_string = string_of_int + + let shift_left x n = x lsl n + + let shift_right_logical x n = x lsr n + + let logor x y = x lor y + + let rem x y = x mod y +end + +module IntervalSet = Qcow_diet.Make (M) +module Map = Map.Make (M) +include M + +let round_up x size = mul (div (add x (pred size)) size) size diff --git a/ocaml/qcow-tool/lib/qcow_int.mli b/ocaml/qcow-tool/lib/qcow_int.mli new file mode 100644 index 00000000000..e3e37d69fdc --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_int.mli @@ -0,0 +1,32 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +(** Parsers and printers for types used in qcow2 fields *) + +type t = int [@@deriving sexp] + +include Qcow_s.NUM with type t := t + +val of_int64 : int64 -> t + +val to_int64 : t -> int64 + +val round_up : t -> t -> t +(** [round_up value to] rounds [value] to the next multiple of [to] *) + +module IntervalSet : Qcow_s.INTERVAL_SET with type elt = t + +module Map : Map.S with type key = t diff --git a/ocaml/qcow-tool/lib/qcow_int64.ml b/ocaml/qcow-tool/lib/qcow_int64.ml new file mode 100644 index 00000000000..03c89e78620 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_int64.ml @@ -0,0 +1,58 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Sexplib.Std +open Qcow_error + +let big_enough_for name buf needed = + let length = Cstruct.length buf in + if length < needed then + error_msg "%s: buffer too small (%d < %d)" name length needed + else + return () + +module M = struct + include Int64 + + type _t = int64 [@@deriving sexp] + + let sexp_of_t = sexp_of__t + + let t_of_sexp = _t_of_sexp + + let to_int64 x = x + + let of_int64 x = x +end + +module IntervalSet = Qcow_diet.Make (M) +module Map = Map.Make (M) +include M + +let round_up x size = mul (div (add x (pred size)) size) size + +let round_down x size = mul (div x size) size + +let sizeof _ = 8 + +let read buf = + big_enough_for "Int64.read" buf 8 >>= fun () -> + return (Cstruct.BE.get_uint64 buf 0, Cstruct.shift buf 8) + +let write t buf = + big_enough_for "Int64.read" buf 8 >>= fun () -> + Cstruct.BE.set_uint64 buf 0 t ; + return (Cstruct.shift buf 8) diff --git a/ocaml/qcow-tool/lib/qcow_int64.mli b/ocaml/qcow-tool/lib/qcow_int64.mli new file mode 100644 index 00000000000..82229f78d33 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_int64.mli @@ -0,0 +1,41 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +(** Parsers and printers for types used in qcow2 fields *) + +open Sexplib + +include module type of Int64 + +val t_of_sexp : Sexp.t -> t + +val sexp_of_t : t -> Sexp.t + +val of_int64 : int64 -> t + +val to_int64 : t -> int64 + +val round_up : int64 -> int64 -> int64 +(** [round_up value to] rounds [value] to the next multiple of [to] *) + +val round_down : int64 -> int64 -> int64 +(** [round_down value to] rounds [value] down to the multiple of [to] *) + +module IntervalSet : Qcow_s.INTERVAL_SET with type elt = t + +module Map : Map.S with type key = t + +include Qcow_s.SERIALISABLE with type t := t diff --git a/ocaml/qcow-tool/lib/qcow_locks.ml b/ocaml/qcow-tool/lib/qcow_locks.ml new file mode 100644 index 00000000000..a949ed92dbb --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_locks.ml @@ -0,0 +1,137 @@ +(* + * Copyright (C) 2017 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Qcow_types + +let src = + let src = Logs.Src.create "qcow" ~doc:"qcow2-formatted BLOCK device" in + Logs.Src.set_level src (Some Logs.Info) ; + src + +module Log = (val Logs.src_log src : Logs.LOG) + +type t = { + mutable locks: (Qcow_rwlock.t * int) Cluster.Map.t + ; metadata_m: Lwt_mutex.t (** held during metadata changing operations *) +} + +module Client = Qcow_rwlock.Client + +let make () = + let locks = Cluster.Map.empty in + let metadata_m = Lwt_mutex.create () in + {locks; metadata_m} + +let with_metadata_lock t = Lwt_mutex.with_lock t.metadata_m + +let get_lock t cluster = + let lock, refcount = + if Cluster.Map.mem cluster t.locks then + Cluster.Map.find cluster t.locks + else + ( Qcow_rwlock.make (fun () -> + Printf.sprintf "cluster %s" (Cluster.to_string cluster) + ) + , 0 + ) + in + t.locks <- Cluster.Map.add cluster (lock, refcount + 1) t.locks ; + lock + +let put_lock t cluster = + (* put_lock is always called after get_lock *) + assert (Cluster.Map.mem cluster t.locks) ; + let lock, refcount = Cluster.Map.find cluster t.locks in + t.locks <- + ( if refcount = 1 then + Cluster.Map.remove cluster t.locks + else + Cluster.Map.add cluster (lock, refcount - 1) t.locks + ) + +let with_rwlock t cluster f = + let lock = get_lock t cluster in + Lwt.finalize + (fun () -> f lock) + (fun () -> put_lock t cluster ; Lwt.return_unit) + +type lock = {lock: Qcow_rwlock.lock; t: t; cluster: Cluster.t} + +let unlock lock = + Qcow_rwlock.unlock lock.lock ; + put_lock lock.t lock.cluster + +module Read = struct + let with_lock ?client t cluster f = + with_rwlock t cluster (fun rw -> Qcow_rwlock.Read.with_lock ?client rw f) + + let with_locks ?client t ~first ~last f = + let rec loop n = + if n > last then + f () + else + with_rwlock t n (fun rw -> + Qcow_rwlock.Read.with_lock ?client rw (fun () -> + loop (Cluster.succ n) + ) + ) + in + loop first + + let lock ?client t cluster = + let lock = get_lock t cluster in + let open Lwt.Infix in + Qcow_rwlock.Read.lock ?client lock >>= fun lock -> + Lwt.return {lock; t; cluster} +end + +module Write = struct + let with_lock ?client t cluster f = + with_rwlock t cluster (fun rw -> Qcow_rwlock.Write.with_lock ?client rw f) + + let with_locks ?client t ~first ~last f = + let rec loop n = + if n > last then + f () + else + with_rwlock t n (fun rw -> + Qcow_rwlock.Write.with_lock ?client rw (fun () -> + loop (Cluster.succ n) + ) + ) + in + loop first + + let try_lock ?client t cluster = + let lock = get_lock t cluster in + match Qcow_rwlock.Write.try_lock ?client lock with + | None -> + put_lock t cluster ; None + | Some lock -> + let lock = {lock; t; cluster} in + Some lock +end + +module Debug = struct + include Qcow_rwlock.Debug + + let dump_state t = + let locks = List.map fst @@ List.map snd @@ Cluster.Map.bindings t.locks in + Log.info (fun f -> + f "%s" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ Qcow_rwlock.sexp_of_ts locks) + ) +end diff --git a/ocaml/qcow-tool/lib/qcow_locks.mli b/ocaml/qcow-tool/lib/qcow_locks.mli new file mode 100644 index 00000000000..68a53ee7056 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_locks.mli @@ -0,0 +1,95 @@ +(* + * Copyright (C) 2017 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Qcow_types + +(** A set of per-cluster read and write locks *) +type t + +val make : unit -> t +(** Create a set of locks *) + +(** A value which represents holding a lock *) +type lock + +val unlock : lock -> unit +(** [unlock lock] releases the lock. Note releasing the same lock more than + once will trigger a runtime failure. *) + +module Client : sig + (** An entity which holds a set of locks *) + type t + + val make : (unit -> string) -> t + (** [make describe_fn] creates an entity where [describe_fn ()] returns + a human-readable description of the client for use in debugging. *) +end + +module Read : sig + (** Non-exclusive read locks *) + + val with_lock : + ?client:Client.t -> t -> Cluster.t -> (unit -> 'a Lwt.t) -> 'a Lwt.t + (** [with_lock t f] executes [f ()] with the lock held for reading *) + + val with_locks : + ?client:Client.t + -> t + -> first:Cluster.t + -> last:Cluster.t + -> (unit -> 'a Lwt.t) + -> 'a Lwt.t + (** [with_locks t ~first ~last f] executes [f ()] with all clusters in the + interval [first .. last] inclusive locked for reading. *) + + val lock : ?client:Client.t -> t -> Cluster.t -> lock Lwt.t + (** [lock t cluster] acquire a non-exclusive read lock on [cluster]. The + resulting lock must be released by calling [unlock] *) +end + +module Write : sig + (** Exclusive write locks *) + + val with_lock : + ?client:Client.t -> t -> Cluster.t -> (unit -> 'a Lwt.t) -> 'a Lwt.t + (** [with_lock t f] executes [f ()] with the lock held for writing *) + + val with_locks : + ?client:Client.t + -> t + -> first:Cluster.t + -> last:Cluster.t + -> (unit -> 'a Lwt.t) + -> 'a Lwt.t + (** [with_locks t ~first ~last f] executes [f ()] with all clusters in the + interval [first .. last] inclusive locked for writing. *) + + val try_lock : ?client:Client.t -> t -> Cluster.t -> lock option + (** [try_lock ?client t cluster] returns a write lock on [cluster] if it can + be done without blocking, or returns None. *) +end + +val with_metadata_lock : t -> (unit -> 'a Lwt.t) -> 'a Lwt.t +(** [with_metadata_lock t f] executes [f ()] with the global metadata lock held. + This prevents metadata blocks from moving while they're being used. *) + +module Debug : sig + val assert_no_locks_held : Client.t -> unit + (** Check that all locks have been explicitly released. *) + + val dump_state : t -> unit + (** Write the cluster lock state to the logs for analysis *) +end diff --git a/ocaml/qcow-tool/lib/qcow_metadata.ml b/ocaml/qcow-tool/lib/qcow_metadata.ml new file mode 100644 index 00000000000..0baf827a885 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_metadata.ml @@ -0,0 +1,153 @@ +(* + * Copyright (C) 2017 Docker Inc + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +(** An in-memory cache of metadata clusters used to speed up lookups. + + Cache entries may be `read` or `update`d with a lock held to block + concurrent access. +*) + +open Qcow_types + +let src = + let src = Logs.Src.create "qcow" ~doc:"qcow2-formatted BLOCK device" in + Logs.Src.set_level src (Some Logs.Info) ; + src + +module Log = (val Logs.src_log src : Logs.LOG) + +module Lwt_error = Qcow_error.Lwt_error +module Lwt_write_error = Qcow_error.Lwt_write_error +module Cache = Qcow_cache +module Locks = Qcow_locks + +type error = [Mirage_block.error | `Msg of string] + +type write_error = [Mirage_block.write_error | `Msg of string] + +type t = { + cache: Cache.t + ; locks: Locks.t + ; mutable cluster_map: Qcow_cluster_map.t option (* free/ used space map *) + ; cluster_bits: int + ; m: Lwt_mutex.t + ; c: unit Lwt_condition.t +} + +type contents = {t: t; data: Cstruct.t; cluster: Cluster.t} + +module Refcounts = struct + type t = contents + + let of_contents x = x + + let get t n = Cstruct.BE.get_uint16 t.data (2 * n) + + let set t n v = Cstruct.BE.set_uint16 t.data (2 * n) v +end + +module Physical = struct + type t = contents + + let of_contents x = x + + let get t n = Qcow_physical.read (Cstruct.shift t.data (8 * n)) + + let set t n v = + ( match t.t.cluster_map with + | Some m -> + (* Find the block currently being referenced so it can be marked + as free. *) + let existing = Qcow_physical.read (Cstruct.shift t.data (8 * n)) in + let cluster = + Qcow_physical.cluster ~cluster_bits:t.t.cluster_bits existing + in + let v' = Qcow_physical.cluster ~cluster_bits:t.t.cluster_bits v in + Log.debug (fun f -> + f "Physical.set %s:%d -> %s%s" + (Cluster.to_string t.cluster) + n + ( if v = Qcow_physical.unmapped then + "unmapped" + else + Cluster.to_string v' + ) + ( if cluster <> Cluster.zero then + ", unmapping " ^ Cluster.to_string cluster + else + "" + ) + ) ; + if cluster <> Cluster.zero then + Qcow_cluster_map.remove m cluster ; + Qcow_cluster_map.add m (t.cluster, n) v' + | None -> + () + ) ; + Qcow_physical.write v (Cstruct.shift t.data (8 * n)) + + let len t = Cstruct.length t.data / 8 +end + +let erase cluster = Cstruct.memset cluster.data 0 + +let make ~cache ~cluster_bits ~locks () = + let m = Lwt_mutex.create () in + let c = Lwt_condition.create () in + let cluster_map = None in + {cache; cluster_map; cluster_bits; locks; m; c} + +let set_cluster_map t cluster_map = t.cluster_map <- Some cluster_map + +let read_and_lock ?client t cluster = + let open Lwt.Infix in + Locks.Read.lock ?client t.locks cluster >>= fun lock -> + let open Lwt_error.Infix in + Cache.read t.cache cluster >>= fun data -> + Lwt.return (Ok ({t; data; cluster}, lock)) + +(** Read the contents of [cluster] and apply the function [f] with the + lock held. *) +let read ?client t cluster f = + let open Lwt_error.Infix in + Locks.Read.with_lock ?client t.locks cluster (fun () -> + Cache.read t.cache cluster >>= fun data -> f {t; data; cluster} + ) + +(** Read the contents of [cluster], transform it via function [f] and write + back the results, all with the lock held. *) +let update ?client t cluster f = + let open Lwt_write_error.Infix in + Locks.Write.with_lock ?client t.locks cluster (fun () -> + (* Cancel any in-progress move since the data will be stale *) + ( match t.cluster_map with + | Some cluster_map -> + Qcow_cluster_map.cancel_move cluster_map cluster + | None -> + () + ) ; + Cache.read t.cache cluster >>= fun data -> + f {t; data; cluster} >>= fun result -> + let open Lwt.Infix in + Cache.write t.cache cluster data >>= function + | Error `Is_read_only -> + Lwt.return (Error `Is_read_only) + | Error `Disconnected -> + Lwt.return (Error `Disconnected) + | Ok () -> + Lwt.return (Ok result) + ) diff --git a/ocaml/qcow-tool/lib/qcow_metadata.mli b/ocaml/qcow-tool/lib/qcow_metadata.mli new file mode 100644 index 00000000000..336aa7f0123 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_metadata.mli @@ -0,0 +1,93 @@ +(* + * Copyright (C) 2017 Docker Inc + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Qcow_types + +(** Qcow metadata: clusters containing references and clusters containing + reference counts. *) +type t + +type error = [Mirage_block.error | `Msg of string] + +type write_error = [Mirage_block.write_error | `Msg of string] + +val make : + cache:Qcow_cache.t -> cluster_bits:int -> locks:Qcow_locks.t -> unit -> t +(** Construct a qcow metadata structure given a set of cluster read/write/flush + operations *) + +val set_cluster_map : t -> Qcow_cluster_map.t -> unit +(** Set the associated cluster map (which will be updated on every cluster + write) *) + +type contents + +module Refcounts : sig + (** A cluster full of 16bit refcounts *) + type t + + val of_contents : contents -> t + (** Interpret the given cluster as a refcount cluster *) + + val get : t -> int -> int + (** [get t n] return the [n]th refcount within [t] *) + + val set : t -> int -> int -> unit + (** [set t n v] set the [n]th refcount within [t] to [v] *) +end + +module Physical : sig + (** A cluster full of 64 bit cluster pointers *) + type t + + val of_contents : contents -> t + (** Interpret the given cluster as a cluster of 64 bit pointers *) + + val get : t -> int -> Qcow_physical.t + (** [get t n] return the [n]th physical address within [t] *) + + val set : t -> int -> Qcow_physical.t -> unit + (** [set t n v] set the [n]th physical address within [t] to [v] *) + + val len : t -> int + (** [len t] returns the number of physical addresses within [t] *) +end + +val erase : contents -> unit +(** Set the cluster contents to zeroes *) + +val read_and_lock : + ?client:Qcow_locks.Client.t + -> t + -> Cluster.t + -> (contents * Qcow_locks.lock, error) result Lwt.t + +val read : + ?client:Qcow_locks.Client.t + -> t + -> Cluster.t + -> (contents -> ('a, error) result Lwt.t) + -> ('a, error) result Lwt.t +(** Read the contents of the given cluster and provide them to the given function *) + +val update : + ?client:Qcow_locks.Client.t + -> t + -> Cluster.t + -> (contents -> ('a, write_error) result Lwt.t) + -> ('a, write_error) result Lwt.t +(** Read the contents of the given cluster, transform them through the given + function and write the results back to disk *) diff --git a/ocaml/qcow-tool/lib/qcow_padded.ml b/ocaml/qcow-tool/lib/qcow_padded.ml new file mode 100644 index 00000000000..46ff318e8ea --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_padded.ml @@ -0,0 +1,73 @@ +(* + * Copyright (C) 2017 Docker Inc + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +module Cstructs = Qcow_cstructs + +module Make (B : Qcow_s.RESIZABLE_BLOCK) = struct + include B + + let handle_error = function + | `Disconnected -> + Lwt.return (Error `Disconnected) + | _ -> + Format.kasprintf Lwt.fail_with "Unknown error in qcow_paddle.ml" + + let read base base_sector buf = + let open Lwt in + B.get_info base >>= fun base_info -> + let buf_len = Int64.of_int (Cstructs.len buf) in + let missing_sectors = + Int64.sub + Int64.( + add base_sector + (div buf_len (of_int base_info.Mirage_block.sector_size)) + ) + base_info.Mirage_block.size_sectors + in + if missing_sectors > 0L then ( + let available_sectors = + Int64.( + sub + (div buf_len (of_int base_info.Mirage_block.sector_size)) + missing_sectors + ) + in + let bytes = + Int64.( + to_int + (mul available_sectors (of_int base_info.Mirage_block.sector_size)) + ) + in + let open Lwt.Infix in + ( if bytes > 0 then + B.read base base_sector (Cstructs.sub buf 0 bytes) + else + Lwt.return (Ok ()) + ) + >>= function + | Error e -> + handle_error e + | Ok () -> + Cstructs.(memset (shift buf (max 0 bytes)) 0) ; + Lwt.return (Ok ()) + ) else + B.read base base_sector buf >>= function + | Error e -> + handle_error e + | Ok () -> + Lwt.return (Ok ()) +end diff --git a/ocaml/qcow-tool/lib/qcow_padded.mli b/ocaml/qcow-tool/lib/qcow_padded.mli new file mode 100644 index 00000000000..5834719085a --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_padded.mli @@ -0,0 +1,23 @@ +(* + * Copyright (C) 2017 Docker Inc + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +module Make (B : Qcow_s.RESIZABLE_BLOCK) : sig + (** A block device which is padded with virtual zeroes so that reads beyond + the current end don't fail. *) + + include Qcow_s.RESIZABLE_BLOCK with type t = B.t +end diff --git a/ocaml/qcow-tool/lib/qcow_physical.ml b/ocaml/qcow-tool/lib/qcow_physical.ml new file mode 100644 index 00000000000..2c1881b559a --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_physical.ml @@ -0,0 +1,95 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +open Sexplib.Std +open Qcow_types + +let ( <| ) = Cluster.shift_left + +let ( |> ) = Cluster.shift_right_logical + +type t = Cluster.t (* the encoded form on the disk *) + +let unmapped = Cluster.zero + +let one = Cluster.succ Cluster.zero + +let make ?(is_mutable = false) ?(is_compressed = false) x = + let x = Cluster.of_int x in + let bytes = x <| 2 |> 2 in + let is_mutable = if is_mutable then one <| 63 else Cluster.zero in + let is_compressed = if is_compressed then one <| 62 else Cluster.zero in + Cluster.(logor (logor bytes is_mutable) is_compressed) + +let is_mutable t = t |> 63 <> Cluster.zero + +let is_compressed t = t <| 1 |> 63 <> Cluster.zero + +let shift t bytes = + let bytes = Cluster.of_int bytes in + let bytes' = t <| 2 |> 2 in + let is_mutable = is_mutable t in + let is_compressed = is_compressed t in + make ~is_mutable ~is_compressed Cluster.(to_int @@ add bytes' bytes) + +let sector ~sector_size t = + let x = t <| 2 |> 2 in + Cluster.(to_int64 @@ div x (of_int sector_size)) + +(* Take an offset and round it down to the nearest physical sector, returning + the sector number and an offset within the sector *) +let to_sector ~sector_size t = + let x = t <| 2 |> 2 in + ( Cluster.(to_int64 @@ div x (of_int sector_size)) + , Cluster.(to_int (rem x (of_int sector_size))) + ) + +let to_bytes t = Cluster.to_int (t <| 2 |> 2) + +let add x y = Cluster.add x (Cluster.of_int y) + +let cluster ~cluster_bits t = + let x = t <| 2 |> 2 in + Cluster.(div x (one <| cluster_bits)) + +let within_cluster ~cluster_bits t = + let x = t <| 2 |> 2 in + Cluster.(to_int (rem x (one <| cluster_bits))) / 8 + +let read rest = Cluster.of_int64 @@ Cstruct.BE.get_uint64 rest 0 + +let write t rest = + let t = Cluster.to_int64 t in + Cstruct.BE.set_uint64 rest 0 t + +type _t = {bytes: Cluster.t; is_mutable: bool; is_compressed: bool} +[@@deriving sexp] + +let sexp_of_t t = + let bytes = t <| 2 |> 2 in + let is_mutable = is_mutable t in + let is_compressed = is_compressed t in + let _t = {bytes; is_mutable; is_compressed} in + sexp_of__t _t + +let t_of_sexp s = + let _t = _t_of_sexp s in + let is_mutable = if _t.is_mutable then one <| 63 else Cluster.zero in + let is_compressed = if _t.is_compressed then one <| 62 else Cluster.zero in + Cluster.(logor (logor _t.bytes is_mutable) is_compressed) + +let to_string t = Sexplib.Sexp.to_string (sexp_of_t t) diff --git a/ocaml/qcow-tool/lib/qcow_physical.mli b/ocaml/qcow-tool/lib/qcow_physical.mli new file mode 100644 index 00000000000..86ccda8abf9 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_physical.mli @@ -0,0 +1,65 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +open Qcow_types + +(** A physical address within the backing disk *) +type t [@@deriving sexp] + +val is_compressed : t -> bool +(** True if the address has been marked as being compressed *) + +val is_mutable : t -> bool +(** True if the offset is safe to mutate directly (i.e. is not referenced + by a snapshot *) + +val unmapped : t +(** An unmapped physical address *) + +val shift : t -> int -> t +(** [shift t bytes] adds [bytes] to t, maintaining other properties *) + +val make : ?is_mutable:bool -> ?is_compressed:bool -> int -> t +(** Create an address at the given byte offset. This defaults to [is_mutable = true] + which meand there are no snapshots implying that directly writing to this + offset is ok; and [is_compressed = false]. *) + +val add : t -> int -> t +(** Add a byte offset to a physical address *) + +val to_sector : sector_size:int -> t -> int64 * int +(** Return the sector on disk, plus a remainder within the sector *) + +val sector : sector_size:int -> t -> int64 +(** Return the sector on disk containing the address *) + +val to_bytes : t -> int +(** Return the byte offset on disk *) + +val cluster : cluster_bits:int -> t -> Cluster.t +(** Return the cluster containing the address *) + +val within_cluster : cluster_bits:int -> t -> int +(** Return the index within the cluster of the address *) + +val read : Cstruct.t -> t +(** Read a [t] from the given buffer *) + +val write : t -> Cstruct.t -> unit +(** Write [t] to the buffer *) + +include Qcow_s.PRINTABLE with type t := t diff --git a/ocaml/qcow-tool/lib/qcow_recycler.ml b/ocaml/qcow-tool/lib/qcow_recycler.ml new file mode 100644 index 00000000000..aaa7c326eee --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_recycler.ml @@ -0,0 +1,810 @@ +(* Securely erase and then recycle clusters *) + +let src = + let src = Logs.Src.create "qcow" ~doc:"qcow2-formatted BLOCK device" in + Logs.Src.set_level src (Some Logs.Info) ; + src + +module Log = (val Logs.src_log src : Logs.LOG) + +open Qcow_types + +let ( <| ) = Int64.shift_left + +let ( |> ) = Int64.shift_right + +module Cache = Qcow_cache +module Error = Qcow_error +module Locks = Qcow_locks +module Metadata = Qcow_metadata +module Physical = Qcow_physical + +module Make (B : Qcow_s.RESIZABLE_BLOCK) (Time : Mirage_time.S) = struct + type t = { + base: B.t + ; sector_size: int + ; cluster_bits: int + ; mutable cluster_map: Qcow_cluster_map.t option (* free/ used space map *) + ; cache: Cache.t + ; locks: Locks.t + ; metadata: Metadata.t + ; zero_buffer: Cstruct.t + ; mutable background_thread: unit Lwt.t + ; mutable need_to_flush: bool + ; need_to_flush_c: unit Lwt_condition.t + ; flush_m: Lwt_mutex.t + ; runtime_asserts: bool + } + + let create ~base ~sector_size ~cluster_bits ~cache ~locks ~metadata + ~runtime_asserts = + let zero_buffer = Io_page.(to_cstruct @@ get 256) in + (* 1 MiB *) + Cstruct.memset zero_buffer 0 ; + let background_thread = Lwt.return_unit in + let flush_m = Lwt_mutex.create () in + let cluster_map = None in + let need_to_flush = false in + let need_to_flush_c = Lwt_condition.create () in + { + base + ; sector_size + ; cluster_bits + ; cluster_map + ; cache + ; locks + ; metadata + ; zero_buffer + ; background_thread + ; need_to_flush + ; need_to_flush_c + ; flush_m + ; runtime_asserts + } + + let set_cluster_map t cluster_map = t.cluster_map <- Some cluster_map + + let allocate t n = + let cluster_map = + match t.cluster_map with Some x -> x | None -> assert false + in + match + Cluster.IntervalSet.take (Qcow_cluster_map.Available.get cluster_map) n + with + | Some (set, _free) -> + Log.debug (fun f -> + f "Allocated %s clusters from free list: %s" (Cluster.to_string n) + (Sexplib.Sexp.to_string_hum ~indent:2 + @@ Cluster.IntervalSet.sexp_of_t set + ) + ) ; + Qcow_cluster_map.(set_cluster_state cluster_map set Available Roots) ; + Some set + | None -> + None + + let copy_already_locked t src dst = + let src = Cluster.to_int64 src and dst = Cluster.to_int64 dst in + let cluster_map = + match t.cluster_map with Some x -> x | None -> assert false + in + Log.debug (fun f -> f "Copy cluster %Ld to %Ld" src dst) ; + let npages = 1 lsl (t.cluster_bits - 12) in + let pages = Io_page.(to_cstruct @@ get npages) in + let cluster = Cstruct.sub pages 0 (1 lsl t.cluster_bits) in + + let sectors_per_cluster = + Int64.(div (1L <| t.cluster_bits) (of_int t.sector_size)) + in + + let src_sector = Int64.mul src sectors_per_cluster in + let dst_sector = Int64.mul dst sectors_per_cluster in + let open Lwt.Infix in + B.read t.base src_sector [cluster] >>= function + | Error `Disconnected -> + Lwt.return (Error `Disconnected) + | Error e -> + Format.kasprintf Lwt.fail_with "Unknown error: %a" B.pp_error e + | Ok () -> ( + B.write t.base dst_sector [cluster] >>= function + | Error `Disconnected -> + Lwt.return (Error `Disconnected) + | Error `Is_read_only -> + Lwt.return (Error `Is_read_only) + | Error e -> + Format.kasprintf Lwt.fail_with "Unknown error: %a" B.pp_write_error + e + | Ok () -> + let dst' = Cluster.of_int64 dst in + Cache.Debug.assert_not_cached t.cache dst' ; + if not @@ Qcow_cluster_map.Copies.mem cluster_map dst' then ( + Log.err (fun f -> + f "Copy cluster %Ld to %Ld: but %Ld is not Junk" src dst dst + ) ; + Qcow_cluster_map.Debug.assert_no_leaked_blocks cluster_map ; + assert false + ) ; + if Qcow_cluster_map.is_moving cluster_map dst' then ( + Log.err (fun f -> + f "Copy cluster from %Ld to %Ld: but %Ld is also moving" src + dst dst + ) ; + Qcow_cluster_map.Debug.assert_no_leaked_blocks cluster_map ; + assert false + ) ; + Lwt.return (Ok ()) + ) + + let copy t src dst = + Locks.Read.with_lock t.locks src (fun () -> + Locks.Write.with_lock t.locks dst (fun () -> + copy_already_locked t src dst + ) + ) + + let move t move = + let cluster_map = + match t.cluster_map with Some x -> x | None -> assert false + in + let src, dst = Qcow_cluster_map.Move.(move.src, move.dst) in + Log.debug (fun f -> + f "move %s -> %s" (Cluster.to_string src) (Cluster.to_string dst) + ) ; + let open Lwt.Infix in + Locks.Read.with_lock t.locks src (fun () -> + Locks.Write.with_lock t.locks dst (fun () -> + (* Consider that a discard might have arrived and removed the src + cluster. *) + if not (Qcow_cluster_map.is_moving cluster_map src) then ( + Log.info (fun f -> + f "Copy of cluster %s prevented: move operation cancelled" + (Cluster.to_string src) + ) ; + Lwt.return (Ok ()) + ) else + copy_already_locked t src dst >>= function + | Error `Disconnected -> + Lwt.return (Error `Disconnected) + | Error `Is_read_only -> + Lwt.return (Error `Is_read_only) + | Error _ -> + Format.kasprintf Lwt.fail_with + "Unknown error in qcow_recylcer.ml" + | Ok () -> + Qcow_cluster_map.(set_move_state cluster_map move Copied) ; + Lwt.return (Ok ()) + ) + ) + + let move_all ?(progress_cb = fun ~percent:_ -> ()) t moves = + let total = List.length moves in + let rec loop i = function + | [] -> + Lwt.return (Ok ()) + | m :: ms -> ( + let open Lwt.Infix in + move t m >>= function + | Error e -> + Lwt.return_error e + | Ok () -> + progress_cb ~percent:(100 * i / total) ; + loop (i + 1) ms + ) + in + loop 0 moves + + let erase t remaining = + let open Lwt.Infix in + let intervals = + Cluster.IntervalSet.fold (fun i acc -> i :: acc) remaining [] + in + let buffer_size_clusters = + Int64.of_int (Cstruct.length t.zero_buffer) |> t.cluster_bits + in + + Lwt_list.fold_left_s + (fun acc i -> + match acc with + | Error e -> + Lwt.return (Error e) + | Ok () -> + let x, y = Cluster.IntervalSet.Interval.(x i, y i) in + let x = Cluster.to_int64 x and y = Cluster.to_int64 y in + let n = Int64.(succ @@ sub y x) in + Log.debug (fun f -> f "erasing %Ld clusters (%Ld -> %Ld)" n x y) ; + let erase cluster n = + (* Erase [n] clusters starting from [cluster] *) + assert (n <= buffer_size_clusters) ; + let buf = + Cstruct.sub t.zero_buffer 0 (Int64.to_int (n <| t.cluster_bits)) + in + let sector = + Int64.(div (cluster <| t.cluster_bits) (of_int t.sector_size)) + in + (* No-one else is writing to this cluster so no locking is needed *) + B.write t.base sector [buf] + in + let rec loop from n m = + if n = 0L then + Lwt.return (Ok ()) + else if n > m then + erase from m >>= function + | Error e -> + Lwt.return (Error e) + | Ok () -> + loop (Int64.add from m) (Int64.sub n m) m + else + erase from n + in + loop x n buffer_size_clusters + ) + (Ok ()) intervals + + let update_references t = + let cluster_map = + match t.cluster_map with + | None -> + assert false (* by construction, see `make` *) + | Some x -> + x + in + let open Qcow_cluster_map in + (* Build a list of moves per referring cluster, so we can take the referring + cluster lock once, make all the updates and release it. *) + let flushed' = + Cluster.Map.fold + (fun src move acc -> + assert (src = move.Qcow_cluster_map.move.Qcow_cluster_map.Move.src) ; + match move.state with + | Flushed -> ( + match Qcow_cluster_map.find cluster_map src with + | exception Not_found -> + acc + | ref_cluster, _ -> + let existing = + if Cluster.Map.mem ref_cluster acc then + Cluster.Map.find ref_cluster acc + else + [] + in + Cluster.Map.add ref_cluster (move :: existing) acc + ) + | _ -> + acc + ) + (moves cluster_map) Cluster.Map.empty + in + let flushed = Cluster.Map.bindings flushed' in + + let nr_updated = ref 0L in + let open Lwt.Infix in + (* If I can't acquire a write lock on the metadata cluster then skip + this update and do it later. *) + let client = + Locks.Client.make (fun () -> "Rewriting references after a block copy") + in + Lwt_list.fold_left_s + (fun acc (ref_cluster', moves) -> + match acc with + | Error e -> + Lwt.return (Error e) + | Ok subst -> ( + let ref_cluster = + try Cluster.Map.find ref_cluster' subst + with Not_found -> ref_cluster' + in + match Locks.Write.try_lock ~client t.locks ref_cluster with + | None -> + List.iter + (fun {move= {Move.src; dst}; _} -> + Log.debug (fun f -> + f + "Not rewriting reference in %s from %s to %s: \ + metadata cluster is locked" + (Cluster.to_string ref_cluster) + (Cluster.to_string src) (Cluster.to_string dst) + ) ; + cancel_move cluster_map src + ) + moves ; + Lwt.return (Ok subst) + | Some lock -> + Lwt.finalize + (fun () -> + (* The flush function will call complete move for all moves with state Referenced. + However these won't actually have hit the disk until Metadata.update returns + and the disk write has been performed. *) + Lwt_mutex.with_lock t.flush_m (fun () -> + Metadata.update ~client t.metadata ref_cluster (fun c -> + Log.info (fun f -> + f "Updating %d references in cluster %s" + (List.length moves) + (Cluster.to_string ref_cluster) + ) ; + let addresses = Metadata.Physical.of_contents c in + try + let result = + List.fold_left + (fun acc ({move= {Move.src; dst}; _} as move) -> + match acc with + | Error e -> + Error e + | Ok subst -> ( + match + Qcow_cluster_map.find cluster_map src + with + | exception Not_found -> + (* Block was probably discarded after we started running. *) + Log.warn (fun f -> + f + "Not copying cluster %s to %s: \ + %s has been discarded" + (Cluster.to_string src) + (Cluster.to_string dst) + (Cluster.to_string src) + ) ; + Ok subst + | ref_cluster', ref_cluster_within -> + if ref_cluster' <> ref_cluster then ( + Log.info (fun f -> + f + "Reference to %s moved from \ + %s:%d to %s:%d" + (Cluster.to_string src) + (Cluster.to_string ref_cluster) + ref_cluster_within + (Cluster.to_string + ref_cluster' + ) + ref_cluster_within + ) ; + Ok subst + ) else if + not + (Cluster.Map.mem src + (Qcow_cluster_map.moves + cluster_map + ) + ) + then ( + Log.debug (fun f -> + f + "Not rewriting reference in \ + %s :%d from %s to %s: move \ + as been cancelled" + (Cluster.to_string ref_cluster) + ref_cluster_within + (Cluster.to_string src) + (Cluster.to_string dst) + ) ; + Ok subst + ) else + (* Read the current value in the referencing cluster as a sanity check *) + let old_reference = + Metadata.Physical.get addresses + ref_cluster_within + in + let old_cluster = + Qcow_physical.cluster + ~cluster_bits:t.cluster_bits + old_reference + in + if old_cluster <> src then ( + Log.err (fun f -> + f + "Rewriting reference in %s \ + :%d from %s to %s, old \ + reference actually \ + pointing to %s" + (Cluster.to_string + ref_cluster + ) + ref_cluster_within + (Cluster.to_string src) + (Cluster.to_string dst) + (Cluster.to_string + old_cluster + ) + ) ; + assert false + ) ; + Log.debug (fun f -> + f + "Rewriting reference in %s \ + :%d from %s to %s" + (Cluster.to_string ref_cluster) + ref_cluster_within + (Cluster.to_string src) + (Cluster.to_string dst) + ) ; + (* Preserve any flags but update the pointer *) + let dst' = + Cluster.to_int dst + lsl t.cluster_bits + in + let new_reference = + Qcow_physical.make + ~is_mutable: + (Qcow_physical.is_mutable + old_reference + ) + ~is_compressed: + (Qcow_physical.is_compressed + old_reference + ) + dst' + in + set_move_state cluster_map move.move + Referenced ; + Metadata.Physical.set addresses + ref_cluster_within new_reference ; + nr_updated := Int64.succ !nr_updated ; + (* The move cannot be cancelled now that the metadata has + been updated. *) + Ok (Cluster.Map.add src dst subst) + ) + ) + (Ok subst) moves + in + match result with + | Error e -> + Lwt.return (Error e) + | Ok subst -> + (* If `ref_cluster` is an L1 table entry then `src` must be an + L2 block, and the values in `cluster_map.refs` will point to it. + These need to be redirected to `dst` otherwise the `cluster_map` + will be out-of-sync. This only happens because we bypass the + `Metadata.Physical.set` function in the block copier. *) + if + Qcow_cluster_map.is_immovable cluster_map + ref_cluster + then ( + Log.info (fun f -> + f + "Cluster %s is L1: we must remap L2 \ + references" + (Cluster.to_string ref_cluster) + ) ; + Qcow_cluster_map.update_references + cluster_map subst + ) ; + Lwt.return (Ok subst) + with + | Error.Duplicate_reference + ((c, w), (c', w'), (target : int64)) as e -> + Log.err (fun f -> + f + "Duplicate_reference during \ + update_references of %s" + (String.concat ", " + @@ List.map + Qcow_cluster_map.string_of_move + @@ List.concat + @@ List.map snd flushed + ) + ) ; + let open Error.Lwt_write_error.Infix in + Qcow_debug.on_duplicate_reference t.metadata + cluster_map ~cluster_bits:t.cluster_bits + (c, w) (c', w') target + >>= fun () -> + Qcow_cluster_map.Debug.assert_no_leaked_blocks + cluster_map ; + Lwt.fail e + | e -> + Qcow_cluster_map.Debug.assert_no_leaked_blocks + cluster_map ; + raise e + ) + ) + ) + (fun () -> Locks.unlock lock ; Lwt.return_unit) + ) + ) + (Ok Cluster.Map.empty) flushed + >>= function + | Ok _subst -> + t.need_to_flush <- true ; + Lwt_condition.signal t.need_to_flush_c () ; + Lwt.return (Ok !nr_updated) + | Error e -> + Lwt.return (Error e) + + let flush t = + let open Qcow_cluster_map in + let cluster_map = + match t.cluster_map with + | None -> + assert false (* by construction, see `make` *) + | Some x -> + x + in + let open Lwt.Infix in + (* This can be called concurrently by both the user and by the background + flusher thread. *) + Lwt_mutex.with_lock t.flush_m (fun () -> + (* Anything erased right now will become available *) + let erased = Qcow_cluster_map.Erased.get cluster_map in + let moves = Qcow_cluster_map.moves cluster_map in + B.flush t.base >>= function + | Error e -> + Lwt.return (Error e) + | Ok () -> + (* Walk over the snapshot of moves before the flush and update. This + ensures we don't accidentally advance the state of moves which appeared + after the flush. *) + let nr_flushed, nr_completed = + Cluster.Map.fold + (fun _ (move : move) (nr_flushed, nr_completed) -> + match move.state with + | Copying | Flushed -> + (* no change *) + (nr_flushed, nr_completed) + | Copied -> + Qcow_cluster_map.( + set_move_state cluster_map move.move Flushed + ) ; + (nr_flushed + 1, nr_completed) + | Referenced -> + Qcow_cluster_map.complete_move cluster_map move.move ; + (nr_flushed, nr_completed + 1) + ) + moves (0, 0) + in + let nr_erased = + Cluster.to_int @@ Cluster.IntervalSet.cardinal erased + in + Qcow_cluster_map.( + set_cluster_state cluster_map erased Erased Available + ) ; + if nr_flushed <> 0 || nr_completed <> 0 || nr_erased <> 0 then ( + Log.info (fun f -> + f + "block recycler: %d cluster copies flushed; %d cluster \ + copies complete; %d clusters erased" + nr_flushed nr_completed nr_erased + ) ; + Log.info (fun f -> + f "block recycler: flush: %s" + (Qcow_cluster_map.to_summary_string cluster_map) + ) + ) ; + Lwt.return (Ok ()) + ) + + let start_background_thread t ~keep_erased ?compact_after_unmaps () = + let th, _ = Lwt.task () in + Lwt.on_cancel th (fun () -> + Log.info (fun f -> f "cancellation of block recycler not implemented") + ) ; + let cluster_map = + match t.cluster_map with Some x -> x | None -> assert false + in + Log.info (fun f -> + f "block recycler starting with keep_erased = %Ld" keep_erased + ) ; + let open Lwt.Infix in + let rec background_flusher () = + let rec wait () = + match t.need_to_flush with + | true -> + Lwt.return_unit + | false -> + Lwt_condition.wait t.need_to_flush_c >>= fun () -> wait () + in + wait () >>= fun () -> + t.need_to_flush <- false ; + Time.sleep_ns 5_000_000_000L >>= fun () -> + Log.info (fun f -> + f "block recycler: triggering background flush: %s" + (Qcow_cluster_map.to_summary_string cluster_map) + ) ; + flush t >>= function + | Error _ -> + Log.err (fun f -> f "block recycler: flush failed") ; + Lwt.return_unit + | Ok () -> + background_flusher () + in + Lwt.async background_flusher ; + + let last_block = ref (Qcow_cluster_map.get_last_block cluster_map) in + let rec wait_for_work () = + let junk = Qcow_cluster_map.Junk.get cluster_map in + let nr_junk = Cluster.to_int64 @@ Cluster.IntervalSet.cardinal junk in + let erased = Qcow_cluster_map.Erased.get cluster_map in + let nr_erased = Cluster.to_int64 @@ Cluster.IntervalSet.cardinal erased in + let available = Qcow_cluster_map.Available.get cluster_map in + let nr_available = + Cluster.to_int64 @@ Cluster.IntervalSet.cardinal available + in + (* Apply the threshold to the total clusters erased, which includes those + marked as available *) + let total_erased = Int64.add nr_erased nr_available in + (* Prioritise cluster reuse because it's more efficient not to have to + move a cluster at all U*) + let highest_priority = + if total_erased < keep_erased && nr_junk > 0L then + (* Take some of the junk and erase it *) + let n = + Cluster.of_int64 @@ min nr_junk (Int64.sub keep_erased total_erased) + in + if Cluster.IntervalSet.cardinal junk < n then + None + else + Some (`Erase n) + else + None + in + (* If we need to update references, do that next *) + let moves = Qcow_cluster_map.moves cluster_map in + let middle_priority = + let flushed = + Cluster.Map.fold + (fun _src move acc -> + match move.Qcow_cluster_map.state with + | Qcow_cluster_map.Flushed -> + true + | _ -> + acc + ) + moves false + in + if flushed then Some `Update_references else None + in + ( match (highest_priority, middle_priority, compact_after_unmaps) with + | Some x, _, _ -> + Lwt.return (Some x) + | _, Some x, _ -> + Lwt.return (Some x) + | None, _, Some x when x < nr_junk -> + if not (Cluster.Map.is_empty moves) then + Lwt.return None + else ( + (* Wait for the junk data to stabilise before starting to copy *) + Log.info (fun f -> + f + "Discards (%Ld) over threshold (%Ld): waiting for discards \ + to finish before beginning compaction" + nr_junk x + ) ; + let rec wait nr_junk n = + Time.sleep_ns 5_000_000_000L >>= fun () -> + let nr_junk' = + Cluster.to_int64 + @@ Cluster.IntervalSet.cardinal + @@ Qcow_cluster_map.Junk.get cluster_map + in + if nr_junk = nr_junk' then ( + Log.info (fun f -> + f "Discards have finished, %Ld clusters have been discarded" + nr_junk + ) ; + Lwt.return () + ) else ( + if n mod 60 = 0 then + Log.info (fun f -> + f "Total discards %Ld, still waiting" nr_junk' + ) ; + wait nr_junk' (n + 1) + ) + in + wait nr_junk 0 >>= fun () -> Lwt.return (Some `Junk) + ) + | _ -> + let last_block' = Qcow_cluster_map.get_last_block cluster_map in + let result = + if last_block' < !last_block then Some `Resize else None + in + last_block := last_block' ; + Lwt.return result + ) + >>= function + | None -> + Qcow_cluster_map.wait cluster_map >>= fun () -> wait_for_work () + | Some work -> + Lwt.return work + in + + let resize () = + Locks.with_metadata_lock t.locks (fun () -> + let new_last_block = + 1 + (Cluster.to_int @@ Qcow_cluster_map.get_last_block cluster_map) + in + Log.info (fun f -> + f "block recycler: resize to %d clusters" new_last_block + ) ; + let new_size = Physical.make (new_last_block lsl t.cluster_bits) in + let sector = Physical.sector ~sector_size:t.sector_size new_size in + let cluster = + Physical.cluster ~cluster_bits:t.cluster_bits new_size + in + Qcow_cluster_map.resize cluster_map cluster ; + B.resize t.base sector >>= function + | Error _ -> + Lwt.fail_with "resize" + | Ok () -> + Log.debug (fun f -> + f "Resized device to %d sectors of size %d" + (Qcow_physical.to_bytes new_size) + t.sector_size + ) ; + Lwt.return_unit + ) + in + let rec loop () = + t.need_to_flush <- true ; + Lwt_condition.signal t.need_to_flush_c () ; + (* trigger a flush later *) + wait_for_work () >>= function + | `Erase n -> ( + match + Cluster.IntervalSet.take (Qcow_cluster_map.Junk.get cluster_map) n + with + | None -> + loop () + | Some (to_erase, _) -> + Log.debug (fun f -> + f "block recycler: should erase %s clusters" + (Cluster.to_string @@ Cluster.IntervalSet.cardinal to_erase) + ) ; + Qcow_cluster_map.(set_cluster_state cluster_map to_erase Junk Roots) ; + Lwt.catch + (fun () -> + erase t to_erase >>= function + | Error e -> + Format.kasprintf Lwt.fail_with "%a" B.pp_write_error e + | Ok () -> + Qcow_cluster_map.( + set_cluster_state cluster_map to_erase Roots Erased + ) ; + Lwt.return_unit + ) + (fun e -> + Qcow_cluster_map.( + set_cluster_state cluster_map to_erase Roots Junk + ) ; + Lwt.fail e + ) + >>= fun () -> loop () + ) + | `Junk -> + if t.runtime_asserts then + Qcow_cluster_map.Debug.assert_no_leaked_blocks cluster_map ; + (* There must be no moves already in progress when starting new moves, otherwise + we might move the same block twice maybe even to a different location. *) + assert (Cluster.Map.is_empty @@ Qcow_cluster_map.moves cluster_map) ; + let junk = Qcow_cluster_map.Junk.get cluster_map in + let nr_junk = Cluster.to_int64 @@ Cluster.IntervalSet.cardinal junk in + let moves = Qcow_cluster_map.start_moves cluster_map in + Log.info (fun f -> + f "block recycler: %Ld clusters are junk, %d moves are possible" + nr_junk (List.length moves) + ) ; + Qcow_error.Lwt_write_error.or_fail_with @@ move_all t moves + >>= fun () -> + resize () >>= fun () -> loop () + | `Update_references -> ( + Log.info (fun f -> + f "block recycler: need to update references to blocks" + ) ; + update_references t >>= function + | Error (`Msg x) -> + Lwt.fail_with x + | Error `Disconnected -> + Lwt.fail_with "Disconnected" + | Error `Is_read_only -> + Lwt.fail_with "Is_read_only" + | Ok nr_updated -> + Log.info (fun f -> + f "block recycler: %Ld block references updated" nr_updated + ) ; + loop () + ) + | `Resize -> + resize () >>= fun () -> loop () + in + + Lwt.async loop ; + t.background_thread <- th +end diff --git a/ocaml/qcow-tool/lib/qcow_recycler.mli b/ocaml/qcow-tool/lib/qcow_recycler.mli new file mode 100644 index 00000000000..09b58e311fb --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_recycler.mli @@ -0,0 +1,65 @@ +(* + * Copyright (C) 2017 Docker Inc + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Qcow_types + +module Make (B : Qcow_s.RESIZABLE_BLOCK) (Time : Mirage_time.S) : sig + (** A cluster recycling engine *) + type t + + val create : + base:B.t + -> sector_size:int + -> cluster_bits:int + -> cache:Qcow_cache.t + -> locks:Qcow_locks.t + -> metadata:Qcow_metadata.t + -> runtime_asserts:bool + -> t + (** Initialise a cluster recycler over the given block device *) + + val set_cluster_map : t -> Qcow_cluster_map.t -> unit + (** Set the associated cluster map (which will be updated on every cluster + write) *) + + val start_background_thread : + t -> keep_erased:int64 -> ?compact_after_unmaps:int64 -> unit -> unit + (** Start a background thread which will perform block recycling *) + + val allocate : t -> Cluster.t -> Cluster.IntervalSet.t option + (** [allocate t n] returns [n] clusters which are ready for re-use. If there + are not enough clusters free then this returns None. *) + + val erase : t -> Cluster.IntervalSet.t -> (unit, B.write_error) result Lwt.t + (** Write zeroes over the specified set of clusters *) + + val copy : t -> Cluster.t -> Cluster.t -> (unit, B.write_error) result Lwt.t + (** [copy src dst] copies the cluster [src] to [dst] *) + + val move_all : + ?progress_cb:(percent:int -> unit) + -> t + -> Qcow_cluster_map.Move.t list + -> (unit, Qcow_metadata.write_error) result Lwt.t + (** [move_all t mv] perform the initial data copy of the move operations [mv] *) + + val update_references : t -> (int64, Qcow_metadata.write_error) result Lwt.t + (** [update_references t] rewrites references to any recently copied and + flushed block, returning the number of writes completed. *) + + val flush : t -> (unit, B.write_error) result Lwt.t + (** Issue a flush to the block device, update internal recycler state. *) +end diff --git a/ocaml/qcow-tool/lib/qcow_rwlock.ml b/ocaml/qcow-tool/lib/qcow_rwlock.ml new file mode 100644 index 00000000000..46b6980dd7d --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_rwlock.ml @@ -0,0 +1,205 @@ +(* + * Copyright (C) 2016 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Sexplib.Std + +let src = + let src = Logs.Src.create "qcow" ~doc:"qcow2-formatted BLOCK device" in + Logs.Src.set_level src (Some Logs.Info) ; + src + +module Log = (val Logs.src_log src : Logs.LOG) + +(* A resource that can be locked *) +type t = { + t_description_fn: unit -> string + ; m: Lwt_mutex.t + ; c: unit Lwt_condition.t + ; mutable all_locks: lock list +} + +(* A lock held on a resource *) +and lock = { + t: t + ; client: client + ; mutable reader: bool (* or writer *) + ; mutable released: bool +} + +(* A client owning the lock *) +and client = { + client_description_fn: unit -> string + ; mutable my_locks: lock list +} + +type ts = t list + +let make t_description_fn = + let m = Lwt_mutex.create () in + let c = Lwt_condition.create () in + let all_locks = [] in + {t_description_fn; m; c; all_locks} + +module To_sexp = struct + (* Project instances of type t into a simpler set of records, organised for + printing. *) + module Lock = struct + type t = {description: string; mode: [`Read | `Write]; released: bool} + [@@deriving sexp_of] + end + + module Client = struct + type t = {description: string; locks: Lock.t list} [@@deriving sexp_of] + end + + type t = {description: string; clients: Client.t list} [@@deriving sexp_of] + + let rec setify eq = function + | [] -> + [] + | x :: xs -> + if List.filter (fun y -> eq x y) xs <> [] then + setify eq xs + else + x :: setify eq xs + + let lock l = + let description = l.t.t_description_fn () in + let mode = if l.reader then `Read else `Write in + let released = l.released in + {Lock.description; mode; released} + + let client c = + let description = c.client_description_fn () in + (* Make the per-client list easier to read by de-duplicating it *) + let locks = setify ( = ) @@ List.map lock c.my_locks in + {Client.description; locks} + + let t t = + let description = t.t_description_fn () in + let clients = + List.map client + @@ setify ( == ) + @@ List.map (fun l -> l.client) t.all_locks + in + {description; clients} + + type ts = Client.t list [@@deriving sexp_of] + + let ts ts = + let all_locks = List.concat @@ List.map (fun t -> t.all_locks) ts in + List.map client @@ setify ( == ) @@ List.map (fun l -> l.client) all_locks +end + +let sexp_of_t x = To_sexp.(sexp_of_t @@ t x) + +let sexp_of_ts xs = To_sexp.(sexp_of_ts @@ ts xs) + +let sexp_of_client x = To_sexp.(Client.sexp_of_t @@ client x) + +let anon_client = + let next_idx = ref 0 in + fun () -> + let idx = !next_idx in + incr next_idx ; + let client_description_fn () = Printf.sprintf "Anonymous client %d" idx in + let my_locks = [] in + {client_description_fn; my_locks} + +let unlock lock = + assert (not lock.released) ; + lock.released <- true ; + lock.client.my_locks <- List.filter (fun l -> l != lock) lock.client.my_locks ; + lock.t.all_locks <- List.filter (fun l -> l != lock) lock.t.all_locks ; + Lwt_condition.broadcast lock.t.c () + +let any f xs = List.fold_left (fun acc x -> acc || f x) false xs + +module Read = struct + let lock ?(client = anon_client ()) t = + let open Lwt.Infix in + Lwt_mutex.with_lock t.m (fun () -> + let rec wait () = + (* If any other client has a write lock then wait *) + let any_other_writer = + any (fun l -> l.client != client && not l.reader) t.all_locks + in + if any_other_writer then + Lwt_condition.wait t.c ~mutex:t.m >>= fun () -> wait () + else + let reader = true and released = false in + let lock = {t; client; reader; released} in + t.all_locks <- lock :: t.all_locks ; + client.my_locks <- lock :: client.my_locks ; + Lwt.return lock + in + wait () + ) + + let with_lock ?(client = anon_client ()) t f = + let open Lwt.Infix in + lock ~client t >>= fun lock -> + Lwt.finalize f (fun () -> unlock lock ; Lwt.return_unit) +end + +module Write = struct + let any_other_client t client = any (fun l -> l.client != client) t.all_locks + + let with_lock ?(client = anon_client ()) t f = + let open Lwt.Infix in + Lwt_mutex.with_lock t.m (fun () -> + let rec wait () = + (* If any other client has a lock then wait *) + if any_other_client t client then + Lwt_condition.wait t.c ~mutex:t.m >>= fun () -> wait () + else + let reader = false and released = false in + let lock = {t; client; reader; released} in + t.all_locks <- lock :: t.all_locks ; + client.my_locks <- lock :: client.my_locks ; + Lwt.return lock + in + wait () + ) + >>= fun lock -> Lwt.finalize f (fun () -> unlock lock ; Lwt.return_unit) + + let try_lock ?(client = anon_client ()) t = + if any_other_client t client then + None + else + let reader = false and released = false in + let lock = {t; client; reader; released} in + t.all_locks <- lock :: t.all_locks ; + client.my_locks <- lock :: client.my_locks ; + Some lock +end + +module Client = struct + type t = client + + let make client_description_fn = + let my_locks = [] in + {client_description_fn; my_locks} +end + +module Debug = struct + let assert_no_locks_held client = + if client.my_locks <> [] then ( + Printf.fprintf stderr "Client still holds locks:\n%s\n%!" + (Sexplib.Sexp.to_string_hum ~indent:2 @@ sexp_of_client client) ; + assert false + ) +end diff --git a/ocaml/qcow-tool/lib/qcow_rwlock.mli b/ocaml/qcow-tool/lib/qcow_rwlock.mli new file mode 100644 index 00000000000..06f67e4a511 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_rwlock.mli @@ -0,0 +1,69 @@ +(* + * Copyright (C) 2016 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +(** A lock which permits multiple concurrent threads to acquire it for reading + but demands exclusivity for writing *) +type t [@@deriving sexp_of] + +type ts = t list [@@deriving sexp_of] + +val make : (unit -> string) -> t +(** [make describe_fn] creates a new lock, where [describe_fn ()] returns a + human-readable description string suitable for debug output. *) + +(** A value which represents holding a lock *) +type lock + +val unlock : lock -> unit +(** [unlock locked] releases the lock associated with [locked] *) + +module Client : sig + (** An entity which holds a set of locks *) + type t + + val make : (unit -> string) -> t + (** [make describe_fn] creates an entity where [describe_fn ()] returns + a human-readable description of the client for use in debugging. *) +end + +module Read : sig + val with_lock : ?client:Client.t -> t -> (unit -> 'a Lwt.t) -> 'a Lwt.t + (** [with_lock ?client t f] executes [f ()] when no other client has held + the lock exclusively for writing. Note this means that I may hold the lock + for writing and then re-lock it for reading. + *) + + val lock : ?client:Client.t -> t -> lock Lwt.t + (** [lock ?client t] locks [t]. This function blocks while another client + holds the lock for writing. The lock must be released with [unlock] *) +end + +module Write : sig + val with_lock : ?client:Client.t -> t -> (unit -> 'a Lwt.t) -> 'a Lwt.t + (** [with_lock ?client t f] executes [f ()] when no-other client is holding + the lock for reading or writing. Note this means that I may hold the lock + for reading and then re-lock it for writing. *) + + val try_lock : ?client:Client.t -> t -> lock option + (** [try_lock ?client t] acquires a write lock on [t] if immediately possible, + or returns None *) +end + +module Debug : sig + val assert_no_locks_held : Client.t -> unit + (** Check that all locks have been explicitly released. *) +end diff --git a/ocaml/qcow-tool/lib/qcow_s.ml b/ocaml/qcow-tool/lib/qcow_s.ml new file mode 100644 index 00000000000..e8841e03ea2 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_s.ml @@ -0,0 +1,181 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +(** Common signatures used by the library *) + +open Result + +module type LOG = sig + (** Common logging functions *) + + val debug : ('a, unit, string, unit) format4 -> 'a + + val info : ('a, unit, string, unit) format4 -> 'a + + val error : ('a, unit, string, unit) format4 -> 'a +end + +module type SERIALISABLE = sig + (** Values which can be read and written *) + + (** Instances of this type can be read and written *) + type t + + val sizeof : t -> int + (** The size of a buffer needed to hold [t] *) + + val read : Cstruct.t -> (t * Cstruct.t, [`Msg of string]) result + (** Read a [t] from the given buffer and return it, along with the + unused remainder of the buffer. If the buffer cannot + be parsed then return an error.*) + + val write : t -> Cstruct.t -> (Cstruct.t, [`Msg of string]) result + (** Write a [t] into the given buffer. If the buffer is too small, + then return an error. Return the unused remainder of the buffer.*) +end + +module type PRINTABLE = sig + (** Values which can be pretty-printed *) + + (** Instances of this type can be pretty-printed *) + type t + + val to_string : t -> string + (** Produce a pretty human-readable string from a value *) +end + +module type RESIZABLE_BLOCK = sig + include Mirage_block.S + + val resize : t -> int64 -> (unit, write_error) result Lwt.t + (** Resize the file to the given number of sectors. *) + + val flush : t -> (unit, write_error) result Lwt.t + (** [flush t] flushes any buffers, if the file has been opened in buffered + mode *) +end + +module type INTERVAL_SET = sig + (** The type of the set elements *) + type elt + + (** An interval: a range (x, y) of set values where all the elements from + x to y inclusive are in the set *) + type interval + + module Interval : sig + val make : elt -> elt -> interval + (** [make first last] construct an interval describing all the elements from + [first] to [last] inclusive. *) + + val x : interval -> elt + (** the starting element of the interval *) + + val y : interval -> elt + (** the ending element of the interval *) + end + + (** The type of sets *) + type t [@@deriving sexp] + + val empty : t + (** The empty set *) + + val is_empty : t -> bool + (** Test whether a set is empty or not *) + + val cardinal : t -> elt + (** [cardinal t] is the number of elements in the set [t] *) + + val mem : elt -> t -> bool + (** [mem elt t] tests whether [elt] is in set [t] *) + + val fold : (interval -> 'a -> 'a) -> t -> 'a -> 'a + (** [fold f t acc] folds [f] across all the intervals in [t] *) + + val fold_s : (interval -> 'a -> 'a Lwt.t) -> t -> 'a -> 'a Lwt.t + (** [fold_s f t acc] folds [f] across all the intervals in [t] *) + + val fold_individual : (elt -> 'a -> 'a) -> t -> 'a -> 'a + (** [fold_individual f t acc] folds [f] across all the individual elements of [t] *) + + val add : interval -> t -> t + (** [add interval t] returns the set consisting of [t] plus [interval] *) + + val remove : interval -> t -> t + (** [remove interval t] returns the set consisting of [t] minus [interval] *) + + val min_elt : t -> interval + (** [min_elt t] returns the smallest (in terms of the ordering) interval in + [t], or raises [Not_found] if the set is empty. *) + + val max_elt : t -> interval + (** [max_elt t] returns the largest (in terms of the ordering) interval in + [t], or raises [Not_found] if the set is empty. *) + + val choose : t -> interval + (** [choose t] returns one interval, or raises Not_found if the set is empty *) + + val take : t -> elt -> (t * t) option + (** [take n] returns [Some a, b] where [cardinal a = n] and [diff t a = b] + or [None] if [cardinal t < n] *) + + val union : t -> t -> t + (** set union *) + + val diff : t -> t -> t + (** set difference *) + + val inter : t -> t -> t + (** set intersection *) +end + +module type NUM = sig + type t + + val zero : t + + val pred : t -> t + + val succ : t -> t + + val add : t -> t -> t + + val sub : t -> t -> t + + val mul : t -> t -> t + + val div : t -> t -> t + + val of_int64 : int64 -> t + + val to_int64 : t -> int64 + + val of_int : int -> t + + val to_int : t -> int + + val to_string : t -> string + + val shift_left : t -> int -> t + + val shift_right_logical : t -> int -> t + + val logor : t -> t -> t + + val rem : t -> t -> t +end diff --git a/ocaml/qcow-tool/lib/qcow_s.mli b/ocaml/qcow-tool/lib/qcow_s.mli new file mode 100644 index 00000000000..e8841e03ea2 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_s.mli @@ -0,0 +1,181 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +(** Common signatures used by the library *) + +open Result + +module type LOG = sig + (** Common logging functions *) + + val debug : ('a, unit, string, unit) format4 -> 'a + + val info : ('a, unit, string, unit) format4 -> 'a + + val error : ('a, unit, string, unit) format4 -> 'a +end + +module type SERIALISABLE = sig + (** Values which can be read and written *) + + (** Instances of this type can be read and written *) + type t + + val sizeof : t -> int + (** The size of a buffer needed to hold [t] *) + + val read : Cstruct.t -> (t * Cstruct.t, [`Msg of string]) result + (** Read a [t] from the given buffer and return it, along with the + unused remainder of the buffer. If the buffer cannot + be parsed then return an error.*) + + val write : t -> Cstruct.t -> (Cstruct.t, [`Msg of string]) result + (** Write a [t] into the given buffer. If the buffer is too small, + then return an error. Return the unused remainder of the buffer.*) +end + +module type PRINTABLE = sig + (** Values which can be pretty-printed *) + + (** Instances of this type can be pretty-printed *) + type t + + val to_string : t -> string + (** Produce a pretty human-readable string from a value *) +end + +module type RESIZABLE_BLOCK = sig + include Mirage_block.S + + val resize : t -> int64 -> (unit, write_error) result Lwt.t + (** Resize the file to the given number of sectors. *) + + val flush : t -> (unit, write_error) result Lwt.t + (** [flush t] flushes any buffers, if the file has been opened in buffered + mode *) +end + +module type INTERVAL_SET = sig + (** The type of the set elements *) + type elt + + (** An interval: a range (x, y) of set values where all the elements from + x to y inclusive are in the set *) + type interval + + module Interval : sig + val make : elt -> elt -> interval + (** [make first last] construct an interval describing all the elements from + [first] to [last] inclusive. *) + + val x : interval -> elt + (** the starting element of the interval *) + + val y : interval -> elt + (** the ending element of the interval *) + end + + (** The type of sets *) + type t [@@deriving sexp] + + val empty : t + (** The empty set *) + + val is_empty : t -> bool + (** Test whether a set is empty or not *) + + val cardinal : t -> elt + (** [cardinal t] is the number of elements in the set [t] *) + + val mem : elt -> t -> bool + (** [mem elt t] tests whether [elt] is in set [t] *) + + val fold : (interval -> 'a -> 'a) -> t -> 'a -> 'a + (** [fold f t acc] folds [f] across all the intervals in [t] *) + + val fold_s : (interval -> 'a -> 'a Lwt.t) -> t -> 'a -> 'a Lwt.t + (** [fold_s f t acc] folds [f] across all the intervals in [t] *) + + val fold_individual : (elt -> 'a -> 'a) -> t -> 'a -> 'a + (** [fold_individual f t acc] folds [f] across all the individual elements of [t] *) + + val add : interval -> t -> t + (** [add interval t] returns the set consisting of [t] plus [interval] *) + + val remove : interval -> t -> t + (** [remove interval t] returns the set consisting of [t] minus [interval] *) + + val min_elt : t -> interval + (** [min_elt t] returns the smallest (in terms of the ordering) interval in + [t], or raises [Not_found] if the set is empty. *) + + val max_elt : t -> interval + (** [max_elt t] returns the largest (in terms of the ordering) interval in + [t], or raises [Not_found] if the set is empty. *) + + val choose : t -> interval + (** [choose t] returns one interval, or raises Not_found if the set is empty *) + + val take : t -> elt -> (t * t) option + (** [take n] returns [Some a, b] where [cardinal a = n] and [diff t a = b] + or [None] if [cardinal t < n] *) + + val union : t -> t -> t + (** set union *) + + val diff : t -> t -> t + (** set difference *) + + val inter : t -> t -> t + (** set intersection *) +end + +module type NUM = sig + type t + + val zero : t + + val pred : t -> t + + val succ : t -> t + + val add : t -> t -> t + + val sub : t -> t -> t + + val mul : t -> t -> t + + val div : t -> t -> t + + val of_int64 : int64 -> t + + val to_int64 : t -> int64 + + val of_int : int -> t + + val to_int : t -> int + + val to_string : t -> string + + val shift_left : t -> int -> t + + val shift_right_logical : t -> int -> t + + val logor : t -> t -> t + + val rem : t -> t -> t +end diff --git a/ocaml/qcow-tool/lib/qcow_types.ml b/ocaml/qcow-tool/lib/qcow_types.ml new file mode 100644 index 00000000000..8b468395421 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_types.ml @@ -0,0 +1,86 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Sexplib.Std +open Qcow_error + +let big_enough_for name buf needed = + let length = Cstruct.length buf in + if length < needed then + error_msg "%s: buffer too small (%d < %d)" name length needed + else + return () + +module Int8 = struct + type t = int [@@deriving sexp] + + let sizeof _ = 1 + + let read buf = + big_enough_for "Int8.read" buf 1 >>= fun () -> + return (Cstruct.get_uint8 buf 0, Cstruct.shift buf 1) + + let write t buf = + big_enough_for "Int8.write" buf 1 >>= fun () -> + Cstruct.set_uint8 buf 0 t ; + return (Cstruct.shift buf 1) +end + +module Int16 = struct + type t = int [@@deriving sexp] + + let sizeof _ = 2 + + let read buf = + big_enough_for "Int16.read" buf 2 >>= fun () -> + return (Cstruct.BE.get_uint16 buf 0, Cstruct.shift buf 2) + + let write t buf = + big_enough_for "Int16.write" buf 2 >>= fun () -> + Cstruct.BE.set_uint16 buf 0 t ; + return (Cstruct.shift buf 2) +end + +module Int32 = struct + include Int32 + + type _t = int32 [@@deriving sexp] + + let sexp_of_t = sexp_of__t + + let t_of_sexp = _t_of_sexp + + let sizeof _ = 4 + + let read buf = + big_enough_for "Int32.read" buf 4 >>= fun () -> + return (Cstruct.BE.get_uint32 buf 0, Cstruct.shift buf 4) + + let write t buf = + big_enough_for "Int32.read" buf 4 >>= fun () -> + Cstruct.BE.set_uint32 buf 0 t ; + return (Cstruct.shift buf 4) +end + +module Int64 = Qcow_int64 +module Int = Qcow_int + +(* +module Cluster = struct + include Qcow_word_size.Cluster +end +*) +module Cluster = Qcow_int64 diff --git a/ocaml/qcow-tool/lib/qcow_types.mli b/ocaml/qcow-tool/lib/qcow_types.mli new file mode 100644 index 00000000000..60e795e777c --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_types.mli @@ -0,0 +1,65 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +(** Parsers and printers for types used in qcow2 fields *) + +open Sexplib + +val big_enough_for : string -> Cstruct.t -> int -> unit Qcow_error.t +(** [big_enough_for name buf length] returns an error with a log message + if buffer [buf] is smaller than [length]. The [name] will be included + in the error message. *) + +module Int8 : sig + type t = int [@@deriving sexp] + + include Qcow_s.SERIALISABLE with type t := t +end + +module Int16 : sig + type t = int [@@deriving sexp] + + include Qcow_s.SERIALISABLE with type t := t +end + +module Int32 : sig + include module type of Int32 + + val t_of_sexp : Sexp.t -> t + + val sexp_of_t : t -> Sexp.t + + include Qcow_s.SERIALISABLE with type t := t +end + +module Int64 : module type of Qcow_int64 + +module Int : module type of Qcow_int + +module Cluster : sig + type t [@@deriving sexp] + + include Qcow_s.NUM with type t := t + + val to_float : t -> float + + val round_up : t -> t -> t + (** [round_up value to] rounds [value] to the next multiple of [to] *) + + module IntervalSet : Qcow_s.INTERVAL_SET with type elt = t + + module Map : Map.S with type key = t +end diff --git a/ocaml/qcow-tool/lib/qcow_virtual.ml b/ocaml/qcow-tool/lib/qcow_virtual.ml new file mode 100644 index 00000000000..6dc80e742d2 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_virtual.ml @@ -0,0 +1,44 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) +open Sexplib.Std + +(* An address in a qcow image is broken into 3 levels: *) +type t = { + l1_index: int64 (* index in the L1 table *) + ; l2_index: int64 (* index in the L2 table *) + ; cluster: int64 (* index within the cluster *) +} +[@@deriving sexp] + +let ( <| ) = Int64.shift_left + +let ( |> ) = Int64.shift_right_logical + +let make ~cluster_bits x = + let l2_bits = cluster_bits - 3 in + let l1_index = x |> l2_bits + cluster_bits in + let l2_index = x <| 64 - l2_bits - cluster_bits |> 64 - l2_bits in + let cluster = x <| 64 - cluster_bits |> 64 - cluster_bits in + {l1_index; l2_index; cluster} + +let to_offset ~cluster_bits t = + let l2_bits = cluster_bits - 3 in + let l1_index = t.l1_index <| l2_bits + cluster_bits in + let l2_index = t.l2_index <| cluster_bits in + Int64.(logor (logor l1_index l2_index) t.cluster) + +let to_string t = Sexplib.Sexp.to_string (sexp_of_t t) diff --git a/ocaml/qcow-tool/lib/qcow_virtual.mli b/ocaml/qcow-tool/lib/qcow_virtual.mli new file mode 100644 index 00000000000..9d9e955463f --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_virtual.mli @@ -0,0 +1,37 @@ +(* + * Copyright (C) 2015 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +(** A virtual address in a qcow image is broken into 3 levels: + - an index in the L1 table, pointing to + - an index in the L2 table, pointing to + - a cluster within which we need an offset *) +type t = { + l1_index: int64 (* index in the L1 table *) + ; l2_index: int64 (* index in the L2 table *) + ; cluster: int64 (* index within the cluster *) +} +[@@deriving sexp] + +val make : cluster_bits:int -> int64 -> t +(** [make cluster_bits byte] computes the address within the file + of the virtual [byte] offset *) + +val to_offset : cluster_bits:int -> t -> int64 +(** [to_offset cluster_bits address] computes the virtual byte offset + of the virtual address *) + +include Qcow_s.PRINTABLE with type t := t diff --git a/ocaml/qcow-tool/lib/qcow_word_size.mli b/ocaml/qcow-tool/lib/qcow_word_size.mli new file mode 100644 index 00000000000..2d8f0256270 --- /dev/null +++ b/ocaml/qcow-tool/lib/qcow_word_size.mli @@ -0,0 +1,31 @@ +(* + * Copyright (C) 2017 David Scott + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + +(** Host system word size dependent types *) + +module Cluster : sig + type t [@@deriving sexp] + + include Qcow_s.NUM with type t := t + + val round_up : t -> t -> t + (** [round_up value to] rounds [value] to the next multiple of [to] *) + + module IntervalSet : Qcow_s.INTERVAL_SET with type elt = t + + module Map : Map.S with type key = t +end diff --git a/ocaml/qcow-tool/pkg/pkg.ml b/ocaml/qcow-tool/pkg/pkg.ml new file mode 100644 index 00000000000..f676f16f2a6 --- /dev/null +++ b/ocaml/qcow-tool/pkg/pkg.ml @@ -0,0 +1,5 @@ +#!/usr/bin/env ocaml + +#use "topfind" + +#require "topkg-jbuilder.auto" diff --git a/ocaml/vhd-tool/src/impl.ml b/ocaml/vhd-tool/src/impl.ml index d067846f565..a4c1f06dfaf 100644 --- a/ocaml/vhd-tool/src/impl.ml +++ b/ocaml/vhd-tool/src/impl.ml @@ -1319,7 +1319,7 @@ let serve common_options source source_fd source_format source_protocol protocol_of_string (require "source-protocol" source_protocol) in - let supported_formats = ["raw"; "vhd"] in + let supported_formats = ["raw"; "vhd"; "qcow2"] in if not (List.mem source_format supported_formats) then failwith (Printf.sprintf "%s is not a supported format" source_format) ; let supported_formats = ["raw"] in @@ -1357,7 +1357,9 @@ let serve common_options source source_fd source_format source_protocol endpoint_of_string source | Some fd -> return - (File_descr (Lwt_unix.of_unix_file_descr (file_descr_of_int fd))) + ( Printf.fprintf stderr "GTNDEBUG: source fd is %d" fd ; + File_descr (Lwt_unix.of_unix_file_descr (file_descr_of_int fd)) + ) ) >>= fun source_endpoint -> ( match source_endpoint with diff --git a/ocaml/xapi-consts/api_errors.ml b/ocaml/xapi-consts/api_errors.ml index 906e22bf259..6ce5ff9c4e7 100644 --- a/ocaml/xapi-consts/api_errors.ml +++ b/ocaml/xapi-consts/api_errors.ml @@ -1424,3 +1424,6 @@ let host_driver_no_hardware = add_error "HOST_DRIVER_NO_HARDWARE" let tls_verification_not_enabled_in_pool = add_error "TLS_VERIFICATION_NOT_ENABLED_IN_POOL" + +let unimplemented_in_qcow_tool_wrapper = + add_error "UNIMPLEMENTED_IN_QCOW_TOOL_WRAPPER" diff --git a/ocaml/xapi/common_tool_wrapper.ml b/ocaml/xapi/common_tool_wrapper.ml new file mode 100644 index 00000000000..c3033054c38 --- /dev/null +++ b/ocaml/xapi/common_tool_wrapper.ml @@ -0,0 +1,60 @@ +(* + * Copyright (C) 2025 Vates. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + *) + +open Xapi_stdext_std.Xstringext + +(** [find_backend_device path] returns [Some path'] where [path'] is the backend path in + the driver domain corresponding to the frontend device [path] in this domain. *) +let find_backend_device path = + try + let open Ezxenstore_core.Xenstore in + (* If we're looking at a xen frontend device, see if the backend + is in the same domain. If so check if it looks like a .vhd *) + let rdev = (Unix.stat path).Unix.st_rdev in + let major = rdev / 256 and minor = rdev mod 256 in + let link = + Unix.readlink (Printf.sprintf "/sys/dev/block/%d:%d/device" major minor) + in + match List.rev (String.split '/' link) with + | id :: "xen" :: "devices" :: _ + when Astring.String.is_prefix ~affix:"vbd-" id -> + let id = int_of_string (String.sub id 4 (String.length id - 4)) in + with_xs (fun xs -> + let self = xs.Xs.read "domid" in + let backend = + xs.Xs.read (Printf.sprintf "device/vbd/%d/backend" id) + in + let params = xs.Xs.read (Printf.sprintf "%s/params" backend) in + match String.split '/' backend with + | "local" :: "domain" :: bedomid :: _ -> + if not (self = bedomid) then + raise + Api_errors.( + Server_error + ( internal_error + , [ + Printf.sprintf + "find_backend_device: Got domid %s but expected \ + %s" + bedomid self + ] + ) + ) ; + Some params + | _ -> + raise Not_found + ) + | _ -> + raise Not_found + with _ -> None diff --git a/ocaml/xapi/export_raw_vdi.ml b/ocaml/xapi/export_raw_vdi.ml index cea32fb5533..5686608c666 100644 --- a/ocaml/xapi/export_raw_vdi.ml +++ b/ocaml/xapi/export_raw_vdi.ml @@ -47,11 +47,16 @@ let localhost_handler rpc session_id vdi (req : Http.Request.t) let copy base_path path size = try debug "Copying VDI contents..." ; - Vhd_tool_wrapper.send ?relative_to:base_path - (Vhd_tool_wrapper.update_task_progress __context) - "none" - (Importexport.Format.to_string format) - s path size "" ; + if format = Qcow then + Qcow_tool_wrapper.send + (Qcow_tool_wrapper.update_task_progress __context) + s path size + else + Vhd_tool_wrapper.send ?relative_to:base_path + (Vhd_tool_wrapper.update_task_progress __context) + "none" + (Importexport.Format.to_string format) + s path size "" ; debug "Copying VDI complete." with Unix.Unix_error (Unix.EIO, _, _) -> raise @@ -73,7 +78,7 @@ let localhost_handler rpc session_id vdi (req : Http.Request.t) in Http_svr.headers s headers ; match format with - | Raw | Vhd -> + | Qcow | Raw | Vhd -> let size = Db.VDI.get_virtual_size ~__context ~self:vdi in if format = Vhd && size > Constants.max_vhd_size then raise diff --git a/ocaml/xapi/import_raw_vdi.ml b/ocaml/xapi/import_raw_vdi.ml index 565c29e7d8e..234fc44f310 100644 --- a/ocaml/xapi/import_raw_vdi.ml +++ b/ocaml/xapi/import_raw_vdi.ml @@ -158,7 +158,7 @@ let localhost_handler rpc session_id vdi_opt (req : Request.t) in Http_svr.headers s headers ; ( match format with - | Raw | Vhd -> + | Raw | Vhd | Qcow -> let prezeroed = not (Sm_fs_ops.must_write_zeroes_into_new_vdi ~__context vdi) diff --git a/ocaml/xapi/importexport.ml b/ocaml/xapi/importexport.ml index a210bda04d6..6ba6769b7ef 100644 --- a/ocaml/xapi/importexport.ml +++ b/ocaml/xapi/importexport.ml @@ -430,9 +430,17 @@ let sr_of_req ~__context (req : Http.Request.t) = None module Format = struct - type t = Raw | Vhd | Tar + type t = Raw | Vhd | Tar | Qcow - let to_string = function Raw -> "raw" | Vhd -> "vhd" | Tar -> "tar" + let to_string = function + | Raw -> + "raw" + | Vhd -> + "vhd" + | Tar -> + "tar" + | Qcow -> + "qcow2" let of_string x = match String.lowercase_ascii x with @@ -442,6 +450,8 @@ module Format = struct Some Vhd | "tar" -> Some Tar + | "qcow2" -> + Some Qcow | _ -> None @@ -457,6 +467,8 @@ module Format = struct "application/vhd" | Tar -> "application/x-tar" + | Qcow -> + "application/x-qemu-disk" let _key = "format" diff --git a/ocaml/xapi/qcow_tool_wrapper.ml b/ocaml/xapi/qcow_tool_wrapper.ml new file mode 100644 index 00000000000..c6b7bf9da52 --- /dev/null +++ b/ocaml/xapi/qcow_tool_wrapper.ml @@ -0,0 +1,93 @@ +(* + * Copyright (C) 2025 Vates. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + *) + +module D = Debug.Make (struct let name = "qcow_tool_wrapper" end) + +open D + +let unimplemented () = + raise + (Api_errors.Server_error (Api_errors.unimplemented_in_qcow_tool_wrapper, [])) + +let run_qcow_tool (progress_cb : int -> unit) (args : string list) + (ufd : Unix.file_descr) = + let qcow_tool = !Xapi_globs.qcow_tool in + info "Executing %s %s" qcow_tool (String.concat " " args) ; + let open Forkhelpers in + let pipe_read, pipe_write = Unix.pipe () in + Xapi_stdext_pervasives.Pervasiveext.finally + (fun () -> + match + with_logfile_fd "qcow-tool" (fun log_fd -> + let ufd_str = Uuidx.(to_string (make ())) in + let pid = + safe_close_and_exec None (Some pipe_write) (Some log_fd) + [(ufd_str, ufd)] + qcow_tool args + in + let _, status = waitpid pid in + if status <> Unix.WEXITED 0 then ( + error "qcow-tool failed, returning VDI_IO_ERROR" ; + raise + (Api_errors.Server_error + (Api_errors.vdi_io_error, ["Device I/O errors"]) + ) + ) + ) + with + | Success (out, _) -> + debug "%s" out + | Failure (out, e) -> + error "qcow-tool output: %s" out ; + raise e + ) + (fun () -> List.iter Unix.close [pipe_read; pipe_write]) + +let update_task_progress (__context : Context.t) (x : int) = + TaskHelper.set_progress ~__context (float_of_int x /. 100.) + +let qcow_of_device path = + let tapdisk_of_path path = + try + match Tapctl.of_device (Tapctl.create ()) path with + | _, str, Some (_, qcow) -> + debug "Found str %s and file %s" str qcow ; + Some qcow + | _ -> + None + with Not_found -> + debug "Device %s has an unknown driver" path ; + None + in + Common_tool_wrapper.find_backend_device path + |> Option.value ~default:path + |> tapdisk_of_path + +let send (progress_cb : int -> unit) (unix_fd : Unix.file_descr) (path : string) + (size : Int64.t) = + debug "Qcow send called with a size of %Ld and path equal to %s" size path ; + let _, source = + match (Stream_vdi.get_nbd_device path, qcow_of_device path) with + | Some (nbd_path, exportname), Some p -> + debug "get_nbd_device (path=%s, exportname=%s), p = %s" nbd_path + exportname p ; + (nbd_path, exportname) + | None, Some p -> + debug "nbd device not found but p = %s" p ; + ("gtn_no_nbd", p) + | _ -> + ("gtn_unknown", "gtn_unknown") + in + let args = ["stream"; "--source"; source; path] in + run_qcow_tool progress_cb args unix_fd diff --git a/ocaml/xapi/vhd_tool_wrapper.ml b/ocaml/xapi/vhd_tool_wrapper.ml index 73f25785eb8..be21d97352a 100644 --- a/ocaml/xapi/vhd_tool_wrapper.ml +++ b/ocaml/xapi/vhd_tool_wrapper.ml @@ -113,42 +113,6 @@ let receive progress_cb format protocol (s : Unix.file_descr) in run_vhd_tool progress_cb args s s' path -(** [find_backend_device path] returns [Some path'] where [path'] is the backend path in - the driver domain corresponding to the frontend device [path] in this domain. *) -let find_backend_device path = - try - let open Ezxenstore_core.Xenstore in - (* If we're looking at a xen frontend device, see if the backend - is in the same domain. If so check if it looks like a .vhd *) - let rdev = (Unix.stat path).Unix.st_rdev in - let major = rdev / 256 and minor = rdev mod 256 in - let link = - Unix.readlink (Printf.sprintf "/sys/dev/block/%d:%d/device" major minor) - in - match List.rev (String.split '/' link) with - | id :: "xen" :: "devices" :: _ - when Astring.String.is_prefix ~affix:"vbd-" id -> - let id = int_of_string (String.sub id 4 (String.length id - 4)) in - with_xs (fun xs -> - let self = xs.Xs.read "domid" in - let backend = - xs.Xs.read (Printf.sprintf "device/vbd/%d/backend" id) - in - let params = xs.Xs.read (Printf.sprintf "%s/params" backend) in - match String.split '/' backend with - | "local" :: "domain" :: bedomid :: _ -> - if not (self = bedomid) then - Helpers.internal_error - "find_backend_device: Got domid %s but expected %s" bedomid - self ; - Some params - | _ -> - raise Not_found - ) - | _ -> - raise Not_found - with _ -> None - (** [vhd_of_device path] returns (Some vhd) where 'vhd' is the vhd leaf backing a particular device [path] or None. [path] may either be a blktap2 device *or* a blkfront device backed by a blktap2 device. If the latter then the script must be run in the same domain as blkback. *) @@ -178,20 +142,27 @@ let vhd_of_device path = debug "Device %s has an unknown driver" path ; None in - find_backend_device path |> Option.value ~default:path |> tapdisk_of_path + Common_tool_wrapper.find_backend_device path + |> Option.value ~default:path + |> tapdisk_of_path let send progress_cb ?relative_to (protocol : string) (dest_format : string) (s : Unix.file_descr) (path : string) (size : Int64.t) (prefix : string) = let s' = Uuidx.(to_string (make ())) in + debug "GTNDEBUG: path is %s" path ; + debug "GTNDEBUG: prefix is %s" prefix ; let source_format, source = match (Stream_vdi.get_nbd_device path, vhd_of_device path, relative_to) with | Some (nbd_server, exportname), _, None -> + debug "GTNDEBUG: nbdhybrid %s:%s:%s:%Ld" path nbd_server exportname size ; ( "nbdhybrid" , Printf.sprintf "%s:%s:%s:%Ld" path nbd_server exportname size ) | Some _, Some vhd, Some _ | None, Some vhd, _ -> + debug "GTNDEBUG: hybrid %s" (path ^ ":" ^ vhd) ; ("hybrid", path ^ ":" ^ vhd) | None, None, None -> + debug "GTNDEBUG: raw %s" path ; ("raw", path) | _, None, Some _ -> let msg = "Cannot compute differences on non-VHD images" in diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml index 89665a13494..61845fc7bd2 100644 --- a/ocaml/xapi/xapi_globs.ml +++ b/ocaml/xapi/xapi_globs.ml @@ -806,6 +806,8 @@ let sparse_dd = ref "sparse_dd" let vhd_tool = ref "vhd-tool" +let qcow_tool = ref "qcow-tool" + let fence = ref "fence" let host_bugreport_upload = ref "host-bugreport-upload" @@ -1769,6 +1771,7 @@ module Resources = struct ) ; ("sparse_dd", sparse_dd, "Path to sparse_dd") ; ("vhd-tool", vhd_tool, "Path to vhd-tool") + ; ("qcow-tool", qcow_tool, "Path to qcow-tool") ; ("fence", fence, "Path to fence binary, used for HA host fencing") ; ( "host-bugreport-upload" , host_bugreport_upload diff --git a/qcow-tool.opam b/qcow-tool.opam new file mode 100644 index 00000000000..e4cf234665f --- /dev/null +++ b/qcow-tool.opam @@ -0,0 +1,48 @@ +# This file is generated by dune, edit dune-project instead +opam-version: "2.0" +synopsis: "Manipulate .qcow files" +maintainer: ["Xapi project maintainers"] +authors: ["xen-api@lists.xen.org"] +license: "LGPL-2.1-only WITH OCaml-LGPL-linking-exception" +tags: ["org.mirage" "org:xapi-project"] +homepage: "https://xapi-project.github.io/" +bug-reports: "https://github.com/xapi-project/xen-api/issues" +depends: [ + "dune" {>= "3.15"} + "asetmap" + "astring" + "cmdliner" + "cstruct" + "ezjsonm" + "fmt" + "io-page" + "logs" + "lwt" + "mirage-block" + "mirage-block-combinators" + "mirage-block-unix" + "mirage-time" + "mirage-types-lwt" + "ounit" + "prometheus" + "result" + "sexplib" + "sha" + "unix-type-representations" + "odoc" {with-doc} +] +build: [ + ["dune" "subst"] {dev} + [ + "dune" + "build" + "-p" + name + "-j" + jobs + "@install" + "@runtest" {with-test} + "@doc" {with-doc} + ] +] +dev-repo: "git+https://github.com/xapi-project/xen-api.git"