Revert "tools: remove blktap2 source code"
author: Wei Liu <wei.liu2@citrix.com>
Thu, 8 Sep 2016 15:15:47 +0000 (16:15 +0100)
committer: Wei Liu <wei.liu2@citrix.com>
Thu, 8 Sep 2016 15:15:47 +0000 (16:15 +0100)
This reverts commit 44b2829a8b97a8b04e063a93303dbe3a468642e3.

126 files changed:
tools/blktap2/Makefile [new file with mode: 0644]
tools/blktap2/README [new file with mode: 0644]
tools/blktap2/control/Makefile [new file with mode: 0644]
tools/blktap2/control/tap-ctl-allocate.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-attach.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-check.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-close.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-create.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-destroy.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-detach.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-free.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-ipc.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-list.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-major.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-open.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-pause.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-spawn.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl-unpause.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl.c [new file with mode: 0644]
tools/blktap2/control/tap-ctl.h [new file with mode: 0644]
tools/blktap2/drivers/Makefile [new file with mode: 0644]
tools/blktap2/drivers/aes.c [new file with mode: 0644]
tools/blktap2/drivers/aes.h [new file with mode: 0644]
tools/blktap2/drivers/atomicio.c [new file with mode: 0644]
tools/blktap2/drivers/blk.h [new file with mode: 0644]
tools/blktap2/drivers/blk_linux.c [new file with mode: 0644]
tools/blktap2/drivers/blk_netbsd.c [new file with mode: 0644]
tools/blktap2/drivers/block-aio.c [new file with mode: 0644]
tools/blktap2/drivers/block-cache.c [new file with mode: 0644]
tools/blktap2/drivers/block-log.c [new file with mode: 0644]
tools/blktap2/drivers/block-qcow.c [new file with mode: 0644]
tools/blktap2/drivers/block-ram.c [new file with mode: 0644]
tools/blktap2/drivers/block-remus.c [new file with mode: 0644]
tools/blktap2/drivers/block-vhd.c [new file with mode: 0644]
tools/blktap2/drivers/bswap.h [new file with mode: 0644]
tools/blktap2/drivers/check_gcrypt [new file with mode: 0644]
tools/blktap2/drivers/hashtable.c [new file with mode: 0644]
tools/blktap2/drivers/hashtable.h [new file with mode: 0644]
tools/blktap2/drivers/hashtable_itr.c [new file with mode: 0644]
tools/blktap2/drivers/hashtable_itr.h [new file with mode: 0644]
tools/blktap2/drivers/hashtable_private.h [new file with mode: 0644]
tools/blktap2/drivers/hashtable_utility.c [new file with mode: 0644]
tools/blktap2/drivers/hashtable_utility.h [new file with mode: 0644]
tools/blktap2/drivers/img2qcow.c [new file with mode: 0644]
tools/blktap2/drivers/io-optimize.c [new file with mode: 0644]
tools/blktap2/drivers/io-optimize.h [new file with mode: 0644]
tools/blktap2/drivers/libaio-compat.h [new file with mode: 0644]
tools/blktap2/drivers/lock.c [new file with mode: 0644]
tools/blktap2/drivers/lock.h [new file with mode: 0644]
tools/blktap2/drivers/log.h [new file with mode: 0644]
tools/blktap2/drivers/md5.c [new file with mode: 0644]
tools/blktap2/drivers/md5.h [new file with mode: 0644]
tools/blktap2/drivers/profile.h [new file with mode: 0644]
tools/blktap2/drivers/qcow-create.c [new file with mode: 0644]
tools/blktap2/drivers/qcow.h [new file with mode: 0644]
tools/blktap2/drivers/qcow2raw.c [new file with mode: 0644]
tools/blktap2/drivers/scheduler.c [new file with mode: 0644]
tools/blktap2/drivers/scheduler.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-client.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-control.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-control.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-diff.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-disktype.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-disktype.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-driver.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-driver.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-filter.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-filter.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-image.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-image.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-interface.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-interface.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-log.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-log.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-queue.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-queue.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-ring.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-ring.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-server.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-server.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-stream.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-utils.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-utils.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-vbd.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-vbd.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk2.c [new file with mode: 0644]
tools/blktap2/drivers/td.c [new file with mode: 0644]
tools/blktap2/drivers/xmsnap [new file with mode: 0644]
tools/blktap2/include/Makefile [new file with mode: 0644]
tools/blktap2/include/atomicio.h [new file with mode: 0644]
tools/blktap2/include/blktap2.h [new file with mode: 0644]
tools/blktap2/include/blktaplib.h [new file with mode: 0644]
tools/blktap2/include/libvhd-journal.h [new file with mode: 0644]
tools/blktap2/include/libvhd.h [new file with mode: 0644]
tools/blktap2/include/list.h [new file with mode: 0644]
tools/blktap2/include/lvm-util.h [new file with mode: 0644]
tools/blktap2/include/relative-path.h [new file with mode: 0644]
tools/blktap2/include/tapdisk-message.h [new file with mode: 0644]
tools/blktap2/include/vhd-util.h [new file with mode: 0644]
tools/blktap2/include/vhd-uuid.h [new file with mode: 0644]
tools/blktap2/include/vhd.h [new file with mode: 0644]
tools/blktap2/lvm/Makefile [new file with mode: 0644]
tools/blktap2/lvm/lvm-util.c [new file with mode: 0644]
tools/blktap2/vhd/Makefile [new file with mode: 0644]
tools/blktap2/vhd/lib/Makefile [new file with mode: 0644]
tools/blktap2/vhd/lib/atomicio.c [new file with mode: 0644]
tools/blktap2/vhd/lib/libvhd-journal.c [new file with mode: 0644]
tools/blktap2/vhd/lib/libvhd.c [new file with mode: 0644]
tools/blktap2/vhd/lib/relative-path.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-check.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-coalesce.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-create.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-fill.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-modify.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-query.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-read.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-repair.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-resize.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-revert.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-scan.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-set-field.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-snapshot.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-uuid.c [new file with mode: 0644]
tools/blktap2/vhd/vhd-update.c [new file with mode: 0644]
tools/blktap2/vhd/vhd-util.c [new file with mode: 0644]

diff --git a/tools/blktap2/Makefile b/tools/blktap2/Makefile
new file mode 100644 (file)
index 0000000..94200dc
--- /dev/null
@@ -0,0 +1,20 @@
+# Top-level Makefile for blktap2: pulls in the shared tools rules and
+# dispatches the standard targets to the subdirectories listed below.
+XEN_ROOT = $(CURDIR)/../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+CFLAGS  += $(CFLAGS_libxenctrl)
+LDLIBS += $(LDLIBS_libxenctrl)
+
+# Subdirectories that are always built.
+SUBDIRS-y :=
+SUBDIRS-y += include
+SUBDIRS-y += lvm
+SUBDIRS-y += vhd
+# These two are appended to SUBDIRS-<value of CONFIG_Linux>, i.e. they are
+# only part of SUBDIRS-y when CONFIG_Linux expands to "y".
+SUBDIRS-$(CONFIG_Linux) += drivers
+SUBDIRS-$(CONFIG_Linux) += control
+
+# Local cleanup in this directory; the static pattern rule at the bottom
+# additionally makes "clean" depend on subdirs-clean.
+clean:
+       rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) TAGS
+
+distclean: clean
+
+.PHONY: all clean install distclean
+# Each target T depends on subdirs-T (presumably provided by Rules.mk to
+# recurse into the SUBDIRS-y list -- confirm there).
+all clean install distclean: %: subdirs-%
diff --git a/tools/blktap2/README b/tools/blktap2/README
new file mode 100644 (file)
index 0000000..75fc614
--- /dev/null
@@ -0,0 +1,321 @@
+Blktap2 Userspace Tools + Library
+================================
+
+Dutch Meyer
+4th June 2009
+
+Andrew Warfield and Julian Chesterfield
+16th June 2006
+
+
+The blktap2 userspace toolkit provides a user-level disk I/O
+interface. The blktap2 mechanism involves a kernel driver that acts
+similarly to the existing Xen/Linux blkback driver, and a set of
+associated user-level libraries.  Using these tools, blktap2 allows
+virtual block devices presented to VMs to be implemented in userspace
+and to be backed by raw partitions, files, network, etc.
+
+The key benefit of blktap2 is that it makes it easy and fast to write
+arbitrary block backends, and that these user-level backends actually
+perform very well.  Specifically:
+
+- Metadata disk formats such as Copy-on-Write, encrypted disks, sparse
+  formats and other compression features can be easily implemented.
+
+- Accessing file-based images from userspace avoids problems related
+  to flushing dirty pages which are present in the Linux loopback
+  driver.  (Specifically, doing a large number of writes to an
+  NFS-backed image doesn't result in the OOM killer going berserk.)
+
+- Per-disk handler processes enable easier userspace policing of block
+  resources, and process-granularity QoS techniques (disk scheduling
+  and related tools) may be trivially applied to block devices.
+
+- It's very easy to take advantage of userspace facilities such as
+  networking libraries, compression utilities, peer-to-peer
+  file-sharing systems and so on to build more complex block backends.
+
+- Crashes are contained -- incremental development/debugging is very
+  fast.
+
+How it works (in one paragraph):
+
+Working in conjunction with the kernel blktap2 driver, all disk I/O
+requests from VMs are passed to the userspace daemon (using a shared
+memory interface) through a character device. Each active disk is
+mapped to an individual device node, allowing per-disk processes to
+implement individual block devices where desired.  The userspace
+drivers are implemented using asynchronous (Linux libaio),
+O_DIRECT-based calls to preserve the unbuffered, batched and
+asynchronous request dispatch achieved with the existing blkback
+code.  We provide a simple, asynchronous virtual disk interface that
+makes it quite easy to add new disk implementations.
+
+As of June 2009 the current supported disk formats are:
+
+ - Raw Images (both on partitions and in image files)
+ - Fast sharable RAM disk between VMs (requires some form of 
+   cluster-based filesystem support e.g. OCFS2 in the guest kernel)
+ - VHD, including snapshots and sparse images
+ - Qcow, including snapshots and sparse images
+
+
+Build and Installation Instructions
+===================================
+
+Make sure to configure the blktap2 backend driver in your dom0 kernel.  It
+will inter-operate with the existing backend and frontend drivers.  It
+will also cohabitate with the original blktap driver.  However, some
+formats (currently aio and qcow) will default to their blktap2
+versions when specified in a vm configuration file.
+
+To build the tools separately, "make && make install" in
+tools/blktap2.
+
+
+Using the Tools
+===============
+
+Preparing an image for boot:
+
+The userspace disk agent is configured to start automatically via xend
+
+Customize the VM config file to use the 'tap:tapdisk' handler,
+followed by the driver type. e.g. for a raw image such as a file or
+partition:
+
+disk = ['tap:tapdisk:aio:<FILENAME>,sda1,w']
+
+Alternatively, the vhd-util tool (installed with make install, or in
+/blktap2/vhd) can be used to build sparse copy-on-write vhd images.
+
+For example, to build a sparse image -
+  vhd-util create -n MyVHDFile -s 1024
+
+This creates a sparse 1GB file named "MyVHDFile" that can be mounted
+and populated with data.
+
+One can also base the image on a raw file -
+  vhd-util snapshot -n MyVHDFile -p SomeRawFile -m
+
+This creates a sparse VHD file named "MyVHDFile" using "SomeRawFile"
+as a parent image.  Copy-on-write semantics ensure that writes will be
+stored in "MyVHDFile" while reads will be directed to the most
+recently written version of the data, either in "MyVHDFile" or
+"SomeRawFile" as is appropriate.  Other options exist as well, consult
+the vhd-util application for the complete set of VHD tools.
+
+VHD files can be mounted automatically in a guest similarly to the
+above AIO example simply by specifying the vhd driver.
+
+disk = ['tap:tapdisk:vhd:<VHD FILENAME>,sda1,w']
+
+
+Snapshots:
+
+Pausing a guest will also plug the corresponding IO queue for blktap2
+devices and stop blktap2 drivers.  This can be used to implement a
+safe live snapshot of qcow and vhd disks.  An example script "xmsnap"
+is shown in the tools/blktap2/drivers directory.  This script will
+perform a live snapshot of a qcow disk.  VHD files can use the
+"vhd-util snapshot" tool discussed above.  If this snapshot command is
+applied to a raw file mounted with tap:tapdisk:AIO, include the -m
+flag and the driver will be reloaded as VHD.  If applied to an already
+mounted VHD file, omit the -m flag.
+
+
+Mounting images in Dom0 using the blktap2 driver
+===============================================
+Tap (and blkback) disks are also mountable in Dom0 without requiring an
+active VM to attach. 
+
+The syntax is -
+  tapdisk2 -n <type>:<full path to file>
+
+For example -
+  tapdisk2  -n aio:/home/images/rawFile.img
+
+When successful the location of the new device will be provided by
+tapdisk2 to stdout and tapdisk2 will terminate.  From that point
+forward control of the device is provided through sysfs in the
+directory-
+
+  /sys/class/blktap2/blktap#/
+
+Where # is a blktap2 device number present in the path that tapdisk2
+printed before terminating.  The sysfs interface is largely intuitive,
+for example, to remove tap device 0 one would-
+  
+  echo 1 > /sys/class/blktap2/blktap0/remove
+
+Similarly, a pause control is available, which can be used to plug
+the request queue of a live running guest.
+
+Previous versions of blktap mounted devices in dom0 by using blkfront
+in dom0 and the xm block-attach command.  This approach is still
+available, though slightly more cumbersome.
+
+
+Tapdisk Development
+===============================================
+
+People regularly ask how to develop their own tapdisk drivers, and
+while it has not yet been well documented, the process is relatively
+easy.  Here I will provide a brief overview.  The best reference, of
+course, comes from the existing drivers.  Specifically,
+blktap2/drivers/block-ram.c and blktap2/drivers/block-aio.c provide
+the clearest examples of simple drivers.
+
+Setup:
+
+First you need to register your new driver with blktap. This is done
+in disktypes.h.  There are five things that you must do.  To
+demonstrate, I will create a disk called "mynewdisk", you can name
+yours freely.
+
+1) Forward declare an instance of struct tap_disk.
+
+e.g. -  
+  extern struct tap_disk tapdisk_mynewdisk;
+
+2) Claim one of the unused disk type numbers, take care to observe the
+MAX_DISK_TYPES macro, increasing the number if necessary.
+
+e.g. -
+  #define DISK_TYPE_MYNEWDISK         10
+
+3) Create an instance of disk_info_t.  The bulk of this file contains examples of these.
+
+e.g. -
+  static disk_info_t mynewdisk_disk = {
+          DISK_TYPE_MYNEWDISK,
+          "My New Disk (mynewdisk)",
+          "mynewdisk",
+          0,
+  #ifdef TAPDISK
+          &tapdisk_mynewdisk,
+  #endif
+  };
+
+A few words about what these mean.  The first field must be the disk
+type number you claimed in step (2).  The second field is a string
+describing your disk, and may contain any relevant info.  The third
+field is the name of your disk as will be used by the tapdisk2 utility
+and xend (for example tapdisk2 -n mynewdisk:/path/to/disk.image, or in
+your xm create config file).  The fourth is binary and determines
+whether you will have one instance of your driver, or many.  Here, a 1
+means that your driver is a singleton and will coordinate access to
+any number of tap devices.  0 is more common, meaning that you will
+have one driver for each device that is created.  The final field
+should contain a reference to the struct tap_disk you created in step
+(1).
+
+4) Add a reference to your disk info structure (from step (3)) to the
+dtypes array.  Take care here - you need to place it in the position
+corresponding to the device type number you claimed in step (2).  So
+we would place &mynewdisk_disk in dtypes[10].  Look at the other
+devices in this array and pad with "&null_disk," as necessary.
+
+5) Modify the xend python scripts.  You need to add your disk name to
+the list of disks that xend recognizes.
+
+edit:
+  tools/python/xen/xend/server/BlktapController.py
+
+And add your disk to the "blktap_disk_types" array near the top of
+your file.  Use the same name you specified in the third field of step
+(3).  The order of this list is not important.
+
+
+Now your driver is ready to be written.  Create a block-mynewdisk.c in
+tools/blktap2/drivers and add it to the Makefile.
+
+
+Development:
+
+Copying block-aio.c and block-ram.c would be a good place to start.
+Read those files as you go through this, I will be assisting by
+commenting on a few useful functions and structures.
+
+struct tap_disk:
+
+Remember the forward declaration in step (1) of the setup phase above?
+Now is the time to make that structure a reality.  This structure
+contains a list of function pointers for all the routines that will be
+asked of your driver.  Currently the required functions are open,
+close, read, write, get_parent_id, validate_parent, and debug.
+
+e.g. -
+  struct tap_disk tapdisk_mynewdisk = {
+          .disk_type          = "tapdisk_mynewdisk",
+          .flags              = 0,
+          .private_data_size  = sizeof(struct tdmynewdisk_state),
+          .td_open            = tdmynewdisk_open,
+                 ....
+
+The private_data_size field is used to provide a structure to store
+the state of your device.  It is very likely that you will want
+something here, but you are free to design whatever structure you
+want.  Blktap will allocate this space for you, you just need to tell
+it how much space you want.
+
+
+tdmynewdisk_open:
+
+This is the open routine.  The first argument is a structure
+representing your driver.  Two fields in this structure are
+interesting. 
+
+driver->data will contain a block of memory of the size you requested
+in the .private_data_size field of your struct tap_disk (above).
+
+driver->info contains a structure that details information about your
+disk.  You need to fill this out.  By convention this is done with a
+_get_image_info() function.  Assign a size (the total number of
+sectors), sector_size (the size of each sector in bytes), and set
+driver->info->info to 0.
+
+The second parameter contains the name that was specified in the
+creation of your device, either through xend, or on the command line
+with tapdisk2.  Usually this specifies a file that you will open in
+this routine.  The final parameter, flags, contains one of a number of
+flags specified in tapdisk.h that may change the way you treat the
+disk.
+
+
+_queue_read/write:
+
+These are your read and write operations.  What you do here will
+depend on your disk, but you should do exactly one of- 
+
+1) call td_complete_request with either error or success code.
+
+2) Call td_forward_request, which will forward the request to the next
+driver in the stack.
+
+3) Queue the request for asynchronous processing with
+td_prep_read/write.  In doing so, you will also register a callback
+for request completion.  When the request completes you must do one of
+options (1) or (2) above.  Finally, call td_queue_tiocb to submit the
+request to a wait queue.
+
+The above functions are defined in tapdisk-interface.c.  If you don't
+use them as specified you will run into problems as your driver will
+fail to inform blktap of the state of requests that have been
+submitted.  Blktap keeps track of all requests and does not like losing track.
+
+
+_close, _get_parent_id, _validate_parent:
+
+These last few tend to be very routine.  _close is called when the
+device is closed, and also when it is paused (in this case, open will
+also be called later).  The other functions are used in stacking
+drivers.  Most often drivers will return TD_NO_PARENT and -EINVAL,
+respectively.
+
+
+
+
+
+
diff --git a/tools/blktap2/control/Makefile b/tools/blktap2/control/Makefile
new file mode 100644 (file)
index 0000000..767f52a
--- /dev/null
@@ -0,0 +1,80 @@
+# Builds the tap-ctl command-line tool together with the libblktapctl
+# control library (static archive plus versioned shared object).
+XEN_ROOT := $(CURDIR)/../../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+MAJOR              = 1.0
+MINOR              = 0
+LIBNAME            = libblktapctl
+LIBSONAME          = $(LIBNAME).so.$(MAJOR)
+
+# Binary installed into $(sbindir).
+IBIN               = tap-ctl
+
+CFLAGS            += -Werror
+CFLAGS            += -Wno-unused
+CFLAGS            += -I../include -I../drivers
+CFLAGS            += $(CFLAGS_xeninclude)
+CFLAGS            += $(CFLAGS_libxenctrl)
+CFLAGS            += -D_GNU_SOURCE
+CFLAGS            += -DTAPCTL
+
+# Objects that make up the control library.
+CTL_OBJS  := tap-ctl-ipc.o
+CTL_OBJS  += tap-ctl-list.o
+CTL_OBJS  += tap-ctl-allocate.o
+CTL_OBJS  += tap-ctl-free.o
+CTL_OBJS  += tap-ctl-create.o
+CTL_OBJS  += tap-ctl-destroy.o
+CTL_OBJS  += tap-ctl-spawn.o
+CTL_OBJS  += tap-ctl-attach.o
+CTL_OBJS  += tap-ctl-detach.o
+CTL_OBJS  += tap-ctl-open.o
+CTL_OBJS  += tap-ctl-close.o
+CTL_OBJS  += tap-ctl-pause.o
+CTL_OBJS  += tap-ctl-unpause.o
+CTL_OBJS  += tap-ctl-major.o
+CTL_OBJS  += tap-ctl-check.o
+
+# Position-independent variants, used for the shared library.
+CTL_PICS  = $(patsubst %.o,%.opic,$(CTL_OBJS))
+
+OBJS = $(CTL_OBJS) tap-ctl.o
+PICS = $(CTL_PICS)
+
+LIB_STATIC = $(LIBNAME).a
+LIB_SHARED = $(LIBSONAME).$(MINOR)
+
+all: build
+
+build: $(IBIN) $(LIB_STATIC) $(LIB_SHARED)
+
+# Symlink chain: libblktapctl.so -> $(LIBSONAME) -> $(LIB_SHARED)
+$(LIBNAME).so: $(LIBSONAME)
+       ln -sf $< $@
+
+$(LIBSONAME): $(LIB_SHARED)
+       ln -sf $< $@
+
+tap-ctl: tap-ctl.o $(LIBNAME).so
+       $(CC) $(LDFLAGS) -o $@ $^ $(APPEND_LDFLAGS)
+
+$(LIB_STATIC): $(CTL_OBJS)
+       $(AR) r $@ $^
+
+$(LIB_SHARED): $(CTL_PICS)
+       $(CC) $(LDFLAGS) -fPIC  -Wl,$(SONAME_LDFLAG) -Wl,$(LIBSONAME) $(SHLIB_LDFLAGS) -rdynamic $^ -o $@  $(APPEND_LDFLAGS)
+
+# Create both destination directories before copying into them, so that
+# installing this directory on its own into an empty DESTDIR works.
+install: $(IBIN) $(LIB_STATIC) $(LIB_SHARED)
+       $(INSTALL_DIR) -p $(DESTDIR)$(sbindir)
+       $(INSTALL_DIR) -p $(DESTDIR)$(libdir)
+       $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(sbindir)
+       $(INSTALL_DATA) $(LIB_STATIC) $(DESTDIR)$(libdir)
+       $(INSTALL_PROG) $(LIB_SHARED) $(DESTDIR)$(libdir)
+       ln -sf $(LIBSONAME) $(DESTDIR)$(libdir)/$(LIBNAME).so
+       ln -sf $(LIB_SHARED) $(DESTDIR)$(libdir)/$(LIBSONAME)
+
+clean:
+       rm -f $(OBJS) $(PICS) $(DEPS) $(IBIN) $(LIB_STATIC) $(LIB_SHARED)
+       rm -f $(LIBNAME).so $(LIBSONAME)
+       rm -f *~
+
+distclean: clean
+
+.PHONY: all build clean distclean install
+
+-include $(DEPS)
diff --git a/tools/blktap2/control/tap-ctl-allocate.c b/tools/blktap2/control/tap-ctl-allocate.c
new file mode 100644 (file)
index 0000000..8a6471e
--- /dev/null
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <linux/major.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Make sure directory @dir exists, creating every missing path component
+ * with mode 0755 along the way.
+ *
+ * Returns 0 when @dir is already readable and writable, or when every
+ * component could be created or already existed; ENOMEM when the path
+ * cannot be duplicated; otherwise the errno of the failing mkdir().
+ */
+static int
+tap_ctl_prepare_directory(const char *dir)
+{
+       int err;
+       char *ptr, *name, *start;
+
+       /* fast path: nothing to do when the directory is already usable */
+       err = access(dir, W_OK | R_OK);
+       if (!err)
+               return 0;
+
+       /* work on a private copy so the path can be cut at each '/' */
+       name = strdup(dir);
+       if (!name)
+               return ENOMEM;
+
+       start = name;
+
+       for (;;) {
+               /*
+                * Locate the end of the next component.  Starting one
+                * character in keeps a leading '/' from producing an empty
+                * path; the NUL check keeps a trailing '/' from pushing
+                * the scan beyond the end of the string.
+                */
+               ptr = *start ? strchr(start + 1, '/') : NULL;
+               if (ptr)
+                       *ptr = '\0';
+
+               /* an already existing component is not an error */
+               err = 0;
+               if (mkdir(name, 0755) && errno != EEXIST) {
+                       /* capture errno before logging can overwrite it */
+                       err = errno;
+                       PERROR("mkdir %s", name);
+                       break;
+               }
+
+               if (!ptr)
+                       break;
+
+               /* restore the separator and continue with the next part */
+               *ptr = '/';
+               start = ptr + 1;
+       }
+
+       free(name);
+       return err;
+}
+
+/*
+ * (Re)create the device node @devname with the file type and permission
+ * bits in @perm and the device number (@major, @minor).  The parent
+ * directory is created first if needed, and any entry already present at
+ * @devname is removed before the new node is made.
+ *
+ * Returns 0 on success, ENOMEM when the path cannot be duplicated, or the
+ * errno of the failing directory creation, unlink() or mknod().
+ */
+static int
+tap_ctl_make_device(const char *devname, const int major,
+                   const int minor, const int perm)
+{
+       int err;
+       char *copy;
+
+       /* dirname() may modify its argument, so give it a scratch copy */
+       copy = strdup(devname);
+       if (!copy)
+               return ENOMEM;
+
+       err = tap_ctl_prepare_directory(dirname(copy));
+       free(copy);
+
+       if (err)
+               return err;
+
+       /*
+        * Drop a stale node.  Unlinking directly (and tolerating ENOENT)
+        * avoids the check-then-act window of access() + unlink() and also
+        * removes dangling symlinks, which access() reports as missing.
+        */
+       if (unlink(devname) && errno != ENOENT) {
+               /* capture errno before logging can overwrite it */
+               err = errno;
+               PERROR("unlink %s", devname);
+               return err;
+       }
+
+       if (mknod(devname, perm, makedev(major, minor))) {
+               err = errno;
+               PERROR("mknod %s", devname);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Make sure the blktap2 control directory and control device node exist.
+ *
+ * If the control device is not already readable and writable, its misc
+ * minor number is looked up by name in /proc/misc and the node is created
+ * as a character device with MISC_MAJOR.
+ *
+ * Returns 0 on success, ENOSYS when BLKTAP2_CONTROL_NAME is not listed in
+ * /proc/misc, or the error of the failing step.
+ */
+static int
+tap_ctl_check_environment(void)
+{
+       FILE *f;
+       int err, minor;
+       char name[256];
+
+       err = tap_ctl_prepare_directory(BLKTAP2_CONTROL_DIR);
+       if (err)
+               return err;
+
+       if (!access(BLKTAP2_CONTROL_DEVICE, R_OK | W_OK))
+               return 0;
+
+       memset(name, 0, sizeof(name));
+
+       f = fopen("/proc/misc", "r");
+       if (!f) {
+               /* capture errno before logging can overwrite it */
+               err = errno;
+               EPRINTF("failed to open /proc/misc: %d\n", err);
+               return err;
+       }
+
+       /*
+        * Each line of /proc/misc is "<minor> <name>".  The field width is
+        * 255 so that the terminating NUL still fits into name[256].
+        */
+       while (fscanf(f, "%d %255s", &minor, name) == 2)
+               if (!strcmp(name, BLKTAP2_CONTROL_NAME)) {
+                       err = tap_ctl_make_device(BLKTAP2_CONTROL_DEVICE,
+                                                 MISC_MAJOR,
+                                                 minor, S_IFCHR | 0600);
+                       goto out;
+               }
+
+       err = ENOSYS;
+       EPRINTF("didn't find %s in /proc/misc\n", BLKTAP2_CONTROL_NAME);
+
+out:
+       fclose(f);
+       return err;
+}
+
+/*
+ * Ask the blktap2 control device for a new tap (BLKTAP2_IOCTL_ALLOC_TAP)
+ * and create its two device nodes: the ring character device and the IO
+ * block device.
+ *
+ * @minor:   set to the new tap's minor number on success, -1 otherwise.
+ * @devname: must not be NULL.  If *devname is non-NULL it is used as the
+ *           path of the IO device node; otherwise a path is built from
+ *           BLKTAP2_IO_DEVICE and the minor number and stored in *devname
+ *           (heap memory from asprintf(); presumably freed by the caller).
+ *           On failure a path allocated here is freed again and *devname
+ *           is reset to NULL, so nothing is left behind for the caller.
+ *
+ * Returns 0 on success or a positive errno-style code.  If a step fails
+ * after the tap has been allocated, it is released with tap_ctl_free().
+ */
+static int
+tap_ctl_allocate_device(int *minor, char **devname)
+{
+       char *name;
+       int fd, err;
+       int allocated = 0;
+       struct blktap2_handle handle;
+
+       *minor = -1;
+       if (!devname)
+               return EINVAL;
+
+       fd = open(BLKTAP2_CONTROL_DEVICE, O_RDONLY);
+       if (fd == -1) {
+               /* capture errno before logging can overwrite it */
+               err = errno;
+               EPRINTF("failed to open control device: %d\n", err);
+               return err;
+       }
+
+       err = 0;
+       if (ioctl(fd, BLKTAP2_IOCTL_ALLOC_TAP, &handle) == -1)
+               err = errno;    /* read before close() can change it */
+       close(fd);
+       if (err) {
+               EPRINTF("failed to allocate new device: %d\n", err);
+               return err;
+       }
+
+       /* ring device: character node named BLKTAP2_RING_DEVICE<minor> */
+       err = asprintf(&name, "%s%d", BLKTAP2_RING_DEVICE, handle.minor);
+       if (err == -1) {
+               err = ENOMEM;
+               goto fail;
+       }
+
+       err = tap_ctl_make_device(name, handle.ring,
+                                 handle.minor, S_IFCHR | 0600);
+       free(name);
+       if (err) {
+               EPRINTF("creating ring device for %d failed: %d\n",
+                       handle.minor, err);
+               goto fail;
+       }
+
+       /* IO device: block node at the caller's path or a generated one */
+       if (*devname)
+               name = *devname;
+       else {
+               err = asprintf(&name, "%s%d",
+                              BLKTAP2_IO_DEVICE, handle.minor);
+               if (err == -1) {
+                       err = ENOMEM;
+                       goto fail;
+               }
+               *devname = name;
+               allocated = 1;
+       }
+
+       err = tap_ctl_make_device(name, handle.device,
+                                 handle.minor, S_IFBLK | 0600);
+       if (err) {
+               EPRINTF("creating IO device for %d failed: %d\n",
+                       handle.minor, err);
+               goto fail;
+       }
+
+       DBG("new interface: ring: %u, device: %u, minor: %u\n",
+           handle.ring, handle.device, handle.minor);
+
+       *minor = handle.minor;
+       return 0;
+
+fail:
+       /* do not hand a path generated here back to the caller on error */
+       if (allocated) {
+               free(*devname);
+               *devname = NULL;
+       }
+       tap_ctl_free(handle.minor);
+       return err;
+}
+
+/*
+ * Allocate a new tap device.
+ *
+ * *minor is preset to -1, then the control environment is checked and,
+ * if that succeeds, the device is allocated with the given @minor and
+ * @devname arguments.
+ *
+ * Returns 0 on success, otherwise the non-zero error reported by the
+ * step that failed.
+ */
+int
+tap_ctl_allocate(int *minor, char **devname)
+{
+       int rc;
+
+       *minor = -1;
+
+       rc = tap_ctl_check_environment();
+       if (rc == 0)
+               rc = tap_ctl_allocate_device(minor, devname);
+
+       return rc;
+}
diff --git a/tools/blktap2/control/tap-ctl-attach.c b/tools/blktap2/control/tap-ctl-attach.c
new file mode 100644 (file)
index 0000000..3cb933c
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Send a TAPDISK_MESSAGE_ATTACH request, with `minor` as the message cookie,
+ * to the tapdisk identified by `id` and wait for its reply (timeout value 5).
+ *
+ * Returns the transport error if the exchange fails, the error field of an
+ * ATTACH_RSP reply otherwise, or EINVAL if the reply has any other type.
+ */
+int
+tap_ctl_attach(const int id, const int minor)
+{
+       tapdisk_message_t msg;
+       int rc;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.cookie = minor;
+       msg.type   = TAPDISK_MESSAGE_ATTACH;
+
+       rc = tap_ctl_connect_send_and_receive(id, &msg, 5);
+       if (rc)
+               return rc;
+
+       /* Anything but the matching response type is a protocol error. */
+       if (msg.type != TAPDISK_MESSAGE_ATTACH_RSP) {
+               EPRINTF("got unexpected result '%s' from %d\n",
+                       tapdisk_message_name(msg.type), id);
+               return EINVAL;
+       }
+
+       rc = msg.u.response.error;
+       if (rc)
+               EPRINTF("attach failed: %d\n", rc);
+
+       return rc;
+}
diff --git a/tools/blktap2/control/tap-ctl-check.c b/tools/blktap2/control/tap-ctl-check.c
new file mode 100644 (file)
index 0000000..e98583a
--- /dev/null
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Check that the blktap control device is listed in /proc/misc.
+ *
+ * Each "<minor> <name>" entry is read and its name compared against
+ * BLKTAP2_CONTROL_NAME.
+ *
+ * Returns 0 when a matching entry is found (*msg is left untouched),
+ * -errno when /proc/misc cannot be opened, or -ENOSYS when no entry
+ * matches. On failure *msg points at a static description of the problem.
+ */
+int
+tap_ctl_check_blktap(const char **msg)
+{
+       FILE *f;
+       int err = 0, minor;
+       char name[32];
+
+       memset(name, 0, sizeof(name));
+
+       f = fopen("/proc/misc", "r");
+       if (!f) {
+               *msg = "failed to open /proc/misc";
+               return -errno;
+       }
+
+       /*
+        * The %s field width does not count the terminating NUL, so a
+        * 32-byte buffer may only accept 31 characters.
+        */
+       while (fscanf(f, "%d %31s", &minor, name) == 2) {
+               if (!strcmp(name, BLKTAP2_CONTROL_NAME))
+                       goto out;
+       }
+
+       err = -ENOSYS;
+       *msg = "blktap kernel module not installed";
+
+out:
+       fclose(f);
+       return err;
+}
+
+/*
+ * Run the tap-ctl sanity checks.
+ *
+ * Currently this only verifies that the blktap control device is present
+ * (tap_ctl_check_blktap()). On success *msg is set to "ok" and 0 is
+ * returned; on failure the check's error code is returned and *msg is
+ * whatever the failing check stored.
+ */
+int
+tap_ctl_check(const char **msg)
+{
+       int err;
+
+       err = tap_ctl_check_blktap(msg);
+       if (err)
+               return err;
+
+       *msg = "ok";
+       return 0;
+}
diff --git a/tools/blktap2/control/tap-ctl-close.c b/tools/blktap2/control/tap-ctl-close.c
new file mode 100644 (file)
index 0000000..2e5f80b
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Make one close attempt against tapdisk `id`.
+ *
+ * Sends TAPDISK_MESSAGE_CLOSE, or TAPDISK_MESSAGE_FORCE_SHUTDOWN when
+ * `force` is non-zero, with `minor` as the message cookie. In both cases the
+ * expected reply type is TAPDISK_MESSAGE_CLOSE_RSP.
+ *
+ * Returns the transport error if the exchange fails, the reply's error field
+ * for a CLOSE_RSP, or EINVAL for any other reply type.
+ */
+static int
+__tap_ctl_close(const int id, const int minor, const int force)
+{
+       tapdisk_message_t msg;
+       int rc;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.cookie = minor;
+       if (force)
+               msg.type = TAPDISK_MESSAGE_FORCE_SHUTDOWN;
+       else
+               msg.type = TAPDISK_MESSAGE_CLOSE;
+
+       rc = tap_ctl_connect_send_and_receive(id, &msg, 5);
+       if (rc)
+               return rc;
+
+       if (msg.type != TAPDISK_MESSAGE_CLOSE_RSP) {
+               EPRINTF("got unexpected result '%s' from %d\n",
+                       tapdisk_message_name(msg.type), id);
+               return EINVAL;
+       }
+
+       rc = msg.u.response.error;
+       if (rc)
+               EPRINTF("close failed: %d\n", rc);
+
+       return rc;
+}
+
+/*
+ * Close the device `minor` on tapdisk `id`, retrying while it is busy.
+ *
+ * Up to 20 attempts are made, sleeping 1000 microseconds after each attempt
+ * that fails with EAGAIN (either sign). Returns 0 on success, the absolute
+ * value of any other error immediately, or EIO once the attempts run out.
+ */
+int
+tap_ctl_close(const int id, const int minor, const int force)
+{
+       int attempt = 0;
+
+       while (attempt < 20) {
+               int rc = __tap_ctl_close(id, minor, force);
+
+               if (rc == 0)
+                       return 0;
+
+               /* Callees report errors with either sign; normalise. */
+               if (rc < 0)
+                       rc = -rc;
+
+               if (rc != EAGAIN) {
+                       EPRINTF("close failed: %d\n", rc);
+                       return rc;
+               }
+
+               usleep(1000);
+               attempt++;
+       }
+
+       EPRINTF("close timed out\n");
+       return EIO;
+}
diff --git a/tools/blktap2/control/tap-ctl-create.c b/tools/blktap2/control/tap-ctl-create.c
new file mode 100644 (file)
index 0000000..f4c47f1
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Allocate a device, spawn a tapdisk, attach it to the device and open
+ * `params` on it.
+ *
+ * Returns 0 on success. On failure the steps already completed are undone:
+ * a failed open detaches first, and every failure after allocation frees the
+ * minor. The error of the failing step is returned (for a failed spawn that
+ * is the negative id it returned).
+ */
+int
+tap_ctl_create(const char *params, char **devname)
+{
+       int minor, id, rc;
+
+       rc = tap_ctl_allocate(&minor, devname);
+       if (rc)
+               return rc;
+
+       id = tap_ctl_spawn();
+       if (id < 0) {
+               rc = id;
+       } else {
+               rc = tap_ctl_attach(id, minor);
+               if (!rc) {
+                       rc = tap_ctl_open(id, minor, params);
+                       if (!rc)
+                               return 0;
+
+                       /* Open failed: undo the attach before freeing. */
+                       tap_ctl_detach(id, minor);
+               }
+       }
+
+       tap_ctl_free(minor);
+       return rc;
+}
diff --git a/tools/blktap2/control/tap-ctl-destroy.c b/tools/blktap2/control/tap-ctl-destroy.c
new file mode 100644 (file)
index 0000000..dc5dbaa
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Tear down the device `minor` served by tapdisk `id`: close it (without
+ * force), detach it, then free the minor. The sequence stops at the first
+ * step that fails and returns that step's error; 0 means all three succeeded.
+ */
+int
+tap_ctl_destroy(const int id, const int minor)
+{
+       int rc;
+
+       rc = tap_ctl_close(id, minor, 0);
+       if (!rc)
+               rc = tap_ctl_detach(id, minor);
+       if (!rc)
+               rc = tap_ctl_free(minor);
+
+       return rc;
+}
diff --git a/tools/blktap2/control/tap-ctl-detach.c b/tools/blktap2/control/tap-ctl-detach.c
new file mode 100644 (file)
index 0000000..7d7bbf3
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Send a TAPDISK_MESSAGE_DETACH request, with `minor` as the message cookie,
+ * to the tapdisk identified by `id` and wait for its reply (timeout value 5).
+ *
+ * Returns the transport error if the exchange fails, the error field of a
+ * DETACH_RSP reply otherwise, or EINVAL if the reply has any other type.
+ *
+ * Failures are reported through EPRINTF, and for any non-zero error, to
+ * match the attach and close paths.
+ */
+int
+tap_ctl_detach(const int id, const int minor)
+{
+       int err;
+       tapdisk_message_t message;
+
+       memset(&message, 0, sizeof(message));
+       message.type = TAPDISK_MESSAGE_DETACH;
+       message.cookie = minor;
+
+       err = tap_ctl_connect_send_and_receive(id, &message, 5);
+       if (err)
+               return err;
+
+       if (message.type == TAPDISK_MESSAGE_DETACH_RSP) {
+               err = message.u.response.error;
+               if (err)
+                       EPRINTF("detach failed: %d\n", err);
+       } else {
+               EPRINTF("got unexpected result '%s' from %d\n",
+                       tapdisk_message_name(message.type), id);
+               err = EINVAL;
+       }
+
+       return err;
+}
diff --git a/tools/blktap2/control/tap-ctl-free.c b/tools/blktap2/control/tap-ctl-free.c
new file mode 100644 (file)
index 0000000..9ae7295
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/ioctl.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Release the blktap device `minor` via BLKTAP2_IOCTL_FREE_TAP on the
+ * control device.
+ *
+ * Returns the errno from open() if the control device cannot be opened;
+ * otherwise returns the raw ioctl() result.
+ */
+int
+tap_ctl_free(const int minor)
+{
+       int fd, err;
+
+       fd = open(BLKTAP2_CONTROL_DEVICE, O_RDONLY);
+       if (fd == -1) {
+               /* Logging may overwrite errno, so capture it first. */
+               err = errno;
+               EPRINTF("failed to open control device: %d\n", err);
+               return err;
+       }
+
+       err = ioctl(fd, BLKTAP2_IOCTL_FREE_TAP, minor);
+       close(fd);
+
+       return err;
+}
diff --git a/tools/blktap2/control/tap-ctl-ipc.c b/tools/blktap2/control/tap-ctl-ipc.c
new file mode 100644 (file)
index 0000000..c7e42d9
--- /dev/null
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/un.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Global debug flag, off (0) by default.
+ * NOTE(review): presumably consulted by the DBG() macro -- confirm in
+ * tap-ctl.h.
+ */
+int tap_ctl_debug = 0;
+
+/*
+ * Read exactly one tapdisk_message_t from `fd` into `message`.
+ *
+ * `timeout` is in seconds; 0 means block indefinitely. The same timeval is
+ * reused across select() calls and is not reinitialised between them.
+ *
+ * Returns 0 once the whole message has been read, or -EIO on timeout, EOF
+ * or any read/select error other than EINTR.
+ */
+int
+tap_ctl_read_message(int fd, tapdisk_message_t *message, int timeout)
+{
+       fd_set readfds;
+       int ret, len, offset;
+       struct timeval tv, *t;
+       char *buf;
+
+       t      = NULL;
+       offset = 0;
+       len    = sizeof(tapdisk_message_t);
+       /*
+        * Byte-wise cursor: arithmetic on `message` itself would advance in
+        * units of whole messages and run past the buffer after a short read.
+        */
+       buf    = (char *)message;
+
+       if (timeout) {
+               tv.tv_sec  = timeout;
+               tv.tv_usec = 0;
+               t = &tv;
+       }
+
+       memset(message, 0, sizeof(tapdisk_message_t));
+
+       while (offset < len) {
+               FD_ZERO(&readfds);
+               FD_SET(fd, &readfds);
+
+               ret = select(fd + 1, &readfds, NULL, NULL, t);
+               if (ret == -1) {
+                       if (errno == EINTR)
+                               continue;
+                       break;
+               }
+               else if (FD_ISSET(fd, &readfds)) {
+                       ret = read(fd, buf + offset, len - offset);
+                       /*
+                        * errno is only meaningful when read() failed; a
+                        * return of 0 is EOF and must end the loop.
+                        */
+                       if (ret == -1 && errno == EINTR)
+                               continue;
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break;
+       }
+
+       if (offset != len) {
+               EPRINTF("failure reading message\n");
+               return -EIO;
+       }
+
+       DBG("received '%s' message (uuid = %u)\n",
+           tapdisk_message_name(message->type), message->cookie);
+
+       return 0;
+}
+
+/*
+ * Write exactly one tapdisk_message_t from `message` to `fd`.
+ *
+ * `timeout` is in seconds; 0 means block indefinitely.
+ *
+ * Returns 0 once the whole message has been written, or -EIO on timeout or
+ * any write/select error other than EINTR.
+ */
+int
+tap_ctl_write_message(int fd, tapdisk_message_t *message, int timeout)
+{
+       fd_set writefds;
+       int ret, len, offset;
+       struct timeval tv, *t;
+       const char *buf;
+
+       t      = NULL;
+       offset = 0;
+       len    = sizeof(tapdisk_message_t);
+       /*
+        * Byte-wise cursor: arithmetic on `message` itself would advance in
+        * units of whole messages and run past the buffer after a short
+        * write.
+        */
+       buf    = (const char *)message;
+
+       if (timeout) {
+               tv.tv_sec  = timeout;
+               tv.tv_usec = 0;
+               t = &tv;
+       }
+
+       DBG("sending '%s' message (uuid = %u)\n",
+           tapdisk_message_name(message->type), message->cookie);
+
+       while (offset < len) {
+               FD_ZERO(&writefds);
+               FD_SET(fd, &writefds);
+
+               /* we don't bother reinitializing tv. at worst, it will wait a
+                * bit more time than expected. */
+
+               ret = select(fd + 1, NULL, &writefds, NULL, t);
+               if (ret == -1) {
+                       if (errno == EINTR)
+                               continue;
+                       break;
+               }
+               else if (FD_ISSET(fd, &writefds)) {
+                       ret = write(fd, buf + offset, len - offset);
+                       /* errno is only meaningful when write() failed. */
+                       if (ret == -1 && errno == EINTR)
+                               continue;
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break;
+       }
+
+       if (offset != len) {
+               EPRINTF("failure writing message\n");
+               return -EIO;
+       }
+
+       return 0;
+}
+
+/*
+ * Write `message` to the connected socket `sfd`, then read the reply back
+ * into the same buffer. `timeout` is passed to both halves of the exchange.
+ *
+ * Returns 0 on success or the error of whichever half failed.
+ */
+int
+tap_ctl_send_and_receive(int sfd, tapdisk_message_t *message, int timeout)
+{
+       int rc;
+
+       rc = tap_ctl_write_message(sfd, message, timeout);
+       if (rc != 0) {
+               EPRINTF("failed to send '%s' message\n",
+                       tapdisk_message_name(message->type));
+               return rc;
+       }
+
+       rc = tap_ctl_read_message(sfd, message, timeout);
+       if (rc != 0)
+               EPRINTF("failed to receive '%s' message\n",
+                       tapdisk_message_name(message->type));
+
+       return rc;
+}
+
+/*
+ * Build the control-socket path for tapdisk `id`:
+ * BLKTAP2_CONTROL_DIR "/" BLKTAP2_CONTROL_SOCKET followed by the id.
+ *
+ * Returns a heap-allocated string the caller must free(), or NULL if
+ * asprintf() fails.
+ */
+char *
+tap_ctl_socket_name(int id)
+{
+       char *path;
+       int n;
+
+       n = asprintf(&path, "%s/%s%d",
+                    BLKTAP2_CONTROL_DIR, BLKTAP2_CONTROL_SOCKET, id);
+
+       return (n == -1) ? NULL : path;
+}
+
+/*
+ * Connect to the UNIX stream socket at path `name`.
+ *
+ * On success stores the connected descriptor in *sfd and returns 0.
+ * On failure *sfd is -1 and a negative errno value is returned
+ * (-ENAMETOOLONG if `name` does not fit in sockaddr_un.sun_path).
+ */
+int
+tap_ctl_connect(const char *name, int *sfd)
+{
+       int fd, err;
+       size_t len;
+       struct sockaddr_un saddr;
+
+       *sfd = -1;
+
+       /* sun_path is a fixed-size array; refuse paths that cannot fit. */
+       len = strlen(name);
+       if (len >= sizeof(saddr.sun_path)) {
+               EPRINTF("socket path %s too long\n", name);
+               return -ENAMETOOLONG;
+       }
+
+       fd = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (fd == -1) {
+               /* Save errno before logging can overwrite it. */
+               err = -errno;
+               EPRINTF("couldn't create socket for %s: %d\n", name, -err);
+               return err;
+       }
+
+       memset(&saddr, 0, sizeof(saddr));
+       saddr.sun_family = AF_UNIX;
+       memcpy(saddr.sun_path, name, len + 1);
+
+       if (connect(fd, (const struct sockaddr *)&saddr, sizeof(saddr))) {
+               /* Save errno before logging/close() can overwrite it. */
+               err = -errno;
+               EPRINTF("couldn't connect to %s: %d\n", name, -err);
+               close(fd);
+               return err;
+       }
+
+       *sfd = fd;
+       return 0;
+}
+
+/*
+ * Connect to the control socket of tapdisk `id`.
+ *
+ * *sfd is preset to -1. Returns -EINVAL for a negative id, -ENOMEM if the
+ * socket path cannot be built, otherwise the result of tap_ctl_connect().
+ */
+int
+tap_ctl_connect_id(int id, int *sfd)
+{
+       char *path;
+       int rc;
+
+       *sfd = -1;
+
+       if (id < 0) {
+               EPRINTF("invalid id %d\n", id);
+               return -EINVAL;
+       }
+
+       path = tap_ctl_socket_name(id);
+       if (path == NULL) {
+               EPRINTF("couldn't name socket for %d\n", id);
+               return -ENOMEM;
+       }
+
+       rc = tap_ctl_connect(path, sfd);
+
+       /* The path is only needed for the connect call itself. */
+       free(path);
+       return rc;
+}
+
+/*
+ * One-shot request/response: connect to tapdisk `id`, exchange `message`
+ * (the reply overwrites it), and close the socket again.
+ *
+ * Returns the connect error if the connection fails, otherwise the result
+ * of the exchange.
+ */
+int
+tap_ctl_connect_send_and_receive(int id, tapdisk_message_t *message, int timeout)
+{
+       int sock, rc;
+
+       rc = tap_ctl_connect_id(id, &sock);
+       if (rc == 0) {
+               rc = tap_ctl_send_and_receive(sock, message, timeout);
+               close(sock);
+       }
+
+       return rc;
+}
diff --git a/tools/blktap2/control/tap-ctl-list.c b/tools/blktap2/control/tap-ctl-list.c
new file mode 100644 (file)
index 0000000..f8d49c3
--- /dev/null
@@ -0,0 +1,536 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <glob.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+#include "list.h"
+
+/*
+ * Release one list entry together with the type and path strings it owns.
+ * Unset (NULL) strings are fine: free(NULL) is a no-op.
+ */
+static void
+free_list(tap_list_t *entry)
+{
+       free(entry->type);
+       free(entry->path);
+       free(entry);
+}
+
+/*
+ * Split a "<type>:<path>" parameter string at its first ':'.
+ *
+ * On success *type and *path receive newly allocated copies of the two
+ * halves (the caller frees them) and 0 is returned. Only the first colon
+ * separates; later colons stay in the path.
+ *
+ * Returns -EINVAL if `params` contains no ':' (outputs untouched), or
+ * -ENOMEM if either copy cannot be allocated (both outputs set to NULL).
+ */
+int
+_parse_params(const char *params, char **type, char **path)
+{
+       const char *sep;
+       size_t len;
+
+       sep = strchr(params, ':');
+       if (!sep)
+               return -EINVAL;
+
+       len = sep - params;
+
+       *type = strndup(params, len);
+       *path = strdup(sep + 1);
+
+       if (!*type || !*path) {
+               free(*type);
+               *type = NULL;
+
+               free(*path);
+               *path = NULL;
+
+               /*
+                * Report the failure explicitly: errno may have been changed
+                * by the calls above, and a zero value here would read as
+                * success.
+                */
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+/*
+ * Fill in a list entry from the given tapdisk/VBD values.
+ *
+ * When `params` is non-NULL it is split into the entry's type and path via
+ * _parse_params() and that call's result is returned; otherwise the string
+ * fields are left alone and 0 is returned.
+ */
+static int
+init_list(tap_list_t *entry,
+         int tap_id, pid_t tap_pid, int vbd_minor, int vbd_state,
+         const char *params)
+{
+       entry->id     = tap_id;
+       entry->pid    = tap_pid;
+       entry->minor  = vbd_minor;
+       entry->state  = vbd_state;
+
+       if (!params)
+               return 0;
+
+       return _parse_params(params, &entry->type, &entry->path);
+}
+
+/*
+ * Free a NULL-terminated array of list entries, including every entry and
+ * the array itself. Passing NULL is a no-op, mirroring free().
+ */
+void
+tap_ctl_free_list(tap_list_t **list)
+{
+       tap_list_t **_entry;
+
+       if (!list)
+               return;
+
+       for (_entry = list; *_entry != NULL; ++_entry)
+               free_list(*_entry);
+
+       free(list);
+}
+
+/*
+ * Allocate a NULL-terminated array of `n` zero-initialised list entries.
+ *
+ * Returns the array (to be released with tap_ctl_free_list()), or NULL if
+ * `n` is negative or any allocation fails; a partially built array is freed
+ * before returning.
+ */
+static tap_list_t**
+tap_ctl_alloc_list(int n)
+{
+       tap_list_t **list;
+       int i;
+
+       /*
+        * A negative count would yield an array with no room for the NULL
+        * terminator that tap_ctl_free_list() relies on.
+        */
+       if (n < 0)
+               return NULL;
+
+       /* n entries plus the terminating NULL slot, all zeroed. */
+       list = calloc((size_t)n + 1, sizeof(*list));
+       if (!list)
+               return NULL;
+
+       for (i = 0; i < n; ++i) {
+               list[i] = calloc(1, sizeof(*list[i]));
+               if (!list[i]) {
+                       /* Slots from i onward are still NULL. */
+                       tap_ctl_free_list(list);
+                       return NULL;
+               }
+       }
+
+       return list;
+}
+
+/*
+ * Count the entries of a NULL-terminated list (the terminator itself is not
+ * counted).
+ */
+static int
+tap_ctl_list_length(const tap_list_t **list)
+{
+       int count = 0;
+
+       while (list[count] != NULL)
+               count++;
+
+       return count;
+}
+
+/*
+ * qsort() comparator ordering ints ascending.
+ *
+ * Compares rather than subtracts so that widely separated values cannot
+ * overflow; returns -1, 0 or 1.
+ */
+static int
+_tap_minor_cmp(const void *a, const void *b)
+{
+       const int x = *(const int *)a;
+       const int y = *(const int *)b;
+
+       return (x > y) - (x < y);
+}
+
+/*
+ * Collect the minor numbers of all blktap devices found under
+ * BLKTAP2_SYSFS_DIR (entries matching "blktap<N>").
+ *
+ * On success *_minorv receives a malloc()ed array of minors sorted
+ * ascending (NULL when nothing matched) and the number of minors is
+ * returned; the caller frees the array. On failure *_minorv is NULL and a
+ * negative errno value is returned.
+ */
+int
+_tap_ctl_find_minors(int **_minorv)
+{
+       glob_t glbuf = { 0 };
+       const char *pattern, *format;
+       int *minorv = NULL, n_minors = 0;
+       int err;
+       size_t i;
+
+       pattern = BLKTAP2_SYSFS_DIR"/blktap*";
+       format  = BLKTAP2_SYSFS_DIR"/blktap%d";
+
+       *_minorv = NULL;
+
+       err = glob(pattern, 0, NULL, &glbuf);
+       switch (err) {
+       case GLOB_NOMATCH:
+               /* No devices at all is not an error. */
+               err = 0;
+               goto out;
+
+       case GLOB_ABORTED:
+       case GLOB_NOSPACE:
+               /*
+                * glob() is not required to set errno, so fall back to a
+                * fixed code rather than risk reporting 0 (success).
+                */
+               if (errno)
+                       err = -errno;
+               else
+                       err = (err == GLOB_NOSPACE) ? -ENOMEM : -EIO;
+               EPRINTF("%s: glob failed, err %d", pattern, err);
+               goto out;
+
+       default:
+               break;
+       }
+
+       minorv = malloc(sizeof(int) * glbuf.gl_pathc);
+       if (!minorv) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       /* Keep only the paths whose suffix parses as a minor number. */
+       for (i = 0; i < glbuf.gl_pathc; ++i) {
+               int n;
+
+               n = sscanf(glbuf.gl_pathv[i], format, &minorv[n_minors]);
+               if (n != 1)
+                       continue;
+
+               n_minors++;
+       }
+
+       qsort(minorv, n_minors, sizeof(int), _tap_minor_cmp);
+
+       *_minorv = minorv;
+       err = 0;
+
+out:
+       if (glbuf.gl_pathv)
+               globfree(&glbuf);
+
+       return err ? err : n_minors;
+}
+
+/*
+ * One tapdisk process discovered by _tap_ctl_find_tapdisks().
+ */
+struct tapdisk {
+       /* id parsed from the trailing number of the control socket path */
+       int    id;
+       /* process id, as returned by tap_ctl_get_pid(id) */
+       pid_t  pid;
+       /* list head, initialised empty with INIT_LIST_HEAD() */
+       struct list_head list;
+};
+
+/*
+ * qsort() comparator ordering struct tapdisk records by ascending id.
+ *
+ * Compares rather than subtracts so that widely separated ids cannot
+ * overflow; returns -1, 0 or 1.
+ */
+static int
+_tap_tapdisk_cmp(const void *a, const void *b)
+{
+       const int x = ((const struct tapdisk *)a)->id;
+       const int y = ((const struct tapdisk *)b)->id;
+
+       return (x > y) - (x < y);
+}
+
+/*
+ * Discover running tapdisks from the control sockets found under
+ * BLKTAP2_CONTROL_DIR.
+ *
+ * A socket contributes a record only if its name ends in a parsable id and
+ * tap_ctl_get_pid() returns a non-negative pid for that id.
+ *
+ * On success *_tapv receives a malloc()ed array sorted by ascending id, each
+ * record's list head initialised empty (NULL when no socket matched), and
+ * the number of records is returned; the caller frees the array. On failure
+ * *_tapv is NULL and a negative errno value is returned.
+ */
+int
+_tap_ctl_find_tapdisks(struct tapdisk **_tapv)
+{
+       glob_t glbuf = { 0 };
+       const char *pattern, *format;
+       struct tapdisk *tapv = NULL;
+       int err, n_taps = 0;
+       size_t i;
+
+       pattern = BLKTAP2_CONTROL_DIR"/"BLKTAP2_CONTROL_SOCKET"*";
+       format  = BLKTAP2_CONTROL_DIR"/"BLKTAP2_CONTROL_SOCKET"%d";
+
+       *_tapv = NULL;
+
+       err = glob(pattern, 0, NULL, &glbuf);
+       switch (err) {
+       case GLOB_NOMATCH:
+               /* No sockets at all is not an error. */
+               err = 0;
+               goto out;
+
+       case GLOB_ABORTED:
+       case GLOB_NOSPACE:
+               /*
+                * glob() is not required to set errno, so fall back to a
+                * fixed code rather than risk reporting 0 (success).
+                */
+               if (errno)
+                       err = -errno;
+               else
+                       err = (err == GLOB_NOSPACE) ? -ENOMEM : -EIO;
+               EPRINTF("%s: glob failed, err %d", pattern, err);
+               goto out;
+
+       default:
+               break;
+       }
+
+       tapv = malloc(sizeof(struct tapdisk) * glbuf.gl_pathc);
+       if (!tapv) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < glbuf.gl_pathc; ++i) {
+               struct tapdisk *tap;
+
+               /* Slot is reused until a path yields a valid record. */
+               tap = &tapv[n_taps];
+
+               if (sscanf(glbuf.gl_pathv[i], format, &tap->id) != 1)
+                       continue;
+
+               tap->pid = tap_ctl_get_pid(tap->id);
+               if (tap->pid < 0)
+                       continue;
+
+               n_taps++;
+       }
+
+       qsort(tapv, n_taps, sizeof(struct tapdisk), _tap_tapdisk_cmp);
+
+       /* List heads are self-referential, so set them up after sorting. */
+       for (i = 0; i < (size_t)n_taps; ++i)
+               INIT_LIST_HEAD(&tapv[i].list);
+
+       *_tapv = tapv;
+       err = 0;
+
+out:
+       if (glbuf.gl_pathv)
+               globfree(&glbuf);
+
+       return err ? err : n_taps;
+}
+
+/* One record received from a tapdisk in reply to TAPDISK_MESSAGE_LIST. */
+struct tapdisk_list {
+       int  minor;               /* copied from message.u.list.minor */
+       int  state;               /* copied from message.u.list.state */
+       char *params;             /* strndup()ed message.u.list.path, NULL if empty */
+       struct list_head entry;   /* link in the list this record belongs to */
+};
+
+/*
+ * Ask tapdisk @id for its list of images and append them to @_list.
+ *
+ * Sends TAPDISK_MESSAGE_LIST and reads replies until one arrives with
+ * u.list.count == 0.  Every other reply becomes a malloc()ed
+ * struct tapdisk_list.
+ *
+ * Returns 0 on success.  On failure returns a non-zero error (the
+ * connect/write error, -EPROTO if reading a reply fails, -ENOMEM or
+ * -errno on allocation failure); in that case all records gathered so
+ * far are freed and @_list is left unchanged.  The control socket is
+ * closed on every path.
+ */
+int
+_tap_ctl_list_tapdisk(int id, struct list_head *_list)
+{
+       tapdisk_message_t message;
+       struct list_head list;
+       struct tapdisk_list *tl, *next;
+       int err, sfd;
+
+       err = tap_ctl_connect_id(id, &sfd);
+       if (err)
+               return err;
+
+       memset(&message, 0, sizeof(message));
+       message.type   = TAPDISK_MESSAGE_LIST;
+       message.cookie = -1;
+
+       err = tap_ctl_write_message(sfd, &message, 2);
+       if (err)
+               goto out;       /* was a bare return that leaked sfd */
+
+       INIT_LIST_HEAD(&list);
+       for (;;) {
+               err = tap_ctl_read_message(sfd, &message, 2);
+               if (err) {
+                       err = -EPROTO;
+                       break;
+               }
+
+               /* a zero count marks the end of the listing */
+               if (message.u.list.count == 0)
+                       break;
+
+               tl = malloc(sizeof(*tl));
+               if (!tl) {
+                       err = -ENOMEM;
+                       break;
+               }
+
+               tl->minor  = message.u.list.minor;
+               tl->state  = message.u.list.state;
+               tl->params = NULL;
+
+               if (message.u.list.path[0] != 0) {
+                       tl->params = strndup(message.u.list.path,
+                                            sizeof(message.u.list.path));
+                       if (!tl->params) {
+                               err = -errno;
+                               /* not on the list yet, so free it here */
+                               free(tl);
+                               break;
+                       }
+               }
+
+               list_add(&tl->entry, &list);
+       }
+
+       if (err)
+               list_for_each_entry_safe(tl, next, &list, entry) {
+                       list_del(&tl->entry);
+                       free(tl->params);
+                       free(tl);
+               }
+
+       list_splice(&list, _list);
+
+out:
+       close(sfd);
+       return err;
+}
+
+/*
+ * Release an array of @n_taps tapdisks: every struct tapdisk_list hanging
+ * off each element (together with its params string), then the array
+ * itself.
+ */
+void
+_tap_ctl_free_tapdisks(struct tapdisk *tapv, int n_taps)
+{
+       int i;
+
+       for (i = 0; i < n_taps; i++) {
+               struct tapdisk_list *cur, *tmp;
+
+               /* no list_del(): the whole list is being discarded */
+               list_for_each_entry_safe(cur, tmp, &tapv[i].list, entry) {
+                       free(cur->params);
+                       free(cur);
+               }
+       }
+
+       free(tapv);
+}
+
+/*
+ * Hand out the next unused pre-allocated entry of a NULL-terminated list
+ * and advance the cursor, or return NULL once the terminator is reached.
+ */
+static tap_list_t *
+_tap_list_take_slot(tap_list_t ***cursor)
+{
+       tap_list_t *slot = **cursor;
+
+       if (slot)
+               (*cursor)++;
+
+       return slot;
+}
+
+/*
+ * Merge the sysfs minors and the per-tapdisk listings into one
+ * NULL-terminated tap_list_t array stored in *_list.
+ *
+ * The result contains, in order:
+ *   - for every tapdisk, either one entry per image it reported, or a
+ *     single entry with minor/state -1 if it reported none ("orphaned
+ *     tapdisk");
+ *   - one entry with id/pid -1 for every minor that no tapdisk reported
+ *     ("orphaned minor").
+ *
+ * @minorv is modified: minors claimed by a tapdisk are overwritten
+ * with -1.
+ *
+ * The list is allocated with n_minors + n_taps entries.  That is enough
+ * as long as every reported minor also appears in @minorv; if the two
+ * snapshots disagree, the list could run out of entries, in which case
+ * -ENOSPC is returned instead of walking past the terminator.
+ * NOTE(review): assumes tap_ctl_alloc_list(n) yields n usable entries
+ * followed by NULL, as the trimming loop below implies -- confirm.
+ *
+ * Returns 0 on success, a negative errno value on failure (the partially
+ * built list is freed).
+ */
+int
+_tap_list_join3(int n_minors, int *minorv, int n_taps, struct tapdisk *tapv,
+               tap_list_t ***_list)
+{
+       tap_list_t **list, **_entry, *slot;
+       int i, _m, err;
+
+       list = tap_ctl_alloc_list(n_minors + n_taps);
+       if (!list) {
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       _entry = list;
+
+       for (i = 0; i < n_taps; ++i) {
+               struct tapdisk *tap = &tapv[i];
+               struct tapdisk_list *tl;
+
+               /* orphaned tapdisk */
+               if (list_empty(&tap->list)) {
+                       slot = _tap_list_take_slot(&_entry);
+                       if (!slot) {
+                               err = -ENOSPC;
+                               goto fail;
+                       }
+
+                       err = init_list(slot, tap->id, tap->pid, -1, -1, NULL);
+                       if (err)
+                               goto fail;
+                       continue;
+               }
+
+               list_for_each_entry(tl, &tap->list, entry) {
+                       slot = _tap_list_take_slot(&_entry);
+                       if (!slot) {
+                               err = -ENOSPC;
+                               goto fail;
+                       }
+
+                       err = init_list(slot,
+                                       tap->id, tap->pid,
+                                       tl->minor, tl->state, tl->params);
+                       if (err)
+                               goto fail;
+
+                       if (tl->minor >= 0) {
+                               /* clear minor */
+                               for (_m = 0; _m < n_minors; ++_m) {
+                                       if (minorv[_m] == tl->minor) {
+                                               minorv[_m] = -1;
+                                               break;
+                                       }
+                               }
+                       }
+               }
+       }
+
+       /* orphaned minors */
+       for (_m = 0; _m < n_minors; ++_m) {
+               int minor = minorv[_m];
+
+               if (minor < 0)
+                       continue;
+
+               slot = _tap_list_take_slot(&_entry);
+               if (!slot) {
+                       err = -ENOSPC;
+                       goto fail;
+               }
+
+               err = init_list(slot, -1, -1, minor, -1, NULL);
+               if (err)
+                       goto fail;
+       }
+
+       /* free extraneous list entries */
+       for (; *_entry != NULL; ++_entry) {
+               free_list(*_entry);
+               *_entry = NULL;
+       }
+
+       *_list = list;
+
+       return 0;
+
+fail:
+       if (list)
+               tap_ctl_free_list(list);
+
+       return err;
+}
+
+/*
+ * Build the complete list of blktap minors and tapdisk processes.
+ *
+ * Gathers the minors from sysfs and the tapdisks from their control
+ * sockets, queries every tapdisk for its images, and joins the results
+ * into a NULL-terminated array stored in *list.  The caller releases it
+ * with tap_ctl_free_list().
+ *
+ * Returns 0 on success or a non-zero error, in which case *list is not
+ * set by this function.
+ */
+int
+tap_ctl_list(tap_list_t ***list)
+{
+       int n_taps = -1, n_minors = -1, err, i;
+       int *minorv = NULL;
+       struct tapdisk *tapv = NULL;
+
+       err = n_minors = _tap_ctl_find_minors(&minorv);
+       if (err < 0)
+               goto out;
+
+       err = n_taps = _tap_ctl_find_tapdisks(&tapv);
+       if (err < 0)
+               goto out;
+
+       for (i = 0; i < n_taps; ++i) {
+               err = _tap_ctl_list_tapdisk(tapv[i].id, &tapv[i].list);
+               if (err)
+                       goto out;
+       }
+
+       err = _tap_list_join3(n_minors, minorv, n_taps, tapv, list);
+
+out:
+       /*
+        * The finders can return an allocated array together with a count
+        * of 0 (paths matched but none was usable), so key the cleanup on
+        * "the call succeeded", not on "count > 0".  On failure they free
+        * their own array, hence the count checks.
+        */
+       if (n_taps >= 0 && tapv)
+               _tap_ctl_free_tapdisks(tapv, n_taps);
+
+       if (n_minors >= 0)
+               free(minorv);
+
+       return err;
+}
+
+/*
+ * Look up the first listed tap matching @type and/or @path.
+ *
+ * A NULL @type or @path acts as a wildcard; a non-NULL one only matches
+ * entries that have the corresponding string set and equal to it.
+ *
+ * On a match the entry is copied into *tap and 0 is returned.  The copy
+ * has its type and path pointers cleared, because the strings they
+ * pointed to are released with the list before returning.  Returns
+ * -ENOENT if nothing matches, or the error from tap_ctl_list().
+ */
+int
+tap_ctl_find(const char *type, const char *path, tap_list_t *tap)
+{
+       tap_list_t **list;
+       int i, err, found = -ENOENT;
+
+       err = tap_ctl_list(&list);
+       if (err)
+               return err;
+
+       for (i = 0; list[i] != NULL; i++) {
+               const tap_list_t *cand = list[i];
+               int type_ok = !type ||
+                       (cand->type && strcmp(cand->type, type) == 0);
+               int path_ok = !path ||
+                       (cand->path && strcmp(cand->path, path) == 0);
+
+               if (!type_ok || !path_ok)
+                       continue;
+
+               *tap = *cand;
+               tap->path = NULL;
+               tap->type = NULL;
+               found = 0;
+               break;
+       }
+
+       tap_ctl_free_list(list);
+
+       return found;
+}
diff --git a/tools/blktap2/control/tap-ctl-major.c b/tools/blktap2/control/tap-ctl-major.c
new file mode 100644 (file)
index 0000000..847af28
--- /dev/null
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Look up the major number registered for "tapdev" in /proc/devices.
+ *
+ * Returns the major number on success, -errno if /proc/devices cannot be
+ * opened, or -ENODEV if no line of the form "<number> tapdev..." exists.
+ *
+ * The result is only taken from a line that actually matched.  Earlier
+ * code returned whatever number the last scanned line happened to start
+ * with when no tapdev line existed, and an uninitialised value for an
+ * empty file.
+ */
+int
+tap_ctl_blk_major(void)
+{
+       FILE *devices;
+       char buf[32];
+       int rv = -ENODEV;
+
+       devices = fopen("/proc/devices", "r");
+       if (!devices)
+               return -errno;
+
+       while (fgets(buf, sizeof(buf), devices)) {
+               int major, offset = 0;
+
+               /* %n is only stored once the literal "tapdev" has matched */
+               if (sscanf(buf, "%d tapdev%n", &major, &offset) == 1 &&
+                   offset) {
+                       rv = major;
+                       break;
+               }
+       }
+
+       fclose(devices);
+
+       return rv;
+}
diff --git a/tools/blktap2/control/tap-ctl-open.c b/tools/blktap2/control/tap-ctl-open.c
new file mode 100644 (file)
index 0000000..5961c99
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+#include "blktaplib.h"
+
+/*
+ * Ask tapdisk @id to open the image described by @params on @minor.
+ *
+ * Returns 0 on success.  Error values keep this function's existing
+ * mixed convention: positive EINVAL/ENAMETOOLONG for local argument or
+ * protocol problems, the negated u.response.error for a
+ * TAPDISK_MESSAGE_ERROR reply, and whatever
+ * tap_ctl_connect_send_and_receive() returns if the exchange fails.
+ */
+int
+tap_ctl_open(const int id, const int minor, const char *params)
+{
+       int err;
+       tapdisk_message_t message;
+
+       /* "%s" with a NULL argument is undefined behaviour */
+       if (!params)
+               return EINVAL;
+
+       memset(&message, 0, sizeof(message));
+       message.type = TAPDISK_MESSAGE_OPEN;
+       message.cookie = minor;
+       message.u.params.storage = TAPDISK_STORAGE_TYPE_DEFAULT;
+       message.u.params.devnum = minor;
+
+       /*
+        * Pass the real buffer size so the truncation test below is exact.
+        * With "size - 1" a path of exactly size - 1 characters lost its
+        * last character without tripping the check.
+        */
+       err = snprintf(message.u.params.path,
+                      sizeof(message.u.params.path), "%s", params);
+       if (err < 0 || (size_t)err >= sizeof(message.u.params.path)) {
+               EPRINTF("name too long\n");
+               return ENAMETOOLONG;
+       }
+
+       err = tap_ctl_connect_send_and_receive(id, &message, 5);
+       if (err)
+               return err;
+
+       switch (message.type) {
+       case TAPDISK_MESSAGE_OPEN_RSP:
+               break;
+       case TAPDISK_MESSAGE_ERROR:
+               err = -message.u.response.error;
+               EPRINTF("open failed, err %d\n", err);
+               break;
+       default:
+               EPRINTF("got unexpected result '%s' from %d\n",
+                       tapdisk_message_name(message.type), id);
+               err = EINVAL;
+       }
+
+       return err;
+}
diff --git a/tools/blktap2/control/tap-ctl-pause.c b/tools/blktap2/control/tap-ctl-pause.c
new file mode 100644 (file)
index 0000000..5e31a58
--- /dev/null
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Send TAPDISK_MESSAGE_PAUSE for @minor to tapdisk @id.
+ *
+ * Returns the error of the message exchange if it fails, EINVAL if the
+ * reply is not TAPDISK_MESSAGE_PAUSE_RSP, and otherwise the error field
+ * carried by the reply.
+ */
+int
+tap_ctl_pause(const int id, const int minor)
+{
+       tapdisk_message_t msg;
+       int rc;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.type   = TAPDISK_MESSAGE_PAUSE;
+       msg.cookie = minor;
+
+       rc = tap_ctl_connect_send_and_receive(id, &msg, 5);
+       if (rc)
+               return rc;
+
+       if (msg.type != TAPDISK_MESSAGE_PAUSE_RSP) {
+               EPRINTF("got unexpected result '%s' from %d\n",
+                       tapdisk_message_name(msg.type), id);
+               return EINVAL;
+       }
+
+       return msg.u.response.error;
+}
diff --git a/tools/blktap2/control/tap-ctl-spawn.c b/tools/blktap2/control/tap-ctl-spawn.c
new file mode 100644 (file)
index 0000000..31a651e
--- /dev/null
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/wait.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Fork and exec a tapdisk2 process whose stdout and stderr are connected
+ * to a pipe.
+ *
+ * The binary is taken from the TAPDISK2 environment variable, falling
+ * back to "tapdisk2" looked up through PATH.
+ *
+ * In the parent, returns the child's pid and stores the read end of the
+ * pipe in *readfd (owned by the caller), or returns -errno if pipe() or
+ * fork() fails, in which case no descriptor is left open.  The child
+ * never returns from this function.
+ */
+static pid_t
+__tap_ctl_spawn(int *readfd)
+{
+       int err, channel[2];
+       pid_t child;
+       const char *tapdisk;
+
+       if (pipe(channel)) {
+               /* latch errno: logging may overwrite it */
+               err = errno;
+               EPRINTF("pipe failed: %d\n", err);
+               return -err;
+       }
+
+       child = fork();
+       if (child == -1) {
+               err = errno;
+               EPRINTF("fork failed: %d\n", err);
+               close(channel[0]);
+               close(channel[1]);
+               return -err;
+       }
+
+       if (child) {
+               close(channel[1]);
+               *readfd = channel[0];
+               return child;
+       }
+
+       /* child: route stdout and stderr into the pipe */
+       if (dup2(channel[1], STDOUT_FILENO) == -1) {
+               err = errno;
+               EPRINTF("dup2 failed: %d\n", err);
+               exit(err);
+       }
+
+       if (dup2(channel[1], STDERR_FILENO) == -1) {
+               err = errno;
+               EPRINTF("dup2 failed: %d\n", err);
+               exit(err);
+       }
+
+       close(channel[0]);
+       close(channel[1]);
+
+       tapdisk = getenv("TAPDISK2");
+       if (!tapdisk)
+               tapdisk = "tapdisk2";
+
+       /* the variadic terminator must be a null *pointer*, hence the cast */
+       execlp(tapdisk, tapdisk, (char *)NULL);
+
+       EPRINTF("exec failed\n");
+       exit(1);
+}
+
+/*
+ * Query tapdisk @id for its process id with TAPDISK_MESSAGE_PID.
+ *
+ * Returns the pid from the reply, or the error reported by
+ * tap_ctl_connect_send_and_receive() if the exchange fails.
+ */
+pid_t
+tap_ctl_get_pid(const int id)
+{
+       tapdisk_message_t msg;
+       int rc;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.type = TAPDISK_MESSAGE_PID;
+
+       rc = tap_ctl_connect_send_and_receive(id, &msg, 2);
+       if (rc != 0)
+               return rc;
+
+       return msg.u.tapdisk_pid;
+}
+
+/*
+ * Wait for @child to terminate and translate its status.
+ *
+ * Returns -errno if waitpid() fails, the negated exit code if the child
+ * exited (so 0 for a clean exit), -EINTR if it was killed by a signal,
+ * and -EAGAIN for any other status.
+ */
+static int
+tap_ctl_wait(pid_t child)
+{
+       int status;
+
+       if (waitpid(child, &status, 0) < 0) {
+               EPRINTF("wait(%d) failed, err %d\n", child, errno);
+               return -errno;
+       }
+
+       if (WIFEXITED(status)) {
+               const int code = WEXITSTATUS(status);
+
+               if (code != 0)
+                       EPRINTF("tapdisk2[%d] failed, status %d\n", child, code);
+               return -code;
+       }
+
+       if (WIFSIGNALED(status)) {
+               const int signo = WTERMSIG(status);
+
+               EPRINTF("tapdisk2[%d] killed by signal %d\n", child, signo);
+               return -EINTR;
+       }
+
+       EPRINTF("tapdisk2[%d]: unexpected status %#x\n", child, status);
+       return -EAGAIN;
+}
+
+/*
+ * Read the control socket path printed by a freshly spawned tapdisk from
+ * @readfd and extract the id that follows
+ * BLKTAP2_CONTROL_DIR "/" BLKTAP2_CONTROL_SOCKET.
+ *
+ * Takes ownership of @readfd: it is closed on every path.
+ *
+ * Returns the id, or -1 with errno describing the failure (EINVAL if the
+ * output did not parse and no other error was recorded).  errno is
+ * restored after cleanup because the caller reports it.
+ */
+static int
+tap_ctl_get_child_id(int readfd)
+{
+       int id, saved_errno;
+       FILE *f;
+
+       f = fdopen(readfd, "r");
+       if (!f) {
+               saved_errno = errno;
+               EPRINTF("fdopen failed: %d\n", saved_errno);
+               /* no stream took over the descriptor, so close it here */
+               close(readfd);
+               errno = saved_errno;
+               return -1;
+       }
+
+       errno = 0;
+       if (fscanf(f, BLKTAP2_CONTROL_DIR"/"
+                  BLKTAP2_CONTROL_SOCKET"%d", &id) != 1) {
+               saved_errno = (errno ? : EINVAL);
+               EPRINTF("parsing id failed: %d\n", saved_errno);
+               id = -1;
+       } else
+               saved_errno = errno;
+
+       fclose(f);
+       errno = saved_errno;
+
+       return id;
+}
+
+/*
+ * Spawn a new tapdisk2 and return the id it announces on its output.
+ *
+ * The spawned process is waited for first, then its output is read from
+ * the pipe set up by __tap_ctl_spawn().
+ *
+ * Returns the id, a negative value from __tap_ctl_spawn() or
+ * tap_ctl_wait() if spawning or waiting fails, or -1 (with errno set) if
+ * the id cannot be read.
+ */
+int
+tap_ctl_spawn(void)
+{
+       pid_t child;
+       int err, id, readfd;
+
+       readfd = -1;
+
+       child = __tap_ctl_spawn(&readfd);
+       if (child < 0)
+               return child;
+
+       err = tap_ctl_wait(child);
+       if (err) {
+               /* nobody will read the pipe any more; don't leak it */
+               close(readfd);
+               return err;
+       }
+
+       /* tap_ctl_get_child_id() consumes readfd */
+       id = tap_ctl_get_child_id(readfd);
+       if (id < 0)
+               EPRINTF("get_id failed, child %d err %d\n", child, errno);
+
+       return id;
+}
diff --git a/tools/blktap2/control/tap-ctl-unpause.c b/tools/blktap2/control/tap-ctl-unpause.c
new file mode 100644 (file)
index 0000000..dfb7450
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Send TAPDISK_MESSAGE_RESUME for @minor to tapdisk @id, optionally with
+ * a new parameter string.
+ *
+ * @params may be NULL, in which case the path field stays empty.  A
+ * longer string is cut to fit the path field; the field stays
+ * NUL-terminated because the message is zeroed first and the last byte
+ * is never written.
+ *
+ * Returns the error of the message exchange if it fails, EINVAL if the
+ * reply is not TAPDISK_MESSAGE_RESUME_RSP, and otherwise the error field
+ * carried by the reply.
+ */
+int
+tap_ctl_unpause(const int id, const int minor, const char *params)
+{
+       tapdisk_message_t msg;
+       int rc;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.type   = TAPDISK_MESSAGE_RESUME;
+       msg.cookie = minor;
+
+       if (params != NULL)
+               strncpy(msg.u.params.path, params,
+                       sizeof(msg.u.params.path) - 1);
+
+       rc = tap_ctl_connect_send_and_receive(id, &msg, 15);
+       if (rc)
+               return rc;
+
+       if (msg.type != TAPDISK_MESSAGE_RESUME_RSP) {
+               EPRINTF("got unexpected result '%s' from %d\n",
+                       tapdisk_message_name(msg.type), id);
+               return EINVAL;
+       }
+
+       return msg.u.response.error;
+}
diff --git a/tools/blktap2/control/tap-ctl.c b/tools/blktap2/control/tap-ctl.c
new file mode 100644 (file)
index 0000000..e254f07
--- /dev/null
@@ -0,0 +1,815 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+
+/* Signature shared by the tap_cli_* handlers: (argc, argv) -> status. */
+typedef int (*tap_ctl_func_t) (int, char **);
+
+/* Pairs a command name with the handler that implements it. */
+struct command {
+       char                     *name;
+       tap_ctl_func_t            func;
+};
+
+/* Write the one-line synopsis of the "list" command to @stream. */
+static void
+tap_cli_list_usage(FILE *stream)
+{
+       static const char synopsis[] =
+               "usage: list [-h] [-p pid] [-m minor] [-t type] [-f file]\n";
+
+       fputs(synopsis, stream);
+}
+
+/*
+ * Print one list entry as an aligned, human-readable row:
+ * pid, minor, state (hexadecimal), type and path.  Fields whose value is
+ * -1 (numbers) or NULL (strings) are shown as "-".
+ */
+static void
+tap_ctl_list_row(tap_list_t *entry)
+{
+       /*
+        * 12 bytes hold any 32-bit int in decimal ("-2147483648" plus NUL).
+        * The previous 10-byte buffers filled with sprintf() could
+        * overflow.  snprintf() bounds the write regardless.
+        */
+       char minor_str[12] = "-";
+       char state_str[12] = "-";
+       char pid_str[12]   = "-";
+
+       if (entry->pid != -1)
+               snprintf(pid_str, sizeof(pid_str), "%d", entry->pid);
+
+       if (entry->minor != -1)
+               snprintf(minor_str, sizeof(minor_str), "%d", entry->minor);
+
+       if (entry->state != -1)
+               snprintf(state_str, sizeof(state_str), "%x", entry->state);
+
+       printf("%8s %2s %4s %10s %s\n",
+              pid_str, minor_str, state_str,
+              entry->type ? : "-", entry->path ? : "-");
+}
+
+/*
+ * Print one list entry as space-separated key=value pairs on a single
+ * line, for machine consumption.  Only fields that are set are emitted:
+ * pid, minor and state when not -1, and args=type:path when both strings
+ * are present.
+ */
+static void
+tap_ctl_list_dict(tap_list_t *entry)
+{
+       /* becomes " " once the first pair is out */
+       const char *sep = "";
+
+       if (entry->pid != -1) {
+               printf("%spid=%d", sep, entry->pid);
+               sep = " ";
+       }
+
+       if (entry->minor != -1) {
+               printf("%sminor=%d", sep, entry->minor);
+               sep = " ";
+       }
+
+       if (entry->state != -1) {
+               printf("%sstate=%d", sep, entry->state);
+               sep = " ";
+       }
+
+       if (entry->type && entry->path)
+               printf("%sargs=%s:%s", sep, entry->type, entry->path);
+
+       putc('\n', stdout);
+}
+
+/*
+ * "list" command: print all known taps, optionally filtered.
+ *
+ * Options: -m minor, -p pid, -t type, -f file restrict the output to
+ * matching entries; -h prints the usage to stdout.  Rows are printed in
+ * tabular form when stdout is a terminal and as key=value pairs
+ * otherwise.
+ *
+ * Returns 0 on success, EINVAL on a bad option, or the negated error of
+ * tap_ctl_list().
+ */
+int
+tap_cli_list(int argc, char **argv)
+{
+       tap_list_t **list, **_entry;
+       int c, minor, tty, err;
+       const char *type, *file;
+       pid_t pid;
+
+       pid   = -1;
+       minor = -1;
+       type  = NULL;
+       file  = NULL;
+
+       /* restart option parsing, as every other tap_cli_* command does */
+       optind = 0;
+       while ((c = getopt(argc, argv, "m:p:t:f:h")) != -1) {
+               switch (c) {
+               case 'm':
+                       minor = atoi(optarg);
+                       break;
+               case 'p':
+                       pid = atoi(optarg);
+                       break;
+               case 't':
+                       type = optarg;
+                       break;
+               case 'f':
+                       file = optarg;
+                       break;
+               case '?':
+                       goto usage;
+               case 'h':
+                       tap_cli_list_usage(stdout);
+                       return 0;
+               }
+       }
+
+       /*
+        * Fetch the list only once the arguments are known to be good.
+        * Fetching it first leaked it on the -h and usage exits.
+        */
+       err = tap_ctl_list(&list);
+       if (err)
+               return -err;
+
+       tty = isatty(STDOUT_FILENO);
+
+       for (_entry = list; *_entry != NULL; ++_entry) {
+               tap_list_t *entry  = *_entry;
+
+               if (minor >= 0 && entry->minor != minor)
+                       continue;
+
+               if (pid >= 0 && entry->pid != pid)
+                       continue;
+
+               if (type && (!entry->type || strcmp(entry->type, type)))
+                       continue;
+
+               if (file && (!entry->path || strcmp(entry->path, file)))
+                       continue;
+
+               if (tty)
+                       tap_ctl_list_row(entry);
+               else
+                       tap_ctl_list_dict(entry);
+       }
+
+       tap_ctl_free_list(list);
+
+       return 0;
+
+usage:
+       tap_cli_list_usage(stderr);
+       return EINVAL;
+}
+
+/*
+ * Write the one-line synopsis of the "allocate" command to @stream.
+ * (The text used to end in a stray '>' after the optional -d argument.)
+ */
+static void
+tap_cli_allocate_usage(FILE *stream)
+{
+       fprintf(stream, "usage: allocate [-d device name]\n");
+}
+
+/*
+ * "allocate" command: call tap_ctl_allocate() and, on success, print the
+ * resulting device name.
+ *
+ * Options: -d gives a device name to pass in, -h prints the usage.
+ * Returns 0 on success, EINVAL on a bad option, or the error from
+ * tap_ctl_allocate().
+ */
+static int
+tap_cli_allocate(int argc, char **argv)
+{
+       char *devname = NULL;
+       int opt, minor, rc;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "d:h")) != -1) {
+               if (opt == 'd') {
+                       devname = optarg;
+               } else if (opt == 'h') {
+                       tap_cli_allocate_usage(stdout);
+                       return 0;
+               } else if (opt == '?') {
+                       tap_cli_allocate_usage(stderr);
+                       return EINVAL;
+               }
+       }
+
+       rc = tap_ctl_allocate(&minor, &devname);
+       if (rc == 0)
+               printf("%s\n", devname);
+
+       return rc;
+}
+
+/* Write the one-line synopsis of the "free" command to @stream. */
+static void
+tap_cli_free_usage(FILE *stream)
+{
+       fputs("usage: free <-m minor>\n", stream);
+}
+
+/*
+ * "free" command: release the minor given with -m via tap_ctl_free().
+ *
+ * -h prints the usage.  Returns EINVAL (after printing the usage to
+ * stderr) on a bad option or when the minor is left at -1, otherwise the
+ * result of tap_ctl_free().
+ */
+static int
+tap_cli_free(int argc, char **argv)
+{
+       int opt;
+       int minor = -1;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "m:h")) != -1) {
+               if (opt == 'm') {
+                       minor = atoi(optarg);
+               } else if (opt == 'h') {
+                       tap_cli_free_usage(stdout);
+                       return 0;
+               } else if (opt == '?') {
+                       minor = -1;     /* force the usage exit below */
+                       break;
+               }
+       }
+
+       if (minor == -1) {
+               tap_cli_free_usage(stderr);
+               return EINVAL;
+       }
+
+       return tap_ctl_free(minor);
+}
+
+/* Write the one-line synopsis of the "create" command to @stream. */
+static void
+tap_cli_create_usage(FILE *stream)
+{
+       fputs("usage: create <-a args> [-d device name]\n", stream);
+}
+
+/*
+ * "create" command: call tap_ctl_create() with the -a argument string
+ * and, on success, print the resulting device name.
+ *
+ * Options: -a args (required), -d device name, -h prints the usage.
+ * Returns 0 on success, EINVAL on a bad option or a missing -a, or the
+ * error from tap_ctl_create().
+ */
+static int
+tap_cli_create(int argc, char **argv)
+{
+       char *args = NULL;
+       char *devname = NULL;
+       int opt, rc;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "a:d:h")) != -1) {
+               if (opt == 'a') {
+                       args = optarg;
+               } else if (opt == 'd') {
+                       devname = optarg;
+               } else if (opt == 'h') {
+                       tap_cli_create_usage(stdout);
+                       return 0;
+               } else if (opt == '?') {
+                       args = NULL;    /* force the usage exit below */
+                       break;
+               }
+       }
+
+       if (args == NULL) {
+               tap_cli_create_usage(stderr);
+               return EINVAL;
+       }
+
+       rc = tap_ctl_create(args, &devname);
+       if (rc == 0)
+               printf("%s\n", devname);
+
+       return rc;
+}
+
+/* Write the one-line synopsis of the "destroy" sub-command to @stream. */
+static void
+tap_cli_destroy_usage(FILE *stream)
+{
+       fputs("usage: destroy <-p pid> <-m minor>\n", stream);
+}
+
+/*
+ * "destroy" sub-command: parse -p <pid> and -m <minor>, then hand both to
+ * tap_ctl_destroy().
+ *
+ * Returns 0 after printing help for -h, EINVAL (usage on stderr) for an
+ * unrecognised option or when either value is still -1 after parsing,
+ * otherwise whatever tap_ctl_destroy() returns.
+ */
+static int
+tap_cli_destroy(int argc, char **argv)
+{
+       int opt;
+       int pid = -1;
+       int minor = -1;
+
+       /* reset getopt's scan position before walking this argument vector */
+       optind = 0;
+       while ((opt = getopt(argc, argv, "p:m:h")) != -1) {
+               if (opt == 'p') {
+                       pid = atoi(optarg);
+               } else if (opt == 'm') {
+                       minor = atoi(optarg);
+               } else if (opt == 'h') {
+                       tap_cli_destroy_usage(stdout);
+                       return 0;
+               } else if (opt == '?') {
+                       tap_cli_destroy_usage(stderr);
+                       return EINVAL;
+               }
+       }
+
+       if (pid != -1 && minor != -1)
+               return tap_ctl_destroy(pid, minor);
+
+       tap_cli_destroy_usage(stderr);
+       return EINVAL;
+}
+
+/* Write the one-line synopsis of the "spawn" sub-command to @stream. */
+static void
+tap_cli_spawn_usage(FILE *stream)
+{
+       fputs("usage: spawn\n", stream);
+}
+
+/*
+ * "spawn" sub-command: takes no options besides -h.  Calls
+ * tap_ctl_spawn() and reports the result on stdout.
+ *
+ * Returns 0 after printing help for -h or after a successful spawn,
+ * EINVAL (usage on stderr) for an unrecognised option, and the negative
+ * value returned by tap_ctl_spawn() when it fails (the current errno is
+ * printed in that case).
+ */
+static int
+tap_cli_spawn(int argc, char **argv)
+{
+       int opt;
+       pid_t child;
+
+       /* reset getopt's scan position before walking this argument vector */
+       optind = 0;
+       while ((opt = getopt(argc, argv, "h")) != -1) {
+               if (opt == 'h') {
+                       tap_cli_spawn_usage(stdout);
+                       return 0;
+               } else if (opt == '?') {
+                       tap_cli_spawn_usage(stderr);
+                       return EINVAL;
+               }
+       }
+
+       child = tap_ctl_spawn();
+       if (child >= 0) {
+               printf("tapdisk spawned with pid %d\n", child);
+               return 0;
+       }
+
+       printf("spawn failed: %d\n", errno);
+       return child;
+}
+
+/* Write the one-line synopsis of the "attach" sub-command to @stream. */
+static void
+tap_cli_attach_usage(FILE *stream)
+{
+       fputs("usage: attach <-p pid> <-m minor>\n", stream);
+}
+
+/*
+ * "attach" sub-command: parse -p <pid> and -m <minor>, then hand both to
+ * tap_ctl_attach().
+ *
+ * Returns 0 after printing help for -h, EINVAL (usage on stderr) for an
+ * unrecognised option or when either value is still -1 after parsing,
+ * otherwise whatever tap_ctl_attach() returns.
+ */
+static int
+tap_cli_attach(int argc, char **argv)
+{
+       int c, pid, minor;
+
+       pid   = -1;
+       minor = -1;
+
+       /* reset getopt's scan position before walking this argument vector */
+       optind = 0;
+       while ((c = getopt(argc, argv, "p:m:h")) != -1) {
+               switch (c) {
+               case 'p':
+                       pid = atoi(optarg);
+                       break;
+               case 'm':
+                       minor = atoi(optarg);
+                       break;
+               case '?':
+                       goto usage;
+               case 'h':
+                       /*
+                        * Requested help is regular output, not an error:
+                        * it goes to stdout, as in every other sub-command.
+                        */
+                       tap_cli_attach_usage(stdout);
+                       return 0;
+               }
+       }
+
+       if (pid == -1 || minor == -1)
+               goto usage;
+
+       return tap_ctl_attach(pid, minor);
+
+usage:
+       tap_cli_attach_usage(stderr);
+       return EINVAL;
+}
+
+/* Write the one-line synopsis of the "detach" sub-command to @stream. */
+static void
+tap_cli_detach_usage(FILE *stream)
+{
+       fputs("usage: detach <-p pid> <-m minor>\n", stream);
+}
+
+/*
+ * "detach" sub-command: parse -p <pid> and -m <minor>, then hand both to
+ * tap_ctl_detach().
+ *
+ * Returns 0 after printing help for -h, EINVAL (usage on stderr) for an
+ * unrecognised option or when either value is still -1 after parsing,
+ * otherwise whatever tap_ctl_detach() returns.
+ */
+static int
+tap_cli_detach(int argc, char **argv)
+{
+       int opt;
+       int pid = -1;
+       int minor = -1;
+
+       /* reset getopt's scan position before walking this argument vector */
+       optind = 0;
+       while ((opt = getopt(argc, argv, "p:m:h")) != -1) {
+               if (opt == 'p') {
+                       pid = atoi(optarg);
+               } else if (opt == 'm') {
+                       minor = atoi(optarg);
+               } else if (opt == 'h') {
+                       tap_cli_detach_usage(stdout);
+                       return 0;
+               } else if (opt == '?') {
+                       tap_cli_detach_usage(stderr);
+                       return EINVAL;
+               }
+       }
+
+       if (pid != -1 && minor != -1)
+               return tap_ctl_detach(pid, minor);
+
+       tap_cli_detach_usage(stderr);
+       return EINVAL;
+}
+
+/* Write the one-line synopsis of the "close" sub-command to @stream. */
+static void
+tap_cli_close_usage(FILE *stream)
+{
+       fputs("usage: close <-p pid> <-m minor> [-f force]\n", stream);
+}
+
+/*
+ * "close" sub-command: parse -p <pid>, -m <minor> and the optional -f
+ * flag, then call tap_ctl_close().  The force argument is 0 by default
+ * and -1 when -f was given.
+ *
+ * Returns 0 after printing help for -h, EINVAL (usage on stderr) for an
+ * unrecognised option or when pid or minor is still -1 after parsing,
+ * otherwise whatever tap_ctl_close() returns.
+ */
+static int
+tap_cli_close(int argc, char **argv)
+{
+       int opt;
+       int pid = -1;
+       int minor = -1;
+       int force = 0;
+
+       /* reset getopt's scan position before walking this argument vector */
+       optind = 0;
+       while ((opt = getopt(argc, argv, "p:m:fh")) != -1) {
+               if (opt == 'p') {
+                       pid = atoi(optarg);
+               } else if (opt == 'm') {
+                       minor = atoi(optarg);
+               } else if (opt == 'f') {
+                       force = -1;
+               } else if (opt == 'h') {
+                       tap_cli_close_usage(stdout);
+                       return 0;
+               } else if (opt == '?') {
+                       tap_cli_close_usage(stderr);
+                       return EINVAL;
+               }
+       }
+
+       if (pid != -1 && minor != -1)
+               return tap_ctl_close(pid, minor, force);
+
+       tap_cli_close_usage(stderr);
+       return EINVAL;
+}
+
+/* Write the one-line synopsis of the "pause" sub-command to @stream. */
+static void
+tap_cli_pause_usage(FILE *stream)
+{
+       fputs("usage: pause <-p pid> <-m minor>\n", stream);
+}
+
+/*
+ * "pause" sub-command: parse -p <pid> and -m <minor>, then hand both to
+ * tap_ctl_pause().
+ *
+ * Returns 0 after printing help for -h, EINVAL (usage on stderr) for an
+ * unrecognised option or when either value is still -1 after parsing,
+ * otherwise whatever tap_ctl_pause() returns.
+ */
+static int
+tap_cli_pause(int argc, char **argv)
+{
+       int opt;
+       int pid = -1;
+       int minor = -1;
+
+       /* reset getopt's scan position before walking this argument vector */
+       optind = 0;
+       while ((opt = getopt(argc, argv, "p:m:h")) != -1) {
+               if (opt == 'p') {
+                       pid = atoi(optarg);
+               } else if (opt == 'm') {
+                       minor = atoi(optarg);
+               } else if (opt == 'h') {
+                       tap_cli_pause_usage(stdout);
+                       return 0;
+               } else if (opt == '?') {
+                       tap_cli_pause_usage(stderr);
+                       return EINVAL;
+               }
+       }
+
+       if (pid != -1 && minor != -1)
+               return tap_ctl_pause(pid, minor);
+
+       tap_cli_pause_usage(stderr);
+       return EINVAL;
+}
+
+/* Write the one-line synopsis of the "unpause" sub-command to @stream. */
+static void
+tap_cli_unpause_usage(FILE *stream)
+{
+       fputs("usage: unpause <-p pid> <-m minor> [-a args]\n", stream);
+}
+
+/*
+ * "unpause" sub-command: parse -p <pid>, -m <minor> and the optional
+ * -a <args>, then call tap_ctl_unpause().  args stays NULL when -a is
+ * not given.
+ *
+ * Returns 0 after printing help for -h, EINVAL (usage on stderr) for an
+ * unrecognised option or when pid or minor is still -1 after parsing,
+ * otherwise whatever tap_ctl_unpause() returns.
+ */
+int
+tap_cli_unpause(int argc, char **argv)
+{
+       int opt;
+       int pid = -1;
+       int minor = -1;
+       const char *args = NULL;
+
+       /* reset getopt's scan position before walking this argument vector */
+       optind = 0;
+       while ((opt = getopt(argc, argv, "p:m:a:h")) != -1) {
+               if (opt == 'p') {
+                       pid = atoi(optarg);
+               } else if (opt == 'm') {
+                       minor = atoi(optarg);
+               } else if (opt == 'a') {
+                       args = optarg;
+               } else if (opt == 'h') {
+                       tap_cli_unpause_usage(stdout);
+                       return 0;
+               } else if (opt == '?') {
+                       tap_cli_unpause_usage(stderr);
+                       return EINVAL;
+               }
+       }
+
+       if (pid != -1 && minor != -1)
+               return tap_ctl_unpause(pid, minor, args);
+
+       tap_cli_unpause_usage(stderr);
+       return EINVAL;
+}
+
+/* Write the one-line synopsis of the "major" sub-command to @stream. */
+static void
+tap_cli_major_usage(FILE *stream)
+{
+       fputs("usage: major [-h]\n", stream);
+}
+
+/*
+ * "major" sub-command: print the device major number on stdout.
+ *
+ * -b selects the block device (the default) and queries
+ * tap_ctl_blk_major(); -c selects the character device, for which this
+ * function always fails with EINVAL.
+ *
+ * Returns 0 after printing help for -h or after printing the major
+ * number, EINVAL (usage on stderr) for any other option, and the negated
+ * value of a negative major otherwise.
+ */
+static int
+tap_cli_major(int argc, char **argv)
+{
+       int c, chr, major;
+
+       chr = 0;
+
+       /*
+        * Reset getopt's scan position like every other sub-command does,
+        * so parsing does not depend on getopt never having run before.
+        */
+       optind = 0;
+       while ((c = getopt(argc, argv, "bch")) != -1) {
+               switch (c) {
+               case 'b':
+                       chr = 0;
+                       break;
+               case 'c':
+                       chr = 1;
+                       break;
+               case '?':
+                       goto usage;
+               case 'h':
+                       tap_cli_major_usage(stdout);
+                       return 0;
+               default:
+                       goto usage;
+               }
+       }
+
+       if (chr)
+               major = -EINVAL;
+       else
+               major = tap_ctl_blk_major();
+
+       /* a negative major carries a negated error code; hand it back positive */
+       if (major < 0)
+               return -major;
+
+       printf("%d\n", major);
+
+       return 0;
+
+usage:
+       tap_cli_major_usage(stderr);
+       return EINVAL;
+}
+
+/* Write the one-line synopsis of the "open" sub-command to @stream. */
+static void
+tap_cli_open_usage(FILE *stream)
+{
+       fputs("usage: open <-p pid> <-m minor> <-a args>\n", stream);
+}
+
+/*
+ * "open" sub-command: -p <pid>, -m <minor> and -a <args> are all
+ * mandatory and are handed to tap_ctl_open().
+ *
+ * Returns 0 after printing help for -h, EINVAL (usage on stderr) for an
+ * unrecognised option or when pid/minor is still -1 or args is still
+ * NULL after parsing, otherwise whatever tap_ctl_open() returns.
+ */
+static int
+tap_cli_open(int argc, char **argv)
+{
+       int opt;
+       int pid = -1;
+       int minor = -1;
+       const char *args = NULL;
+
+       /* reset getopt's scan position before walking this argument vector */
+       optind = 0;
+       while ((opt = getopt(argc, argv, "a:m:p:h")) != -1) {
+               if (opt == 'p') {
+                       pid = atoi(optarg);
+               } else if (opt == 'm') {
+                       minor = atoi(optarg);
+               } else if (opt == 'a') {
+                       args = optarg;
+               } else if (opt == 'h') {
+                       tap_cli_open_usage(stdout);
+                       return 0;
+               } else if (opt == '?') {
+                       tap_cli_open_usage(stderr);
+                       return EINVAL;
+               }
+       }
+
+       if (pid != -1 && minor != -1 && args != NULL)
+               return tap_ctl_open(pid, minor, args);
+
+       tap_cli_open_usage(stderr);
+       return EINVAL;
+}
+
+/* Write the two-line synopsis of the "check" sub-command to @stream. */
+static void
+tap_cli_check_usage(FILE *stream)
+{
+       fputs("usage: check\n"
+             "(checks whether environment is suitable for tapdisk2)\n",
+             stream);
+}
+
+/*
+ * "check" sub-command: accepts no arguments at all.  Runs
+ * tap_ctl_check() and prints the message it hands back on stdout,
+ * whatever the outcome.
+ *
+ * Returns EINVAL (usage on stderr) when any argument follows the
+ * command name, otherwise the value returned by tap_ctl_check().
+ */
+static int
+tap_cli_check(int argc, char **argv)
+{
+       const char *msg;
+       int rc;
+
+       if (argc != 1) {
+               tap_cli_check_usage(stderr);
+               return EINVAL;
+       }
+
+       rc = tap_ctl_check(&msg);
+       printf("%s\n", msg);
+
+       return rc;
+}
+
+struct command commands[] = { /* sub-command name -> handler; searched by get_command(), listed in this order by print_commands() */
+       { .name = "list",         .func = tap_cli_list          },
+       { .name = "allocate",     .func = tap_cli_allocate      },
+       { .name = "free",         .func = tap_cli_free          },
+       { .name = "create",       .func = tap_cli_create        },
+       { .name = "destroy",      .func = tap_cli_destroy       },
+       { .name = "spawn",        .func = tap_cli_spawn         },
+       { .name = "attach",       .func = tap_cli_attach        },
+       { .name = "detach",       .func = tap_cli_detach        },
+       { .name = "open",         .func = tap_cli_open          },
+       { .name = "close",        .func = tap_cli_close         },
+       { .name = "pause",        .func = tap_cli_pause         },
+       { .name = "unpause",      .func = tap_cli_unpause       },
+       { .name = "major",        .func = tap_cli_major         },
+       { .name = "check",        .func = tap_cli_check         }, /* handler refuses any extra argument */
+};
+
+#define print_commands()                                       \
+       do {                                                    \
+               int i, n;                                       \
+               n = sizeof(commands) / sizeof(struct command);  \
+               printf("COMMAND := { ");                        \
+               printf("%s", commands[0].name);                 \
+               for (i = 1; i < n; i++)                         \
+                       printf(" | %s", commands[i].name);      \
+               printf(" }\n");                                 \
+       } while (0)
+
+/*
+ * Print the top-level synopsis and the list of sub-commands on stdout,
+ * then terminate the process with exit status 0.  Never returns.
+ */
+void
+help(void)
+{
+       puts("usage: tap-ctl COMMAND [OPTIONS]");
+       print_commands();
+       exit(0);
+}
+
+/*
+ * Look up @command in commands[] by exact name match.
+ *
+ * Names of 25 characters or more are rejected without searching.
+ * Returns a pointer to the matching table entry, or NULL when there is
+ * no match.
+ */
+struct command *
+get_command(char *command)
+{
+       struct command *entry;
+       struct command *const end =
+               commands + sizeof(commands) / sizeof(commands[0]);
+
+       if (strnlen(command, 25) >= 25)
+               return NULL;
+
+       for (entry = commands; entry != end; entry++) {
+               if (strcmp(command, entry->name) == 0)
+                       return entry;
+       }
+
+       return NULL;
+}
+
+/*
+ * Entry point.  argv[1] names the sub-command; everything after it is
+ * forwarded to that sub-command's handler, except "--debug", which is
+ * consumed here and sets tap_ctl_debug.
+ *
+ * With no sub-command, or an unknown one, help() prints the command list
+ * and exits.  tap_ctl_check() runs before any handler; if it reports a
+ * non-zero result its message is printed and that result is returned
+ * unchanged.  Otherwise the exit status is the handler's return value
+ * with any negative sign removed.
+ */
+int
+main(int argc, char *argv[])
+{
+       char **sub_argv;
+       const char *msg;
+       struct command *cmd;
+       int sub_argc, src, used, ret;
+
+#ifdef CORE_DUMP
+       #include <sys/resource.h>
+       struct rlimit rlim;
+       rlim.rlim_cur = RLIM_INFINITY;
+       rlim.rlim_max = RLIM_INFINITY;
+       if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+               PERROR("setrlimit failed");
+#endif
+
+       if (argc < 2)
+               help();
+
+       cmd = get_command(argv[1]);
+       if (cmd == NULL) {
+               EPRINTF("invalid COMMAND %s", argv[1]);
+               help();
+       }
+
+       ret = tap_ctl_check(&msg);
+       if (ret != 0) {
+               printf("%s\n", msg);
+               return ret;
+       }
+
+       /* the handler sees the command name as its argv[0] */
+       sub_argc = argc - 1;
+       sub_argv = malloc(sizeof(*sub_argv) * sub_argc);
+       if (sub_argv == NULL)
+               exit(ENOMEM);
+
+       sub_argv[0] = cmd->name;
+       used = 1;
+
+       /* copy argv[2..] across, dropping every "--debug" on the way */
+       for (src = 2; src < argc; src++) {
+               if (strcmp(argv[src], "--debug") == 0) {
+                       tap_ctl_debug = 1;
+                       continue;
+               }
+
+               sub_argv[used++] = argv[src];
+       }
+
+       ret = cmd->func(used, sub_argv);
+
+       free(sub_argv);
+
+       if (ret < 0)
+               ret = -ret;
+
+       return ret;
+}
diff --git a/tools/blktap2/control/tap-ctl.h b/tools/blktap2/control/tap-ctl.h
new file mode 100644 (file)
index 0000000..de0bc29
--- /dev/null
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __TAP_CTL_H__
+#define __TAP_CTL_H__
+
+#include <syslog.h>
+#include <errno.h>
+#include <tapdisk-message.h>
+
+extern int tap_ctl_debug;
+
+#ifdef TAPCTL /* the logging helpers below exist only when the includer defines TAPCTL */
+#define DBG(_f, _a...) /* printf() the message, but only when tap_ctl_debug is non-zero */ \
+       do {                                    \
+               if (tap_ctl_debug)              \
+                       printf(_f, ##_a);       \
+       } while (0)
+/* syslog-based helpers: DPRINTF logs at LOG_INFO; EPRINTF and PERROR log at LOG_ERR with a "tap-err:<function>: " prefix; PERROR also appends ": " and strerror(errno) */
+#define DPRINTF(_f, _a...) syslog(LOG_INFO, _f, ##_a)
+#define EPRINTF(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a)
+#define  PERROR(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f ": %s", __func__, ##_a, \
+                                 strerror(errno))
+#endif
+
+void tap_ctl_version(int *major, int *minor);
+int tap_ctl_kernel_version(int *major, int *minor);
+
+int tap_ctl_check_blktap(const char **message);
+int tap_ctl_check_version(const char **message);
+int tap_ctl_check(const char **message);
+
+int tap_ctl_connect(const char *path, int *socket);
+int tap_ctl_connect_id(int id, int *socket);
+int tap_ctl_read_message(int fd, tapdisk_message_t *message, int timeout);
+int tap_ctl_write_message(int fd, tapdisk_message_t *message, int timeout);
+int tap_ctl_send_and_receive(int fd, tapdisk_message_t *message, int timeout);
+int tap_ctl_connect_send_and_receive(int id,
+                                    tapdisk_message_t *message, int timeout);
+char *tap_ctl_socket_name(int id);
+
+typedef struct { /* element type of the list handled by tap_ctl_list() / tap_ctl_free_list(); also filled in by tap_ctl_find() */
+       int         id;     /* NOTE(review): presumably the id accepted by tap_ctl_connect_id() -- confirm */
+       pid_t       pid;    /* a process id; NOTE(review): presumably the tapdisk process -- confirm */
+       int         minor;  /* NOTE(review): presumably the device minor used by attach/open/close -- confirm */
+       int         state;  /* the meaning of the values is not defined in this header */
+       char       *type;   /* string; tap_ctl_find() takes a "type" argument of the same name */
+       char       *path;   /* string; tap_ctl_find() takes a "path" argument of the same name */
+} tap_list_t;
+
+int tap_ctl_get_driver_id(const char *handle);
+
+int tap_ctl_list(tap_list_t ***list);
+void tap_ctl_free_list(tap_list_t **list);
+int tap_ctl_find(const char *type, const char *path, tap_list_t *tap);
+
+int tap_ctl_allocate(int *minor, char **devname);
+int tap_ctl_free(const int minor);
+
+int tap_ctl_create(const char *params, char **devname);
+int tap_ctl_destroy(const int id, const int minor);
+
+int tap_ctl_spawn(void);
+pid_t tap_ctl_get_pid(const int id);
+
+int tap_ctl_attach(const int id, const int minor);
+int tap_ctl_detach(const int id, const int minor);
+
+int tap_ctl_open(const int id, const int minor, const char *params);
+int tap_ctl_close(const int id, const int minor, const int force);
+
+int tap_ctl_pause(const int id, const int minor);
+int tap_ctl_unpause(const int id, const int minor, const char *params);
+
+int tap_ctl_blk_major(void);
+
+#endif
diff --git a/tools/blktap2/drivers/Makefile b/tools/blktap2/drivers/Makefile
new file mode 100644 (file)
index 0000000..5328c40
--- /dev/null
@@ -0,0 +1,113 @@
+XEN_ROOT=$(CURDIR)/../../..
+BLKTAP_ROOT= ..
+include $(XEN_ROOT)/tools/Rules.mk
+# Directory searched for the vhd library (see VHDLIBS below).
+LIBVHDDIR  = $(BLKTAP_ROOT)/vhd/lib
+# Program groups; the install rule copies IBIN, LOCK_UTIL and QCOW_UTIL into INST_DIR.
+IBIN       = tapdisk2 td-util tapdisk-client tapdisk-stream tapdisk-diff
+QCOW_UTIL  = img2qcow qcow-create qcow2raw
+LOCK_UTIL  = lock-util
+INST_DIR   = $(sbindir)
+# Warnings are fatal (-Werror), except that unused-entity warnings are switched off (-Wno-unused).
+CFLAGS    += -Werror
+CFLAGS    += -Wno-unused
+CFLAGS    += -fno-strict-aliasing
+CFLAGS    += -I$(BLKTAP_ROOT)/include -I$(BLKTAP_ROOT)/drivers
+CFLAGS    += $(CFLAGS_libxenctrl)
+CFLAGS    += -D_GNU_SOURCE
+CFLAGS    += -DUSE_NFS_LOCKS
+# drivers/block-log.c incorrectly uses libxc internals
+CFLAGS    += -I$(XEN_ROOT)/tools/libxc
+# -fPIC is added only when CONFIG_X86_64 is y.
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS            += -fPIC
+endif
+
+VHDLIBS    := -L$(LIBVHDDIR) -lvhd
+# REMUS-OBJS is appended to BLK-OBJS-y further down.
+REMUS-OBJS  := block-remus.o
+REMUS-OBJS  += hashtable.o
+REMUS-OBJS  += hashtable_itr.o
+REMUS-OBJS  += hashtable_utility.o
+# Target-specific variable: AIOLIBS is -laio only for the programs named on the next line.
+tapdisk2 tapdisk-stream tapdisk-diff $(QCOW_UTIL): AIOLIBS := -laio
+# NOTE(review): the memshr block below is active only if CONFIG_Linux equals the literal __fixme__; looks deliberately disabled -- confirm.
+MEMSHRLIBS :=
+ifeq ($(CONFIG_Linux), __fixme__)
+MEMSHR_DIR = $(XEN_ROOT)/tools/memshr
+CFLAGS += -DMEMSHR
+CFLAGS += -I $(MEMSHR_DIR)
+MEMSHRLIBS += -L$(XEN_ROOT)/tools/libxc -lxenctrl $(MEMSHR_DIR)/libmemshr.a
+endif
+# VHD_STATIC=y adds -static to CFLAGS for the td-util target only.
+ifeq ($(VHD_STATIC),y)
+td-util: CFLAGS += -static
+endif
+# Per-OS object: blk_linux.o or blk_netbsd.o, chosen by CONFIG_Linux / CONFIG_NetBSD.
+PORTABLE-OBJS-y :=
+PORTABLE-OBJS-$(CONFIG_Linux)  += blk_linux.o
+PORTABLE-OBJS-$(CONFIG_NetBSD) += blk_netbsd.o
+# TAP-OBJS-y: objects linked into tapdisk2, tapdisk-stream, tapdisk-diff and the qcow utilities.
+TAP-OBJS-y  := scheduler.o
+TAP-OBJS-y  += tapdisk-vbd.o
+TAP-OBJS-y  += tapdisk-control.o
+TAP-OBJS-y  += tapdisk-image.o
+TAP-OBJS-y  += tapdisk-driver.o
+TAP-OBJS-y  += tapdisk-disktype.o
+TAP-OBJS-y  += tapdisk-interface.o
+TAP-OBJS-y  += tapdisk-server.o
+TAP-OBJS-y  += tapdisk-queue.o
+TAP-OBJS-y  += tapdisk-filter.o
+TAP-OBJS-y  += tapdisk-log.o
+TAP-OBJS-y  += tapdisk-utils.o
+TAP-OBJS-y  += io-optimize.o
+TAP-OBJS-y  += lock.o
+TAP-OBJS-y  += $(PORTABLE-OBJS-y)
+# MISC-OBJS-y is linked into tapdisk2 only.
+MISC-OBJS-y := atomicio.o
+# BLK-OBJS-y: the block-*.o driver objects plus aes.o, md5.o, the per-OS object and REMUS-OBJS.
+BLK-OBJS-y  := block-aio.o
+BLK-OBJS-y  += block-ram.o
+BLK-OBJS-y  += block-cache.o
+BLK-OBJS-y  += block-vhd.o
+BLK-OBJS-y  += block-log.o
+BLK-OBJS-y  += block-qcow.o
+BLK-OBJS-y  += aes.o
+BLK-OBJS-y  += md5.o
+BLK-OBJS-y  += $(PORTABLE-OBJS-y)
+BLK-OBJS-y  += $(REMUS-OBJS)
+
+all: $(IBIN) lock-util qcow-util
+
+
+tapdisk2: $(TAP-OBJS-y) $(BLK-OBJS-y) $(MISC-OBJS-y) tapdisk2.o
+       $(CC) -o $@ $^ $(LDFLAGS) -lrt -lz $(VHDLIBS) $(AIOLIBS) $(MEMSHRLIBS) -lm  $(APPEND_LDFLAGS)
+
+tapdisk-client: tapdisk-client.o
+       $(CC) -o $@ $^ $(LDFLAGS) -lrt $(APPEND_LDFLAGS)
+# Static pattern rule: each listed program links its own <name>.o with the shared object lists.
+tapdisk-stream tapdisk-diff: %: %.o $(TAP-OBJS-y) $(BLK-OBJS-y)
+       $(CC) -o $@ $^ $(LDFLAGS) -lrt -lz $(VHDLIBS) $(AIOLIBS) $(MEMSHRLIBS) -lm $(APPEND_LDFLAGS)
+
+td-util: td.o tapdisk-utils.o tapdisk-log.o $(PORTABLE-OBJS-y)
+       $(CC) -o $@ $^ $(LDFLAGS) $(VHDLIBS) $(APPEND_LDFLAGS)
+# lock-util is compiled straight from lock.c with -DUTIL rather than linked from lock.o.
+lock-util: lock.c
+       $(CC) $(CFLAGS) -DUTIL -o lock-util lock.c $(LDFLAGS) $(APPEND_LDFLAGS)
+
+.PHONY: qcow-util
+qcow-util: img2qcow qcow2raw qcow-create
+
+img2qcow qcow2raw qcow-create: %: %.o $(TAP-OBJS-y) $(BLK-OBJS-y)
+       $(CC) -o $@ $^ $(LDFLAGS) -lrt -lz $(VHDLIBS) $(AIOLIBS) $(MEMSHRLIBS) -lm $(APPEND_LDFLAGS)
+
+install: all
+       $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+       $(INSTALL_PROG) $(IBIN) $(LOCK_UTIL) $(QCOW_UTIL) $(DESTDIR)$(INST_DIR)
+# NOTE(review): LIB is never assigned in this Makefile; it expands to nothing unless the included Rules.mk sets it.
+clean:
+       rm -rf .*.d *.o *~ xen TAGS $(IBIN) $(LIB) $(LOCK_UTIL) $(QCOW_UTIL)
+
+distclean: clean
+
+.PHONY: clean install distclean
diff --git a/tools/blktap2/drivers/aes.c b/tools/blktap2/drivers/aes.c
new file mode 100644 (file)
index 0000000..ea81ae5
--- /dev/null
@@ -0,0 +1,1319 @@
+/**\r
+ * \r
+ * aes.c - integrated in QEMU by Fabrice Bellard from the OpenSSL project.\r
+ */\r
+/*\r
+ * rijndael-alg-fst.c\r
+ *\r
+ * @version 3.0 (December 2000)\r
+ *\r
+ * Optimised ANSI C code for the Rijndael cipher (now AES)\r
+ *\r
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>\r
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>\r
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>\r
+ *\r
+ * This code is hereby placed in the public domain.\r
+ *\r
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS\r
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\r
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE\r
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\r
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\r
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR\r
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\r
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE\r
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,\r
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+ */\r
+//#include "vl.h"\r
+#include <inttypes.h>\r
+#include <string.h>\r
+#include "aes.h"\r
+\r
+//#define NDEBUG\r
+#include <assert.h>\r
+\r
+typedef uint32_t u32;\r
+typedef uint16_t u16;\r
+typedef uint8_t u8;\r
+/* Size limits; NOTE(review): presumably the 256-bit key maximum and its round count -- confirm against the key-setup code. */\r
+#define MAXKC   (256/32) /* 256 bits counted in 32-bit words */\r
+#define MAXKB   (256/8) /* 256 bits counted in bytes */\r
+#define MAXNR   14\r
+/* GETU32 assembles a u32 from four bytes with pt[0] as the most significant byte; PUTU32 stores st into ct[0..3] in the same byte order. */\r
+/* FULL_UNROLL is explicitly undefined here.  The inherited note says it controls loop-unrolling in aes_core.c (NOTE(review): presumably the upstream name of this file). */\r
+#undef FULL_UNROLL\r
+# define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] <<  8) ^ ((u32)(pt)[3]))\r
+# define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >>  8); (ct)[3] = (u8)(st); } /* expands to a bare { } block, not do { } while (0) */\r
+\r
+/*\r
+Te0[x] = S [x].[02, 01, 01, 03];\r
+Te1[x] = S [x].[03, 02, 01, 01];\r
+Te2[x] = S [x].[01, 03, 02, 01];\r
+Te3[x] = S [x].[01, 01, 03, 02];\r
+Te4[x] = S [x].[01, 01, 01, 01];\r
+\r
+Td0[x] = Si[x].[0e, 09, 0d, 0b];\r
+Td1[x] = Si[x].[0b, 0e, 09, 0d];\r
+Td2[x] = Si[x].[0d, 0b, 0e, 09];\r
+Td3[x] = Si[x].[09, 0d, 0b, 0e];\r
+Td4[x] = Si[x].[01, 01, 01, 01];\r
+*/\r
+\r
+static const u32 Te0[256] = { /* 256-entry constant table; per the key above, Te0[x] = S[x].[02, 01, 01, 03] */\r
+    0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,\r
+    0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,\r
+    0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,\r
+    0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,\r
+    0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,\r
+    0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,\r
+    0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,\r
+    0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,\r
+    0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,\r
+    0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,\r
+    0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,\r
+    0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,\r
+    0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,\r
+    0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,\r
+    0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,\r
+    0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,\r
+    0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,\r
+    0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,\r
+    0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,\r
+    0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,\r
+    0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,\r
+    0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,\r
+    0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,\r
+    0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,\r
+    0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,\r
+    0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,\r
+    0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,\r
+    0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,\r
+    0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,\r
+    0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,\r
+    0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,\r
+    0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,\r
+    0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,\r
+    0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,\r
+    0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,\r
+    0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,\r
+    0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,\r
+    0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,\r
+    0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,\r
+    0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,\r
+    0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,\r
+    0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,\r
+    0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,\r
+    0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,\r
+    0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,\r
+    0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,\r
+    0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,\r
+    0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,\r
+    0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,\r
+    0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,\r
+    0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,\r
+    0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,\r
+    0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,\r
+    0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,\r
+    0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,\r
+    0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,\r
+    0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,\r
+    0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,\r
+    0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,\r
+    0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,\r
+    0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,\r
+    0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,\r
+    0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,\r
+    0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,\r
+};\r
+static const u32 Te1[256] = { /* 256-entry constant table; per the key above, Te1[x] = S[x].[03, 02, 01, 01] */\r
+    0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,\r
+    0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,\r
+    0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,\r
+    0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,\r
+    0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,\r
+    0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,\r
+    0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,\r
+    0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,\r
+    0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,\r
+    0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,\r
+    0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,\r
+    0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,\r
+    0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,\r
+    0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,\r
+    0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,\r
+    0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,\r
+    0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,\r
+    0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,\r
+    0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,\r
+    0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,\r
+    0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,\r
+    0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,\r
+    0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,\r
+    0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,\r
+    0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,\r
+    0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,\r
+    0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,\r
+    0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,\r
+    0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,\r
+    0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,\r
+    0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,\r
+    0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,\r
+    0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,\r
+    0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,\r
+    0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,\r
+    0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,\r
+    0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,\r
+    0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,\r
+    0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,\r
+    0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,\r
+    0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,\r
+    0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,\r
+    0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,\r
+    0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,\r
+    0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,\r
+    0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,\r
+    0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,\r
+    0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,\r
+    0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,\r
+    0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,\r
+    0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,\r
+    0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,\r
+    0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,\r
+    0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,\r
+    0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,\r
+    0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,\r
+    0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,\r
+    0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,\r
+    0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,\r
+    0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,\r
+    0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,\r
+    0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,\r
+    0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,\r
+    0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,\r
+};\r
+static const u32 Te2[256] = { /* Te2[x] bytes, high to low: s, 3*s, 2*s, s (GF(2^8) multiples), where s is the byte replicated in Te4[x] */\r
+    0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,\r
+    0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,\r
+    0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,\r
+    0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,\r
+    0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,\r
+    0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,\r
+    0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,\r
+    0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,\r
+    0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,\r
+    0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,\r
+    0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,\r
+    0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,\r
+    0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,\r
+    0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,\r
+    0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,\r
+    0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,\r
+    0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,\r
+    0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,\r
+    0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,\r
+    0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,\r
+    0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,\r
+    0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,\r
+    0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,\r
+    0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,\r
+    0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,\r
+    0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,\r
+    0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,\r
+    0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,\r
+    0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,\r
+    0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,\r
+    0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,\r
+    0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,\r
+    0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,\r
+    0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,\r
+    0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,\r
+    0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,\r
+    0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,\r
+    0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,\r
+    0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,\r
+    0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,\r
+    0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,\r
+    0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,\r
+    0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,\r
+    0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,\r
+    0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,\r
+    0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,\r
+    0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,\r
+    0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,\r
+    0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,\r
+    0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,\r
+    0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,\r
+    0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,\r
+    0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,\r
+    0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,\r
+    0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,\r
+    0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,\r
+    0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,\r
+    0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,\r
+    0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,\r
+    0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,\r
+    0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,\r
+    0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,\r
+    0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,\r
+    0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,\r
+};\r
+static const u32 Te3[256] = { /* Te3[x] bytes, high to low: s, s, 3*s, 2*s (GF(2^8) multiples), i.e. Te2[x] rotated right by 8 bits; s is the byte replicated in Te4[x] */\r
+\r
+    0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,\r
+    0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,\r
+    0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,\r
+    0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,\r
+    0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,\r
+    0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,\r
+    0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,\r
+    0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,\r
+    0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,\r
+    0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,\r
+    0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,\r
+    0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,\r
+    0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,\r
+    0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,\r
+    0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,\r
+    0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,\r
+    0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,\r
+    0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,\r
+    0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,\r
+    0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,\r
+    0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,\r
+    0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,\r
+    0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,\r
+    0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,\r
+    0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,\r
+    0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,\r
+    0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,\r
+    0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,\r
+    0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,\r
+    0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,\r
+    0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,\r
+    0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,\r
+    0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,\r
+    0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,\r
+    0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,\r
+    0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,\r
+    0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,\r
+    0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,\r
+    0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,\r
+    0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,\r
+    0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,\r
+    0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,\r
+    0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,\r
+    0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,\r
+    0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,\r
+    0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,\r
+    0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,\r
+    0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,\r
+    0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,\r
+    0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,\r
+    0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,\r
+    0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,\r
+    0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,\r
+    0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,\r
+    0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,\r
+    0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,\r
+    0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,\r
+    0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,\r
+    0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,\r
+    0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,\r
+    0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,\r
+    0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,\r
+    0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,\r
+    0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,\r
+};\r
+static const u32 Te4[256] = { /* Te4[x]: one substitution byte replicated in all four byte lanes (AES S-box values, e.g. Te4[0] = 0x63); the key schedule masks out a single lane per lookup */\r
+    0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,\r
+    0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,\r
+    0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,\r
+    0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,\r
+    0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,\r
+    0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,\r
+    0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,\r
+    0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,\r
+    0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,\r
+    0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,\r
+    0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,\r
+    0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,\r
+    0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,\r
+    0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,\r
+    0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,\r
+    0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,\r
+    0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,\r
+    0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,\r
+    0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,\r
+    0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,\r
+    0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,\r
+    0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,\r
+    0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,\r
+    0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,\r
+    0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,\r
+    0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,\r
+    0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,\r
+    0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,\r
+    0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,\r
+    0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,\r
+    0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,\r
+    0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,\r
+    0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,\r
+    0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,\r
+    0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,\r
+    0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,\r
+    0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,\r
+    0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,\r
+    0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,\r
+    0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,\r
+    0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,\r
+    0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,\r
+    0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,\r
+    0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,\r
+    0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,\r
+    0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,\r
+    0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,\r
+    0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,\r
+    0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,\r
+    0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,\r
+    0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,\r
+    0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,\r
+    0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,\r
+    0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,\r
+    0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,\r
+    0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,\r
+    0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,\r
+    0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,\r
+    0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,\r
+    0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,\r
+    0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,\r
+    0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,\r
+    0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,\r
+    0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,\r
+};\r
+static const u32 Td0[256] = { /* Td0[x] bytes, high to low: 0e*t, 09*t, 0d*t, 0b*t (GF(2^8) multiples), where t is the byte replicated in Td4[x] */\r
+    0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,\r
+    0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,\r
+    0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,\r
+    0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,\r
+    0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,\r
+    0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,\r
+    0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,\r
+    0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,\r
+    0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,\r
+    0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,\r
+    0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,\r
+    0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,\r
+    0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,\r
+    0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,\r
+    0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,\r
+    0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,\r
+    0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,\r
+    0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,\r
+    0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,\r
+    0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,\r
+    0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,\r
+    0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,\r
+    0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,\r
+    0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,\r
+    0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,\r
+    0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,\r
+    0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,\r
+    0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,\r
+    0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,\r
+    0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,\r
+    0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,\r
+    0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,\r
+    0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,\r
+    0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,\r
+    0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,\r
+    0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,\r
+    0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,\r
+    0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,\r
+    0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,\r
+    0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,\r
+    0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,\r
+    0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,\r
+    0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,\r
+    0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,\r
+    0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,\r
+    0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,\r
+    0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,\r
+    0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,\r
+    0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,\r
+    0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,\r
+    0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,\r
+    0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,\r
+    0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,\r
+    0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,\r
+    0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,\r
+    0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,\r
+    0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,\r
+    0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,\r
+    0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,\r
+    0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,\r
+    0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,\r
+    0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,\r
+    0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,\r
+    0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,\r
+};\r
+static const u32 Td1[256] = { /* Td1[x] bytes, high to low: 0b*t, 0e*t, 09*t, 0d*t (GF(2^8) multiples), i.e. Td0[x] rotated right by 8 bits; t is the byte replicated in Td4[x] */\r
+    0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,\r
+    0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,\r
+    0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,\r
+    0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,\r
+    0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,\r
+    0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,\r
+    0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,\r
+    0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,\r
+    0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,\r
+    0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,\r
+    0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,\r
+    0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,\r
+    0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,\r
+    0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,\r
+    0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,\r
+    0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,\r
+    0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,\r
+    0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,\r
+    0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,\r
+    0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,\r
+    0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,\r
+    0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,\r
+    0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,\r
+    0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,\r
+    0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,\r
+    0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,\r
+    0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,\r
+    0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,\r
+    0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,\r
+    0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,\r
+    0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,\r
+    0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,\r
+    0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,\r
+    0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,\r
+    0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,\r
+    0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,\r
+    0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,\r
+    0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,\r
+    0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,\r
+    0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,\r
+    0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,\r
+    0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,\r
+    0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,\r
+    0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,\r
+    0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,\r
+    0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,\r
+    0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,\r
+    0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,\r
+    0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,\r
+    0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,\r
+    0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,\r
+    0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,\r
+    0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,\r
+    0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,\r
+    0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,\r
+    0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,\r
+    0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,\r
+    0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,\r
+    0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,\r
+    0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,\r
+    0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,\r
+    0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,\r
+    0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,\r
+    0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,\r
+};\r
+static const u32 Td2[256] = { /* Td2[x] bytes, high to low: 0d*t, 0b*t, 0e*t, 09*t (GF(2^8) multiples), i.e. Td1[x] rotated right by 8 bits; t is the byte replicated in Td4[x] */\r
+    0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,\r
+    0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,\r
+    0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,\r
+    0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,\r
+    0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,\r
+    0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,\r
+    0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,\r
+    0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,\r
+    0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,\r
+    0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,\r
+    0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,\r
+    0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,\r
+    0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,\r
+    0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,\r
+    0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,\r
+    0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,\r
+    0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,\r
+    0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,\r
+    0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,\r
+    0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,\r
+\r
+    0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,\r
+    0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,\r
+    0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,\r
+    0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,\r
+    0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,\r
+    0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,\r
+    0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,\r
+    0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,\r
+    0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,\r
+    0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,\r
+    0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,\r
+    0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,\r
+    0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,\r
+    0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,\r
+    0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,\r
+    0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,\r
+    0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,\r
+    0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,\r
+    0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,\r
+    0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,\r
+    0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,\r
+    0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,\r
+    0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,\r
+    0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,\r
+    0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,\r
+    0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,\r
+    0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,\r
+    0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,\r
+    0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,\r
+    0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,\r
+    0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,\r
+    0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,\r
+    0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,\r
+    0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,\r
+    0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,\r
+    0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,\r
+    0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,\r
+    0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,\r
+    0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,\r
+    0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,\r
+    0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,\r
+    0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,\r
+    0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,\r
+    0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,\r
+};\r
+static const u32 Td3[256] = { /* Td3[x] bytes, high to low: 09*t, 0d*t, 0b*t, 0e*t (GF(2^8) multiples), i.e. Td2[x] rotated right by 8 bits; t is the byte replicated in Td4[x] */\r
+    0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,\r
+    0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,\r
+    0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,\r
+    0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,\r
+    0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,\r
+    0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,\r
+    0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,\r
+    0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,\r
+    0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,\r
+    0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,\r
+    0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,\r
+    0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,\r
+    0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,\r
+    0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,\r
+    0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,\r
+    0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,\r
+    0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,\r
+    0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,\r
+    0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,\r
+    0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,\r
+    0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,\r
+    0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,\r
+    0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,\r
+    0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,\r
+    0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,\r
+    0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,\r
+    0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,\r
+    0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,\r
+    0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,\r
+    0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,\r
+    0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,\r
+    0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,\r
+    0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,\r
+    0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,\r
+    0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,\r
+    0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,\r
+    0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,\r
+    0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,\r
+    0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,\r
+    0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,\r
+    0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,\r
+    0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,\r
+    0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,\r
+    0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,\r
+    0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,\r
+    0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,\r
+    0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,\r
+    0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,\r
+    0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,\r
+    0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,\r
+    0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,\r
+    0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,\r
+    0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,\r
+    0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,\r
+    0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,\r
+    0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,\r
+    0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,\r
+    0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,\r
+    0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,\r
+    0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,\r
+    0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,\r
+    0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,\r
+    0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,\r
+    0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,\r
+};\r
+static const u32 Td4[256] = { /* Td4[x]: one byte replicated in all four byte lanes; the values invert the Te4 substitution (e.g. Te4[0x52] = 0x00.. and Td4[0x00] = 0x52..) */\r
+    0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U,\r
+    0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U,\r
+    0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU,\r
+    0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU,\r
+    0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U,\r
+    0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U,\r
+    0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U,\r
+    0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU,\r
+    0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U,\r
+    0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU,\r
+    0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU,\r
+    0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU,\r
+    0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U,\r
+    0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U,\r
+    0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U,\r
+    0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U,\r
+    0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U,\r
+    0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U,\r
+    0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU,\r
+    0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U,\r
+    0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U,\r
+    0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU,\r
+    0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U,\r
+    0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U,\r
+    0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U,\r
+    0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU,\r
+    0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U,\r
+    0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U,\r
+    0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU,\r
+    0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U,\r
+    0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U,\r
+    0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU,\r
+    0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U,\r
+    0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU,\r
+    0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU,\r
+    0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U,\r
+    0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U,\r
+    0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U,\r
+    0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U,\r
+    0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU,\r
+    0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U,\r
+    0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U,\r
+    0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU,\r
+    0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU,\r
+    0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU,\r
+    0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U,\r
+    0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU,\r
+    0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U,\r
+    0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U,\r
+    0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U,\r
+    0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U,\r
+    0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU,\r
+    0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U,\r
+    0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU,\r
+    0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU,\r
+    0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU,\r
+    0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU,\r
+    0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U,\r
+    0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU,\r
+    0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U,\r
+    0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU,\r
+    0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U,\r
+    0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U,\r
+    0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU,\r
+};\r
+static const u32 rcon[] = { /* key-schedule round constants, XORed into the first word of each expansion step; indexed by the step counter in AES_set_encrypt_key; only the top byte of each entry is non-zero */\r
+       0x01000000, 0x02000000, 0x04000000, 0x08000000,\r
+       0x10000000, 0x20000000, 0x40000000, 0x80000000,\r
+       0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */\r
+};\r
+\r
+/**\r
+ * Expand the cipher key into the encryption key schedule.\r
+ */\r
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,\r
+                       AES_KEY *key) {\r
+\r
+       u32 *rk;\r
+       int i = 0;\r
+       u32 temp;\r
+\r
+       if (!userKey || !key)\r
+               return -1;\r
+       if (bits != 128 && bits != 192 && bits != 256)\r
+               return -2;\r
+\r
+       rk = key->rd_key;\r
+\r
+       if (bits==128)\r
+               key->rounds = 10;\r
+       else if (bits==192)\r
+               key->rounds = 12;\r
+       else\r
+               key->rounds = 14;\r
+\r
+       rk[0] = GETU32(userKey     );\r
+       rk[1] = GETU32(userKey +  4);\r
+       rk[2] = GETU32(userKey +  8);\r
+       rk[3] = GETU32(userKey + 12);\r
+       if (bits == 128) {\r
+               while (1) {\r
+                       temp  = rk[3];\r
+                       rk[4] = rk[0] ^\r
+                               (Te4[(temp >> 16) & 0xff] & 0xff000000) ^\r
+                               (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^\r
+                               (Te4[(temp      ) & 0xff] & 0x0000ff00) ^\r
+                               (Te4[(temp >> 24)       ] & 0x000000ff) ^\r
+                               rcon[i];\r
+                       rk[5] = rk[1] ^ rk[4];\r
+                       rk[6] = rk[2] ^ rk[5];\r
+                       rk[7] = rk[3] ^ rk[6];\r
+                       if (++i == 10) {\r
+                               return 0;\r
+                       }\r
+                       rk += 4;\r
+               }\r
+       }\r
+       rk[4] = GETU32(userKey + 16);\r
+       rk[5] = GETU32(userKey + 20);\r
+       if (bits == 192) {\r
+               while (1) {\r
+                       temp = rk[ 5];\r
+                       rk[ 6] = rk[ 0] ^\r
+                               (Te4[(temp >> 16) & 0xff] & 0xff000000) ^\r
+                               (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^\r
+                               (Te4[(temp      ) & 0xff] & 0x0000ff00) ^\r
+                               (Te4[(temp >> 24)       ] & 0x000000ff) ^\r
+                               rcon[i];\r
+                       rk[ 7] = rk[ 1] ^ rk[ 6];\r
+                       rk[ 8] = rk[ 2] ^ rk[ 7];\r
+                       rk[ 9] = rk[ 3] ^ rk[ 8];\r
+                       if (++i == 8) {\r
+                               return 0;\r
+                       }\r
+                       rk[10] = rk[ 4] ^ rk[ 9];\r
+                       rk[11] = rk[ 5] ^ rk[10];\r
+                       rk += 6;\r
+               }\r
+       }\r
+       rk[6] = GETU32(userKey + 24);\r
+       rk[7] = GETU32(userKey + 28);\r
+       if (bits == 256) {\r
+               while (1) {\r
+                       temp = rk[ 7];\r
+                       rk[ 8] = rk[ 0] ^\r
+                               (Te4[(temp >> 16) & 0xff] & 0xff000000) ^\r
+                               (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^\r
+                               (Te4[(temp      ) & 0xff] & 0x0000ff00) ^\r
+                               (Te4[(temp >> 24)       ] & 0x000000ff) ^\r
+                               rcon[i];\r
+                       rk[ 9] = rk[ 1] ^ rk[ 8];\r
+                       rk[10] = rk[ 2] ^ rk[ 9];\r
+                       rk[11] = rk[ 3] ^ rk[10];\r
+                       if (++i == 7) {\r
+                               return 0;\r
+                       }\r
+                       temp = rk[11];\r
+                       rk[12] = rk[ 4] ^\r
+                               (Te4[(temp >> 24)       ] & 0xff000000) ^\r
+                               (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^\r
+                               (Te4[(temp >>  8) & 0xff] & 0x0000ff00) ^\r
+                               (Te4[(temp      ) & 0xff] & 0x000000ff);\r
+                       rk[13] = rk[ 5] ^ rk[12];\r
+                       rk[14] = rk[ 6] ^ rk[13];\r
+                       rk[15] = rk[ 7] ^ rk[14];\r
+\r
+                       rk += 8;\r
+               }\r
+       }\r
+       return 0;\r
+}\r
+\r
+/**\r
+ * Build the decryption key schedule for userKey in *key.\r
+ *\r
+ * The encryption schedule is generated first; its round keys are then\r
+ * reversed in place and every round key except the first and the last is\r
+ * run through the inverse MixColumn tables.\r
+ *\r
+ * Returns 0 on success, or the negative status reported by\r
+ * AES_set_encrypt_key().\r
+ */\r
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,\r
+                        AES_KEY *key) {\r
+\r
+       u32 *words;\r
+       u32 w;\r
+       int lo, hi, round, col, rc;\r
+\r
+       /* the decryption schedule is derived from the encryption one */\r
+       rc = AES_set_encrypt_key(userKey, bits, key);\r
+       if (rc < 0)\r
+               return rc;\r
+\r
+       words = key->rd_key;\r
+\r
+       /* reverse the round keys, swapping one 4-word round key at a time */\r
+       lo = 0;\r
+       hi = 4 * key->rounds;\r
+       while (lo < hi) {\r
+               for (col = 0; col < 4; col++) {\r
+                       w = words[lo + col];\r
+                       words[lo + col] = words[hi + col];\r
+                       words[hi + col] = w;\r
+               }\r
+               lo += 4;\r
+               hi -= 4;\r
+       }\r
+\r
+       /* inverse MixColumn on round keys 1 .. rounds-1 (first and last untouched) */\r
+       for (round = 1; round < key->rounds; round++) {\r
+               u32 *rk = words + 4 * round;\r
+\r
+               for (col = 0; col < 4; col++) {\r
+                       w = rk[col];\r
+                       rk[col] =\r
+                               Td0[Te4[(w >> 24)       ] & 0xff] ^\r
+                               Td1[Te4[(w >> 16) & 0xff] & 0xff] ^\r
+                               Td2[Te4[(w >>  8) & 0xff] & 0xff] ^\r
+                               Td3[Te4[(w      ) & 0xff] & 0xff];\r
+               }\r
+       }\r
+       return 0;\r
+}\r
+\r
+#ifndef AES_ASM\r
+/*\r
+ * Encrypt a single block\r
+ * in and out can overlap\r
+ *\r
+ * in:  input block; 16 bytes are read as four 32-bit words via GETU32.\r
+ * out: output block; 16 bytes are written as four words via PUTU32.\r
+ *      All of in is loaded into local state before out is written.\r
+ * key: expanded schedule; key->rd_key supplies the round keys and\r
+ *      key->rounds the round count.\r
+ *\r
+ * All three pointers are checked with assert().\r
+ */\r
+void AES_encrypt(const unsigned char *in, unsigned char *out,\r
+                const AES_KEY *key) {\r
+\r
+       const u32 *rk;\r
+       u32 s0, s1, s2, s3, t0, t1, t2, t3;\r
+#ifndef FULL_UNROLL\r
+       int r;\r
+#endif /* ?FULL_UNROLL */\r
+\r
+       assert(in && out && key);\r
+       rk = key->rd_key;\r
+\r
+       /*\r
+        * map byte array block to cipher state\r
+        * and add initial round key:\r
+        */\r
+       s0 = GETU32(in     ) ^ rk[0];\r
+       s1 = GETU32(in +  4) ^ rk[1];\r
+       s2 = GETU32(in +  8) ^ rk[2];\r
+       s3 = GETU32(in + 12) ^ rk[3];\r
+#ifdef FULL_UNROLL\r
+       /* round 1: */\r
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];\r
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];\r
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];\r
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];\r
+       /* round 2: */\r
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];\r
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];\r
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];\r
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];\r
+       /* round 3: */\r
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];\r
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];\r
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];\r
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];\r
+       /* round 4: */\r
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];\r
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];\r
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];\r
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];\r
+       /* round 5: */\r
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];\r
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];\r
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];\r
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];\r
+       /* round 6: */\r
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];\r
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];\r
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];\r
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];\r
+       /* round 7: */\r
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];\r
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];\r
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];\r
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];\r
+       /* round 8: */\r
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];\r
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];\r
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];\r
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];\r
+       /* round 9: */\r
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];\r
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];\r
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];\r
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];\r
+    /* rounds 10-11 run only when key->rounds > 10, rounds 12-13 only when > 12 */\r
+    if (key->rounds > 10) {\r
+        /* round 10: */\r
+        s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];\r
+        s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];\r
+        s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];\r
+        s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];\r
+        /* round 11: */\r
+        t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];\r
+        t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];\r
+        t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];\r
+        t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];\r
+        if (key->rounds > 12) {\r
+            /* round 12: */\r
+            s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];\r
+            s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];\r
+            s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];\r
+            s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];\r
+            /* round 13: */\r
+            t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];\r
+            t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];\r
+            t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];\r
+            t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];\r
+        }\r
+    }\r
+    /* step rk to the last round key: 4 words per round */\r
+    rk += key->rounds << 2;\r
+#else  /* !FULL_UNROLL */\r
+    /*\r
+     * Nr - 1 full rounds:\r
+     *\r
+     * Each pass of the loop does two rounds (s -> t, then t -> s) and\r
+     * advances rk by 8 words.  The loop is left after the s -> t half,\r
+     * so the state handed to the last round below is in t0..t3 and rk\r
+     * points at the last round key.\r
+     */\r
+    r = key->rounds >> 1;\r
+    for (;;) {\r
+        t0 =\r
+            Te0[(s0 >> 24)       ] ^\r
+            Te1[(s1 >> 16) & 0xff] ^\r
+            Te2[(s2 >>  8) & 0xff] ^\r
+            Te3[(s3      ) & 0xff] ^\r
+            rk[4];\r
+        t1 =\r
+            Te0[(s1 >> 24)       ] ^\r
+            Te1[(s2 >> 16) & 0xff] ^\r
+            Te2[(s3 >>  8) & 0xff] ^\r
+            Te3[(s0      ) & 0xff] ^\r
+            rk[5];\r
+        t2 =\r
+            Te0[(s2 >> 24)       ] ^\r
+            Te1[(s3 >> 16) & 0xff] ^\r
+            Te2[(s0 >>  8) & 0xff] ^\r
+            Te3[(s1      ) & 0xff] ^\r
+            rk[6];\r
+        t3 =\r
+            Te0[(s3 >> 24)       ] ^\r
+            Te1[(s0 >> 16) & 0xff] ^\r
+            Te2[(s1 >>  8) & 0xff] ^\r
+            Te3[(s2      ) & 0xff] ^\r
+            rk[7];\r
+\r
+        rk += 8;\r
+        if (--r == 0) {\r
+            break;\r
+        }\r
+\r
+        s0 =\r
+            Te0[(t0 >> 24)       ] ^\r
+            Te1[(t1 >> 16) & 0xff] ^\r
+            Te2[(t2 >>  8) & 0xff] ^\r
+            Te3[(t3      ) & 0xff] ^\r
+            rk[0];\r
+        s1 =\r
+            Te0[(t1 >> 24)       ] ^\r
+            Te1[(t2 >> 16) & 0xff] ^\r
+            Te2[(t3 >>  8) & 0xff] ^\r
+            Te3[(t0      ) & 0xff] ^\r
+            rk[1];\r
+        s2 =\r
+            Te0[(t2 >> 24)       ] ^\r
+            Te1[(t3 >> 16) & 0xff] ^\r
+            Te2[(t0 >>  8) & 0xff] ^\r
+            Te3[(t1      ) & 0xff] ^\r
+            rk[2];\r
+        s3 =\r
+            Te0[(t3 >> 24)       ] ^\r
+            Te1[(t0 >> 16) & 0xff] ^\r
+            Te2[(t1 >>  8) & 0xff] ^\r
+            Te3[(t2      ) & 0xff] ^\r
+            rk[3];\r
+    }\r
+#endif /* ?FULL_UNROLL */\r
+    /*\r
+        * apply last round and\r
+        * map cipher state to byte array block:\r
+        */\r
+       s0 =\r
+               (Te4[(t0 >> 24)       ] & 0xff000000) ^\r
+               (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Te4[(t2 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Te4[(t3      ) & 0xff] & 0x000000ff) ^\r
+               rk[0];\r
+       PUTU32(out     , s0);\r
+       s1 =\r
+               (Te4[(t1 >> 24)       ] & 0xff000000) ^\r
+               (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Te4[(t3 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Te4[(t0      ) & 0xff] & 0x000000ff) ^\r
+               rk[1];\r
+       PUTU32(out +  4, s1);\r
+       s2 =\r
+               (Te4[(t2 >> 24)       ] & 0xff000000) ^\r
+               (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Te4[(t0 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Te4[(t1      ) & 0xff] & 0x000000ff) ^\r
+               rk[2];\r
+       PUTU32(out +  8, s2);\r
+       s3 =\r
+               (Te4[(t3 >> 24)       ] & 0xff000000) ^\r
+               (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Te4[(t1 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Te4[(t2      ) & 0xff] & 0x000000ff) ^\r
+               rk[3];\r
+       PUTU32(out + 12, s3);\r
+}\r
+\r
+/*\r
+ * Decrypt a single block\r
+ * in and out can overlap\r
+ *\r
+ * in:  input block; 16 bytes are read as four 32-bit words via GETU32.\r
+ * out: output block; 16 bytes are written as four words via PUTU32.\r
+ *      All of in is loaded into local state before out is written.\r
+ * key: expanded schedule; key->rd_key supplies the round keys and\r
+ *      key->rounds the round count.\r
+ *      NOTE(review): presumably this must be a schedule produced by\r
+ *      AES_set_decrypt_key() -- confirm against callers.\r
+ *\r
+ * All three pointers are checked with assert().\r
+ */\r
+void AES_decrypt(const unsigned char *in, unsigned char *out,\r
+                const AES_KEY *key) {\r
+\r
+       const u32 *rk;\r
+       u32 s0, s1, s2, s3, t0, t1, t2, t3;\r
+#ifndef FULL_UNROLL\r
+       int r;\r
+#endif /* ?FULL_UNROLL */\r
+\r
+       assert(in && out && key);\r
+       rk = key->rd_key;\r
+\r
+       /*\r
+        * map byte array block to cipher state\r
+        * and add initial round key:\r
+        */\r
+    s0 = GETU32(in     ) ^ rk[0];\r
+    s1 = GETU32(in +  4) ^ rk[1];\r
+    s2 = GETU32(in +  8) ^ rk[2];\r
+    s3 = GETU32(in + 12) ^ rk[3];\r
+#ifdef FULL_UNROLL\r
+    /* round 1: */\r
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];\r
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];\r
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];\r
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];\r
+    /* round 2: */\r
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];\r
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];\r
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];\r
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];\r
+    /* round 3: */\r
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];\r
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];\r
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];\r
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];\r
+    /* round 4: */\r
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];\r
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];\r
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];\r
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];\r
+    /* round 5: */\r
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];\r
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];\r
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];\r
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];\r
+    /* round 6: */\r
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];\r
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];\r
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];\r
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];\r
+    /* round 7: */\r
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];\r
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];\r
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];\r
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];\r
+    /* round 8: */\r
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];\r
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];\r
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];\r
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];\r
+    /* round 9: */\r
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];\r
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];\r
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];\r
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];\r
+    /* rounds 10-11 run only when key->rounds > 10, rounds 12-13 only when > 12 */\r
+    if (key->rounds > 10) {\r
+        /* round 10: */\r
+        s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];\r
+        s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];\r
+        s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];\r
+        s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];\r
+        /* round 11: */\r
+        t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];\r
+        t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];\r
+        t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];\r
+        t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];\r
+        if (key->rounds > 12) {\r
+            /* round 12: */\r
+            s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];\r
+            s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];\r
+            s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];\r
+            s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];\r
+            /* round 13: */\r
+            t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];\r
+            t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];\r
+            t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];\r
+            t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];\r
+        }\r
+    }\r
+       /* step rk to the last round key: 4 words per round */\r
+       rk += key->rounds << 2;\r
+#else  /* !FULL_UNROLL */\r
+    /*\r
+     * Nr - 1 full rounds:\r
+     *\r
+     * Each pass of the loop does two rounds (s -> t, then t -> s) and\r
+     * advances rk by 8 words.  The loop is left after the s -> t half,\r
+     * so the state handed to the last round below is in t0..t3 and rk\r
+     * points at the last round key.\r
+     */\r
+    r = key->rounds >> 1;\r
+    for (;;) {\r
+        t0 =\r
+            Td0[(s0 >> 24)       ] ^\r
+            Td1[(s3 >> 16) & 0xff] ^\r
+            Td2[(s2 >>  8) & 0xff] ^\r
+            Td3[(s1      ) & 0xff] ^\r
+            rk[4];\r
+        t1 =\r
+            Td0[(s1 >> 24)       ] ^\r
+            Td1[(s0 >> 16) & 0xff] ^\r
+            Td2[(s3 >>  8) & 0xff] ^\r
+            Td3[(s2      ) & 0xff] ^\r
+            rk[5];\r
+        t2 =\r
+            Td0[(s2 >> 24)       ] ^\r
+            Td1[(s1 >> 16) & 0xff] ^\r
+            Td2[(s0 >>  8) & 0xff] ^\r
+            Td3[(s3      ) & 0xff] ^\r
+            rk[6];\r
+        t3 =\r
+            Td0[(s3 >> 24)       ] ^\r
+            Td1[(s2 >> 16) & 0xff] ^\r
+            Td2[(s1 >>  8) & 0xff] ^\r
+            Td3[(s0      ) & 0xff] ^\r
+            rk[7];\r
+\r
+        rk += 8;\r
+        if (--r == 0) {\r
+            break;\r
+        }\r
+\r
+        s0 =\r
+            Td0[(t0 >> 24)       ] ^\r
+            Td1[(t3 >> 16) & 0xff] ^\r
+            Td2[(t2 >>  8) & 0xff] ^\r
+            Td3[(t1      ) & 0xff] ^\r
+            rk[0];\r
+        s1 =\r
+            Td0[(t1 >> 24)       ] ^\r
+            Td1[(t0 >> 16) & 0xff] ^\r
+            Td2[(t3 >>  8) & 0xff] ^\r
+            Td3[(t2      ) & 0xff] ^\r
+            rk[1];\r
+        s2 =\r
+            Td0[(t2 >> 24)       ] ^\r
+            Td1[(t1 >> 16) & 0xff] ^\r
+            Td2[(t0 >>  8) & 0xff] ^\r
+            Td3[(t3      ) & 0xff] ^\r
+            rk[2];\r
+        s3 =\r
+            Td0[(t3 >> 24)       ] ^\r
+            Td1[(t2 >> 16) & 0xff] ^\r
+            Td2[(t1 >>  8) & 0xff] ^\r
+            Td3[(t0      ) & 0xff] ^\r
+            rk[3];\r
+    }\r
+#endif /* ?FULL_UNROLL */\r
+    /*\r
+        * apply last round and\r
+        * map cipher state to byte array block:\r
+        */\r
+       s0 =\r
+               (Td4[(t0 >> 24)       ] & 0xff000000) ^\r
+               (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Td4[(t2 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Td4[(t1      ) & 0xff] & 0x000000ff) ^\r
+               rk[0];\r
+       PUTU32(out     , s0);\r
+       s1 =\r
+               (Td4[(t1 >> 24)       ] & 0xff000000) ^\r
+               (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Td4[(t3 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Td4[(t2      ) & 0xff] & 0x000000ff) ^\r
+               rk[1];\r
+       PUTU32(out +  4, s1);\r
+       s2 =\r
+               (Td4[(t2 >> 24)       ] & 0xff000000) ^\r
+               (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Td4[(t0 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Td4[(t3      ) & 0xff] & 0x000000ff) ^\r
+               rk[2];\r
+       PUTU32(out +  8, s2);\r
+       s3 =\r
+               (Td4[(t3 >> 24)       ] & 0xff000000) ^\r
+               (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Td4[(t1 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Td4[(t0      ) & 0xff] & 0x000000ff) ^\r
+               rk[3];\r
+       PUTU32(out + 12, s3);\r
+}\r
+\r
+#endif /* AES_ASM */\r
+\r
+/*\r
+ * CBC-mode encryption (enc != 0) or decryption (enc == 0) of length bytes\r
+ * from in to out, one AES_BLOCK_SIZE block at a time.\r
+ *\r
+ * ivec holds the chaining block: it is read as the IV for the first block\r
+ * and is overwritten with the last ciphertext block processed, so that a\r
+ * later call can continue the same stream.\r
+ *\r
+ * A trailing partial block is handled as follows:\r
+ *  - encrypt: the missing plaintext bytes are treated as zero and a full\r
+ *    AES_BLOCK_SIZE block is written to out;\r
+ *  - decrypt: a full AES_BLOCK_SIZE ciphertext block is read from in, but\r
+ *    only the remaining (length % AES_BLOCK_SIZE) bytes are written to out.\r
+ *\r
+ * All pointer arguments are checked with assert().\r
+ */\r
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,\r
+                    const unsigned long length, const AES_KEY *key,\r
+                    unsigned char *ivec, const int enc) \r
+{\r
+\r
+       unsigned long n;\r
+       unsigned long len = length;\r
+       unsigned char tmp[AES_BLOCK_SIZE];\r
+       unsigned char plain[AES_BLOCK_SIZE];\r
+\r
+       assert(in && out && key && ivec);\r
+\r
+       if (enc) {\r
+               while (len >= AES_BLOCK_SIZE) {\r
+                       for(n=0; n < AES_BLOCK_SIZE; ++n)\r
+                               tmp[n] = in[n] ^ ivec[n];\r
+                       AES_encrypt(tmp, out, key);\r
+                       /* the ciphertext just produced chains into the next block */\r
+                       memcpy(ivec, out, AES_BLOCK_SIZE);\r
+                       len -= AES_BLOCK_SIZE;\r
+                       in += AES_BLOCK_SIZE;\r
+                       out += AES_BLOCK_SIZE;\r
+               }\r
+               if (len) {\r
+                       /* partial block: XOR what is there, rest is ivec (zero plaintext) */\r
+                       for(n=0; n < len; ++n)\r
+                               tmp[n] = in[n] ^ ivec[n];\r
+                       for(n=len; n < AES_BLOCK_SIZE; ++n)\r
+                               tmp[n] = ivec[n];\r
+                       AES_encrypt(tmp, tmp, key);\r
+                       memcpy(out, tmp, AES_BLOCK_SIZE);\r
+                       memcpy(ivec, tmp, AES_BLOCK_SIZE);\r
+               }\r
+       } else {\r
+               while (len >= AES_BLOCK_SIZE) {\r
+                       /* save the ciphertext first: in and out may be the same buffer */\r
+                       memcpy(tmp, in, AES_BLOCK_SIZE);\r
+                       AES_decrypt(in, out, key);\r
+                       for(n=0; n < AES_BLOCK_SIZE; ++n)\r
+                               out[n] ^= ivec[n];\r
+                       memcpy(ivec, tmp, AES_BLOCK_SIZE);\r
+                       len -= AES_BLOCK_SIZE;\r
+                       in += AES_BLOCK_SIZE;\r
+                       out += AES_BLOCK_SIZE;\r
+               }\r
+               if (len) {\r
+                       memcpy(tmp, in, AES_BLOCK_SIZE);\r
+                       /*\r
+                        * Decrypt into a separate buffer so that tmp still holds\r
+                        * the ciphertext block: that block, not the decrypted\r
+                        * data, is the chaining value for whatever follows.\r
+                        */\r
+                       AES_decrypt(tmp, plain, key);\r
+                       for(n=0; n < len; ++n)\r
+                               out[n] = plain[n] ^ ivec[n];\r
+                       memcpy(ivec, tmp, AES_BLOCK_SIZE);\r
+               }\r
+       }\r
+}\r
diff --git a/tools/blktap2/drivers/aes.h b/tools/blktap2/drivers/aes.h
new file mode 100644 (file)
index 0000000..9fb54a9
--- /dev/null
@@ -0,0 +1,28 @@
+#ifndef QEMU_AES_H
+#define QEMU_AES_H
+
+#include <stdint.h>
+
+#define AES_MAXNR 14
+#define AES_BLOCK_SIZE 16
+
+/*
+ * Expanded AES key schedule, filled in by AES_set_encrypt_key() or
+ * AES_set_decrypt_key() and read by the block cipher routines.
+ */
+struct aes_key_st {
+    /* round-key words: 4 per round key, room for AES_MAXNR + 1 round keys */
+    uint32_t rd_key[4 *(AES_MAXNR + 1)];
+    /* number of rounds for this key; the key setup stores 10, 12 or 14 */
+    int rounds;
+};
+typedef struct aes_key_st AES_KEY;
+
+/*
+ * Expand userKey (bits wide) into the encryption schedule in *key.
+ * NOTE(review): presumably returns 0 on success and a negative value for
+ * an unsupported key size -- confirm against the definition in aes.c.
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+       AES_KEY *key);
+/*
+ * Expand userKey into the decryption schedule in *key.  Returns 0 on
+ * success, or the negative status reported by AES_set_encrypt_key().
+ */
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+       AES_KEY *key);
+
+/* Encrypt / decrypt one 16-byte block from in to out; in and out can overlap. */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+       const AES_KEY *key);
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+       const AES_KEY *key);
+/*
+ * CBC-mode processing of length bytes from in to out: encrypts when enc is
+ * non-zero, decrypts when it is zero.  ivec supplies the initial chaining
+ * block and is overwritten as blocks are processed.
+ */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                    const unsigned long length, const AES_KEY *key,
+                    unsigned char *ivec, const int enc);
+
+#endif
diff --git a/tools/blktap2/drivers/atomicio.c b/tools/blktap2/drivers/atomicio.c
new file mode 100644 (file)
index 0000000..ae0e24b
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved.
+ * Copyright (c) 1995,1999 Theo de Raadt.  All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include "atomicio.h"
+
+/*
+ * Transfer exactly n bytes between fd and the buffer _s by calling f
+ * (f == read || f == vwrite) until everything has gone through.
+ *
+ * A call that fails with EINTR or EAGAIN is retried immediately.
+ *
+ * Returns the number of bytes transferred.  The result is short of n only
+ * when f reports end-of-file, in which case errno is set to EPIPE.  On
+ * any other error 0 is returned, regardless of how much had already been
+ * transferred, and errno is left as set by f.
+ */
+size_t
+atomicio(ssize_t (*f) (int, void *, size_t), int fd, void *_s, size_t n)
+{
+       char *s = _s;
+       size_t pos = 0;
+       ssize_t res;
+
+       while (n > pos) {
+               res = (f) (fd, s + pos, n - pos);
+               switch (res) {
+               case -1:
+                       if (errno == EINTR || errno == EAGAIN)
+                               continue;
+                       return 0;
+               case 0:
+                       /* peer closed / end of file before n bytes arrived */
+                       errno = EPIPE;
+                       return pos;
+               default:
+                       pos += (size_t)res;
+                       break;
+               }
+       }
+       return (pos);
+}
+
diff --git a/tools/blktap2/drivers/blk.h b/tools/blktap2/drivers/blk.h
new file mode 100644 (file)
index 0000000..394ae49
--- /dev/null
@@ -0,0 +1,36 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <inttypes.h>
+
+/*
+ * Block-device queries; each supported OS provides its own implementation
+ * (see blk_linux.c and blk_netbsd.c).
+ *
+ * blk_getimagesize():  stores the device size in *size.  Returns 0 on
+ *                      success and -EINVAL when the underlying ioctl fails.
+ * blk_getsectorsize(): stores the sector size in *sector_size, falling back
+ *                      to a platform default when it cannot be queried.
+ *                      Always returns 0.
+ */
+int blk_getimagesize(int fd, uint64_t *size);
+int blk_getsectorsize(int fd, uint64_t *sector_size);
+
+#ifndef O_LARGEFILE
+#define O_LARGEFILE    0
+#endif
diff --git a/tools/blktap2/drivers/blk_linux.c b/tools/blktap2/drivers/blk_linux.c
new file mode 100644 (file)
index 0000000..85763ac
--- /dev/null
@@ -0,0 +1,43 @@
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include "tapdisk.h"
+#include "blk.h"
+
+/*
+ * Query the size of the block device open on fd with BLKGETSIZE and store
+ * it in *size.  The value is whatever BLKGETSIZE reports (the device size
+ * in 512-byte units according to <linux/fs.h>).
+ *
+ * Returns 0 on success; on ioctl failure *size is left at 0 and -EINVAL is
+ * returned.
+ */
+int blk_getimagesize(int fd, uint64_t *size)
+{
+       /*
+        * BLKGETSIZE stores an unsigned long.  Receive it in a variable of
+        * exactly that type instead of pointing the kernel at the caller's
+        * uint64_t, whose size and layout differ on 32-bit / big-endian hosts.
+        */
+       unsigned long sectors = 0;
+       int rc;
+
+       *size = 0;
+       rc = ioctl(fd, BLKGETSIZE, &sectors);
+       if (rc) {
+               DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+               return -EINVAL;
+       }
+
+       *size = sectors;
+       return 0;
+}
+
+/*
+ * Store the sector size of the block device open on fd in *sector_size.
+ *
+ * When BLKSSZGET is available the device is queried; if the query fails
+ * (or BLKSSZGET is not defined at build time) DEFAULT_SECTOR_SIZE is used.
+ * Always returns 0.
+ */
+int blk_getsectorsize(int fd, uint64_t *sector_size)
+{
+#if defined(BLKSSZGET)
+       /*
+        * BLKSSZGET stores an int.  Receive it in a variable of exactly that
+        * type instead of pointing the kernel at the caller's uint64_t, which
+        * only happens to work on little-endian hosts.
+        */
+       int ssz = 0;
+       int rc;
+
+       *sector_size = DEFAULT_SECTOR_SIZE;
+       rc = ioctl(fd, BLKSSZGET, &ssz);
+       if (rc) {
+               DPRINTF("ERR: BLKSSZGET failed. Falling back to use default sector size");
+               *sector_size = DEFAULT_SECTOR_SIZE;
+       } else if (ssz > 0) {
+               *sector_size = (uint64_t)ssz;
+       }
+
+       if (*sector_size != DEFAULT_SECTOR_SIZE)
+               DPRINTF("Note: sector size is %"PRIu64" (not %u)\n",
+                       *sector_size, DEFAULT_SECTOR_SIZE);
+#else
+       *sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+       return 0;
+}
+
diff --git a/tools/blktap2/drivers/blk_netbsd.c b/tools/blktap2/drivers/blk_netbsd.c
new file mode 100644 (file)
index 0000000..f394fdf
--- /dev/null
@@ -0,0 +1,41 @@
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/disklabel.h>
+#include <errno.h>
+#include <inttypes.h>
+#include "tapdisk.h"
+#include "blk.h"
+
+/*
+ * Query the size of the disk open on @fd (NetBSD) via its disklabel.
+ *
+ * The previous computation, d_secsize * d_secpercyl, yields the number
+ * of bytes in one cylinder rather than the size of the device, and was
+ * evaluated in 32 bits.  The size is now derived from d_secperunit (total
+ * sectors in the unit) using 64-bit arithmetic and expressed in
+ * DEV_BSIZE-sized sectors.
+ *
+ * NOTE(review): this assumes callers expect a count of 512-byte sectors,
+ * as the Linux backend's BLKGETSIZE produces, and that @fd names the
+ * whole unit; if @fd is a partition, the matching d_partitions[] entry
+ * may be the right source instead -- confirm on a NetBSD host.
+ *
+ * Returns 0 on success; on ioctl failure *size is 0 and -EINVAL is
+ * returned.
+ */
+int blk_getimagesize(int fd, uint64_t *size)
+{
+       int rc;
+       struct disklabel dl;
+
+       *size = 0;
+       rc = ioctl(fd, DIOCGDINFO, &dl);
+       if (rc) {
+               DPRINTF("ERR: DIOCGDINFO failed, couldn't stat image");
+               return -EINVAL;
+       }
+
+       *size = (uint64_t)dl.d_secperunit * dl.d_secsize / DEV_BSIZE;
+
+       return 0;
+}
+
+/*
+ * Report the sector size recorded in the disklabel of @fd (NetBSD).
+ * If the label cannot be read, DEV_BSIZE is reported instead.
+ * Always returns 0.
+ */
+int blk_getsectorsize(int fd, uint64_t *sector_size)
+{
+       struct disklabel label;
+
+       if (ioctl(fd, DIOCGDINFO, &label) != 0) {
+               DPRINTF("ERR: DIOCGDINFO failed, couldn't stat image");
+               *sector_size = DEV_BSIZE; /* fallback to DEV_BSIZE */
+               return 0;
+       }
+
+       *sector_size = label.d_secsize;
+       return 0;
+}
+
diff --git a/tools/blktap2/drivers/block-aio.c b/tools/blktap2/drivers/block-aio.c
new file mode 100644 (file)
index 0000000..f398da2
--- /dev/null
@@ -0,0 +1,258 @@
+/* 
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+#include "blk.h"
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/* Size of the per-image pool of aio_request slots. */
+#define MAX_AIO_REQS         TAPDISK_DATA_REQUESTS
+
+struct tdaio_state;
+
+/* One in-flight read or write issued by this driver. */
+struct aio_request {
+       /* copy of the request being served; completed in tdaio_complete() */
+       td_request_t         treq;
+       /* I/O control block prepared by td_prep_read()/td_prep_write() */
+       struct tiocb         tiocb;
+       /* back-pointer to the owning driver state, used to recycle the slot */
+       struct tdaio_state  *state;
+};
+
+/* Private per-image state of the aio driver (driver->data). */
+struct tdaio_state {
+       /* descriptor of the opened image; set by tdaio_open() */
+       int                  fd;
+       /* not written by any function visible in this file */
+       td_driver_t         *driver;
+
+       /* number of valid entries in aio_free_list (used as a stack) */
+       int                  aio_free_count;    
+       /* backing storage for all request slots */
+       struct aio_request   aio_requests[MAX_AIO_REQS];
+       /* stack of currently unused slots */
+       struct aio_request  *aio_free_list[MAX_AIO_REQS];
+};
+
+/*
+ * Fill @info with the size (in sectors) and sector size of the image
+ * open on @fd.
+ *
+ * Block devices are queried through blk_getimagesize() and
+ * blk_getsectorsize(); anything else is sized from fstat() and assumed
+ * to use DEFAULT_SECTOR_SIZE.  A resulting size of 0 is replaced by a
+ * hard-coded placeholder sector count.
+ *
+ * Returns 0 on success, -EINVAL if the image cannot be stat'ed or the
+ * block-device size query fails.
+ */
+static int tdaio_get_image_info(int fd, td_disk_info_t *info)
+{
+       struct stat st;
+
+       if (fstat(fd, &st) != 0) {
+               DPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (S_ISBLK(st.st_mode)) {
+               /*Accessing block device directly*/
+               info->size = 0;
+               if (blk_getimagesize(fd, &info->size) != 0)
+                       return -EINVAL;
+
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+
+               /*Get the sector size*/
+               if (blk_getsectorsize(fd, &info->sector_size) != 0)
+                       info->sector_size = DEFAULT_SECTOR_SIZE;
+
+       } else {
+               /*Local file? try fstat instead*/
+               info->size = (st.st_size >> SECTOR_SHIFT);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+       }
+
+       /* Empty image: substitute the historical placeholder sector count. */
+       if (info->size == 0) {
+               info->size =((uint64_t) 16836057);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+       }
+       info->info = 0;
+
+       return 0;
+}
+
+/*
+ * Open the disk file and initialize aio state.
+ *
+ * The image is opened with O_DIRECT first; if that open fails with
+ * EINVAL it is retried without O_DIRECT.  On success the descriptor is
+ * stored in the driver's private state and driver->info is filled in.
+ *
+ * Returns 0 on success, -errno from open(), or the error from
+ * tdaio_get_image_info().  On failure the stored descriptor is -1.
+ */
+int tdaio_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       int i, fd, ret, o_flags;
+       struct tdaio_state *prv;
+
+       prv = (struct tdaio_state *)driver->data;
+
+       DPRINTF("block-aio open('%s')", name);
+
+       memset(prv, 0, sizeof(struct tdaio_state));
+       /* never leave descriptor 0 behind as a bogus "open" fd */
+       prv->fd = -1;
+
+       /* every request slot starts out on the free list */
+       prv->aio_free_count = MAX_AIO_REQS;
+       for (i = 0; i < MAX_AIO_REQS; i++)
+               prv->aio_free_list[i] = &prv->aio_requests[i];
+
+       /* Open the file */
+       o_flags = O_DIRECT | O_LARGEFILE |
+               ((flags & TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+       fd = open(name, o_flags);
+
+       if (fd == -1 && errno == EINVAL) {
+               /* Maybe O_DIRECT isn't supported. */
+               o_flags &= ~O_DIRECT;
+               fd = open(name, o_flags);
+               if (fd != -1)
+                       DPRINTF("WARNING: Accessing image without "
+                               "O_DIRECT! (%s)\n", name);
+       } else if (fd != -1) {
+               DPRINTF("open(%s) with O_DIRECT\n", name);
+       }
+
+       if (fd == -1) {
+               /* capture errno first: the logging call may clobber it */
+               ret = -errno;
+               DPRINTF("Unable to open [%s] (%d)!\n", name, ret);
+               return ret;
+       }
+
+       ret = tdaio_get_image_info(fd, &driver->info);
+       if (ret) {
+               close(fd);
+               return ret;
+       }
+
+       prv->fd = fd;
+       return 0;
+}
+
+/*
+ * I/O completion callback: finish the originating request with @err and
+ * push the request slot back onto its owner's free stack.
+ */
+void tdaio_complete(void *arg, struct tiocb *tiocb, int err)
+{
+       struct aio_request *req   = (struct aio_request *)arg;
+       struct tdaio_state *state = req->state;
+
+       td_complete_request(req->treq, err);
+
+       state->aio_free_list[state->aio_free_count] = req;
+       state->aio_free_count++;
+}
+
+/*
+ * Queue an asynchronous read for @treq.  If no request slot is free the
+ * request is completed immediately with -EBUSY.
+ */
+void tdaio_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       struct tdaio_state *state = (struct tdaio_state *)driver->data;
+       struct aio_request *req;
+       uint64_t byte_off;
+       int nbytes;
+
+       /* translate sector units into a byte length and byte offset */
+       nbytes   = treq.secs * driver->info.sector_size;
+       byte_off = treq.sec  * (uint64_t)driver->info.sector_size;
+
+       if (state->aio_free_count == 0) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /* pop a slot off the free stack and bind it to this request */
+       state->aio_free_count--;
+       req        = state->aio_free_list[state->aio_free_count];
+       req->treq  = treq;
+       req->state = state;
+
+       td_prep_read(&req->tiocb, state->fd, treq.buf,
+                    nbytes, byte_off, tdaio_complete, req);
+       td_queue_tiocb(driver, &req->tiocb);
+}
+
+/*
+ * Queue an asynchronous write for @treq.  If no request slot is free the
+ * request is completed immediately with -EBUSY.
+ */
+void tdaio_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       struct tdaio_state *state = (struct tdaio_state *)driver->data;
+       struct aio_request *req;
+       uint64_t byte_off;
+       int nbytes;
+
+       /* translate sector units into a byte length and byte offset */
+       nbytes   = treq.secs * driver->info.sector_size;
+       byte_off = treq.sec  * (uint64_t)driver->info.sector_size;
+
+       if (state->aio_free_count == 0) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /* pop a slot off the free stack and bind it to this request */
+       state->aio_free_count--;
+       req        = state->aio_free_list[state->aio_free_count];
+       req->treq  = treq;
+       req->state = state;
+
+       td_prep_write(&req->tiocb, state->fd, treq.buf,
+                     nbytes, byte_off, tdaio_complete, req);
+       td_queue_tiocb(driver, &req->tiocb);
+}
+
+/* Close the image descriptor held in the driver state; always returns 0. */
+int tdaio_close(td_driver_t *driver)
+{
+       struct tdaio_state *state;
+
+       state = (struct tdaio_state *)driver->data;
+       close(state->fd);
+
+       return 0;
+}
+
+/* This driver never reports a parent: always returns TD_NO_PARENT. */
+int tdaio_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       (void)driver;
+       (void)id;
+
+       return TD_NO_PARENT;
+}
+
+/* No parent is ever acceptable for this driver: always returns -EINVAL. */
+int tdaio_validate_parent(td_driver_t *driver,
+                         td_driver_t *pdriver, td_flag_t flags)
+{
+       (void)driver;
+       (void)pdriver;
+       (void)flags;
+
+       return -EINVAL;
+}
+
+/* Driver descriptor that exposes the aio entry points defined above. */
+struct tap_disk tapdisk_aio = {
+       /* identity and private state size */
+       .disk_type          = "tapdisk_aio",
+       .private_data_size  = sizeof(struct tdaio_state),
+       .flags              = 0,
+
+       /* lifecycle */
+       .td_open            = tdaio_open,
+       .td_close           = tdaio_close,
+
+       /* data path */
+       .td_queue_read      = tdaio_queue_read,
+       .td_queue_write     = tdaio_queue_write,
+
+       /* parent handling */
+       .td_get_parent_id   = tdaio_get_parent_id,
+       .td_validate_parent = tdaio_validate_parent,
+
+       /* no debug hook */
+       .td_debug           = NULL,
+};
diff --git a/tools/blktap2/drivers/block-cache.c b/tools/blktap2/drivers/block-cache.c
new file mode 100644 (file)
index 0000000..1d2f4eb
--- /dev/null
@@ -0,0 +1,787 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#include "tapdisk.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+
+/*
+ * DBG() logs at TLOG_DBG level only when DEBUG is defined; otherwise it
+ * expands to a no-op and its arguments are not evaluated.
+ */
+#ifdef DEBUG
+#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
+#else
+#define DBG(_f, _a...) ((void)0)
+#endif
+
+/* WARN() always logs, at TLOG_WARN level. */
+#define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a)
+
+#define RADIX_TREE_PAGE_SHIFT           12 /* 4K pages */
+#define RADIX_TREE_PAGE_SIZE            (1 << RADIX_TREE_PAGE_SHIFT)
+
+#define RADIX_TREE_NODE_SHIFT           9 /* 512B nodes */
+#define RADIX_TREE_NODE_SIZE            (1 << RADIX_TREE_NODE_SHIFT)
+#define RADIX_TREE_NODE_MASK            (RADIX_TREE_NODE_SIZE - 1)
+
+/* number of 512-byte sectors that fit in one 4K page (8) */
+#define BLOCK_CACHE_NODES_PER_PAGE      (1 << (RADIX_TREE_PAGE_SHIFT - RADIX_TREE_NODE_SHIFT))
+
+/*
+ * NOTE(review): the value below is 10 << 20 bytes, i.e. 10MB; this
+ * comment used to say 100MB.  Confirm which limit was intended.
+ */
+#define BLOCK_CACHE_MAX_SIZE            (10 << 20) /* 10MB cache */
+#define BLOCK_CACHE_REQUESTS            (TAPDISK_DATA_REQUESTS << 3)
+/* seconds a link may go untouched before the pruner drops it */
+#define BLOCK_CACHE_PAGE_IDLETIME       60
+
+/* Short type names for the radix tree that indexes cached sectors. */
+typedef struct radix_tree               radix_tree_t;
+typedef struct radix_tree_node          radix_tree_node_t;
+typedef struct radix_tree_link          radix_tree_link_t;
+typedef struct radix_tree_leaf          radix_tree_leaf_t;
+typedef struct radix_tree_page          radix_tree_page_t;
+
+/* Short type names for the cache driver's own state. */
+typedef struct block_cache              block_cache_t;
+typedef struct block_cache_request      block_cache_request_t;
+typedef struct block_cache_stats        block_cache_stats_t;
+
+/* A cached data buffer shared by the leaf links of the sectors it holds. */
+struct radix_tree_page {
+       /* cached data; released by radix_tree_free_page() */
+       char                           *buf;
+       /* length of buf in bytes; accounted in radix_tree.size */
+       size_t                          size;
+       /* first sector stored in buf */
+       uint64_t                        sec;
+       /* leaf links that point into buf; cleared when the page is removed */
+       radix_tree_link_t              *owners[BLOCK_CACHE_NODES_PER_PAGE];
+};
+
+/* Payload of a link in a height-0 node: one cached sector. */
+struct radix_tree_leaf {
+       /* page that owns the data */
+       radix_tree_page_t              *page;
+       /* this sector's data: page->buf plus the sector's offset */
+       char                           *buf;
+};
+
+/* One slot of a node: either a child pointer or a cached sector. */
+struct radix_tree_link {
+       /* tv_sec of the last lookup or insert that walked through this slot */
+       uint32_t                        time;
+       union {
+               /* child node, used when the containing node's height is > 0 */
+               radix_tree_node_t      *next;
+               /* cached sector, used when the containing node's height is 0 */
+               radix_tree_leaf_t       leaf;
+       } u;
+};
+
+/* A tree node with RADIX_TREE_NODE_SIZE slots. */
+struct radix_tree_node {
+       /* distance from the leaf level; 0 means the links hold leaves */
+       int                             height;
+       /* slots, indexed by radix_tree_index() */
+       radix_tree_link_t               links[RADIX_TREE_NODE_SIZE];
+};
+
+/* Sector-indexed radix tree plus its memory accounting. */
+struct radix_tree {
+       /* height of the root node */
+       int                             height;
+       /* total bytes of page data currently held */
+       uint64_t                        size;
+       /* number of allocated nodes */
+       uint32_t                        nodes;
+       /* root node; NULL after radix_tree_destroy() */
+       radix_tree_node_t              *root;
+
+       /* owning cache, used for its name and statistics */
+       block_cache_t                  *cache;
+};
+
+/* Book-keeping for one read that missed the cache and was forwarded. */
+struct block_cache_request {
+       /* first non-zero error reported by a completion, else 0 */
+       int                             err;
+       /* buffer the forwarded read fills; later handed to the tree */
+       char                           *buf;
+       /* sectors still outstanding; the request finishes when this hits 0 */
+       uint64_t                        secs;
+       /* the original request, completed once all sectors arrived */
+       td_request_t                    treq;
+       /* owning cache */
+       block_cache_t                  *cache;
+};
+
+/* Counters, all in units of sectors. */
+struct block_cache_stats {
+       /* sectors requested through block_cache_queue_read() */
+       uint64_t                        reads;
+       /* sectors served from the cache */
+       uint64_t                        hits;
+       /* sectors forwarded because of a miss */
+       uint64_t                        misses;
+       /* sectors dropped when their page was freed */
+       uint64_t                        prunes;
+};
+
+/* Private per-image state of the block-cache driver (driver->data). */
+struct block_cache {
+       /* not referenced by the functions visible in this file */
+       int                             ptype;
+       /* copy of the image name made by tapdisk_namedup() */
+       char                           *name;
+
+       /* image size taken from driver->info.size */
+       uint64_t                        sectors;
+
+       /* pool of miss-tracking requests and the stack of free ones */
+       block_cache_request_t           requests[BLOCK_CACHE_REQUESTS];
+       block_cache_request_t          *request_free_list[BLOCK_CACHE_REQUESTS];
+       int                             requests_free;
+
+       /* id of the periodic prune event registered in block_cache_open() */
+       event_id_t                      timeout_id;
+
+       /* index of cached sectors */
+       radix_tree_t                    tree;
+
+       /* hit/miss/prune counters */
+       block_cache_stats_t             stats;
+};
+
+/*
+ * Number of sectors addressable by a tree whose root has the given
+ * height: RADIX_TREE_NODE_SIZE shifted left by height node-widths.
+ */
+static inline uint64_t
+radix_tree_calculate_size(int height)
+{
+       const int shift = height * RADIX_TREE_NODE_SHIFT;
+
+       return ((uint64_t)RADIX_TREE_NODE_SIZE) << shift;
+}
+
+/*
+ * Smallest root height (at least 1) whose tree can address @sectors
+ * sectors.
+ */
+static inline int
+radix_tree_calculate_height(uint64_t sectors)
+{
+       int h;
+
+       /* start at 1: a root node is always allocated */
+       for (h = 1; sectors > radix_tree_calculate_size(h); h++)
+               ;
+
+       return h;
+}
+
+/* Slot within @node that @sector maps to, based on the node's height. */
+static inline int
+radix_tree_index(radix_tree_node_t *node, uint64_t sector)
+{
+       const int shift = node->height * RADIX_TREE_NODE_SHIFT;
+
+       return (sector >> shift) & RADIX_TREE_NODE_MASK;
+}
+
+/* Non-zero when @node sits at the bottom level (height 0). */
+static inline int
+radix_tree_node_contains_leaves(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       return !node->height;
+}
+
+/* Non-zero when @node is at the tree's root height. */
+static inline int
+radix_tree_node_is_root(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       return (tree->height == node->height);
+}
+
+/* Bytes held by the tree: cached page data plus the node structures. */
+static inline uint64_t
+radix_tree_size(radix_tree_t *tree)
+{
+       const uint64_t node_bytes = tree->nodes * sizeof(radix_tree_node_t);
+
+       return tree->size + node_bytes;
+}
+
+/* Zero a link (timestamp and payload); a NULL link is ignored. */
+static inline void
+radix_tree_clear_link(radix_tree_link_t *link)
+{
+       if (!link)
+               return;
+
+       memset(link, 0, sizeof(*link));
+}
+
+/*
+ * Allocate a zeroed node of the given height and count it in
+ * tree->nodes.  Returns NULL if the allocation fails.
+ */
+static inline radix_tree_node_t *
+radix_tree_allocate_node(radix_tree_t *tree, int height)
+{
+       radix_tree_node_t *fresh = calloc(1, sizeof(*fresh));
+
+       if (fresh) {
+               fresh->height = height;
+               tree->nodes++;
+       }
+
+       return fresh;
+}
+
+/* Allocate a node one level below @parent; NULL on allocation failure. */
+static inline radix_tree_node_t *
+radix_tree_allocate_child_node(radix_tree_t *tree, radix_tree_node_t *parent)
+{
+       const int child_height = parent->height - 1;
+
+       return radix_tree_allocate_node(tree, child_height);
+}
+
+/* Release @node and drop it from the node count; NULL is ignored. */
+void
+radix_tree_free_node(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       if (node) {
+               free(node);
+               tree->nodes--;
+       }
+}
+
+/*
+ * Wrap @buf (@size bytes, starting at sector @sec) in a new page record
+ * and add its size to the tree's byte count.  Returns NULL if the record
+ * cannot be allocated; @buf is not touched in that case.
+ */
+static inline radix_tree_page_t *
+radix_tree_allocate_page(radix_tree_t *tree,
+                        char *buf, uint64_t sec, size_t size)
+{
+       radix_tree_page_t *fresh = calloc(1, sizeof(*fresh));
+
+       if (fresh) {
+               fresh->buf  = buf;
+               fresh->sec  = sec;
+               fresh->size = size;
+               tree->size += size;
+       }
+
+       return fresh;
+}
+
+/*
+ * Free a page and its data buffer, updating the tree's byte count and
+ * the cache's prune counter.  Links pointing at the page are not
+ * touched here.
+ */
+static inline void
+radix_tree_free_page(radix_tree_t *tree, radix_tree_page_t *page)
+{
+       int i;
+       const size_t nsecs = page->size >> RADIX_TREE_NODE_SHIFT;
+
+       for (i = 0; i < nsecs; i++)
+               DBG("%s: ejecting sector 0x%llx\n",
+                   tree->cache->name, page->sec + i);
+
+       tree->cache->stats.prunes += nsecs;
+       tree->size -= page->size;
+
+       free(page->buf);
+       free(page);
+}
+
+/*
+ * Drop a shared page: every leaf link registered as an owner is zeroed
+ * and the page with its buffer is freed.  Only the leaves go away; the
+ * nodes that contained them stay and are reaped by the pruner later.
+ * A NULL page is ignored.
+ */
+static void
+radix_tree_remove_page(radix_tree_t *tree, radix_tree_page_t *page)
+{
+       int slot;
+
+       if (page) {
+               for (slot = 0; slot < BLOCK_CACHE_NODES_PER_PAGE; slot++)
+                       radix_tree_clear_link(page->owners[slot]);
+
+               radix_tree_free_page(tree, page);
+       }
+}
+
+/*
+ * Point @link at the sector stored @off bytes into @page and register
+ * the link in the page's first unused owner slot.  Nothing happens if
+ * the sector would extend past the end of the page or if all owner
+ * slots are taken.
+ */
+static void
+radix_tree_insert_leaf(radix_tree_t *tree, radix_tree_link_t *link,
+                      radix_tree_page_t *page, off_t off)
+{
+       int slot;
+
+       if (off + RADIX_TREE_NODE_SIZE > page->size)
+               return;
+
+       /* locate the first unused owner slot */
+       slot = 0;
+       while (slot < BLOCK_CACHE_NODES_PER_PAGE && page->owners[slot])
+               slot++;
+
+       if (slot == BLOCK_CACHE_NODES_PER_PAGE)
+               return;
+
+       page->owners[slot] = link;
+       link->u.leaf.page  = page;
+       link->u.leaf.buf   = page->buf + off;
+}
+
+/*
+ * Look up the cached data for @sector.  Every link visited on the way
+ * down gets its timestamp refreshed, including on a miss.  Returns the
+ * sector's buffer, or NULL if the sector is not cached.
+ */
+static char *
+radix_tree_find_leaf(radix_tree_t *tree, uint64_t sector)
+{
+       struct timeval tv;
+       radix_tree_link_t *slot;
+       radix_tree_node_t *cur = tree->root;
+
+       gettimeofday(&tv, NULL);
+
+       for (;;) {
+               slot       = &cur->links[radix_tree_index(cur, sector)];
+               slot->time = tv.tv_sec;
+
+               if (radix_tree_node_contains_leaves(tree, cur))
+                       return slot->u.leaf.buf;
+
+               cur = slot->u.next;
+               if (!cur)
+                       return NULL;
+       }
+}
+
+/*
+ * Insert one sector of @page (at byte offset @off) under @sector,
+ * allocating interior nodes on the way down and refreshing the
+ * timestamps of all visited links.  A page already occupying the leaf
+ * slot is removed first.
+ *
+ * Returns the leaf's buffer pointer after insertion; this is NULL when a
+ * node allocation fails or the leaf could not be attached to the page.
+ */
+static char *
+radix_tree_add_leaf(radix_tree_t *tree, uint64_t sector,
+                   radix_tree_page_t *page, off_t off)
+{
+       struct timeval tv;
+       radix_tree_link_t *slot;
+       radix_tree_node_t *cur = tree->root;
+
+       gettimeofday(&tv, NULL);
+
+       for (;;) {
+               slot       = &cur->links[radix_tree_index(cur, sector)];
+               slot->time = tv.tv_sec;
+
+               if (radix_tree_node_contains_leaves(tree, cur))
+                       break;
+
+               if (!slot->u.next) {
+                       slot->u.next =
+                               radix_tree_allocate_child_node(tree, cur);
+                       if (!slot->u.next)
+                               return NULL;
+               }
+
+               cur = slot->u.next;
+       }
+
+       /* bottom level reached: replace whatever was cached here */
+       radix_tree_remove_page(tree, slot->u.leaf.page);
+       radix_tree_insert_leaf(tree, slot, page, off);
+
+       return slot->u.leaf.buf;
+}
+
+/*
+ * Cache @sectors consecutive sectors starting at @sector, all backed by
+ * @buf.  On success (0) the tree owns @buf.  On failure (-ENOMEM) the
+ * page record is dropped again without freeing @buf, so the caller
+ * still owns the buffer.
+ */
+static int
+radix_tree_add_leaves(radix_tree_t *tree, char *buf,
+                     uint64_t sector, uint64_t sectors)
+{
+       int i;
+       radix_tree_page_t *page;
+
+       page = radix_tree_allocate_page(tree, buf, sector,
+                                       sectors << RADIX_TREE_NODE_SHIFT);
+       if (page == NULL)
+               return -ENOMEM;
+
+       for (i = 0; i < sectors; i++) {
+               if (radix_tree_add_leaf(tree, sector + i,
+                                       page, (i << RADIX_TREE_NODE_SHIFT)))
+                       continue;
+
+               /* detach buf so removing the page does not free it */
+               page->buf = NULL;
+               radix_tree_remove_page(tree, page);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+/*
+ * Recursively free @node and everything beneath it: pages at the leaf
+ * level, child branches otherwise.  A NULL node is ignored.
+ */
+static void
+radix_tree_delete_branch(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       int i, leaves;
+       radix_tree_link_t *slot;
+
+       if (node == NULL)
+               return;
+
+       /* the node's level does not change while its slots are emptied */
+       leaves = radix_tree_node_contains_leaves(tree, node);
+
+       for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) {
+               slot = &node->links[i];
+
+               if (leaves)
+                       radix_tree_remove_page(tree, slot->u.leaf.page);
+               else
+                       radix_tree_delete_branch(tree, slot->u.next);
+
+               radix_tree_clear_link(slot);
+       }
+
+       radix_tree_free_node(tree, node);
+}
+
+/* Free the whole tree, root included, and mark it as having no root. */
+static inline void
+radix_tree_destroy(radix_tree_t *tree)
+{
+       radix_tree_node_t *root = tree->root;
+
+       radix_tree_delete_branch(tree, root);
+       tree->root = NULL;
+}
+
+/*
+ * Prune idle entries beneath @node.  @now is the current time in
+ * seconds; a link counts as idle once at least
+ * BLOCK_CACHE_PAGE_IDLETIME seconds have passed since it was last
+ * touched.
+ *
+ * returns 1 if @node is empty after pruning, 0 otherwise
+ *
+ * A NULL @node counts as empty.  An empty node other than the root is
+ * freed before returning, so a caller that gets 1 back must drop its
+ * pointer to the node (the recursive call below clears the link).
+ */
+static int
+radix_tree_prune_branch(radix_tree_t *tree,
+                       radix_tree_node_t *node, uint32_t now)
+{
+       int i, empty;
+       radix_tree_link_t *link;
+
+       empty = 1;
+       if (!node)
+               return empty;
+
+       for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) {
+               link = node->links + i;
+
+               /* recently used link: keep it, but look for idle entries below */
+               if (now - link->time < BLOCK_CACHE_PAGE_IDLETIME) {
+                       if (radix_tree_node_contains_leaves(tree, node)) {
+                               empty = 0;
+                               continue;
+                       }
+
+                       /* the child frees itself when it ends up empty */
+                       if (radix_tree_prune_branch(tree, link->u.next, now))
+                               radix_tree_clear_link(link);
+                       else
+                               empty = 0;
+
+                       continue;
+               }
+
+               /* idle link: drop the cached page or the entire subtree */
+               if (radix_tree_node_contains_leaves(tree, node))
+                       radix_tree_remove_page(tree, link->u.leaf.page);
+               else
+                       radix_tree_delete_branch(tree, link->u.next);
+
+               radix_tree_clear_link(link);
+       }
+
+       /* the root is kept even when empty */
+       if (empty && !radix_tree_node_is_root(tree, node))
+               radix_tree_free_node(tree, node);
+
+       return empty;
+}
+
+/*
+ * Walk the tree from the root and free everything that has been idle
+ * for too long, logging the cached byte count before and after.  Does
+ * nothing if the tree has no root.
+ */
+static void
+radix_tree_prune(radix_tree_t *tree)
+{
+       struct timeval tv;
+
+       if (tree->root) {
+               DPRINTF("tree %s has %"PRIu64" bytes\n",
+                       tree->cache->name, tree->size);
+
+               gettimeofday(&tv, NULL);
+               radix_tree_prune_branch(tree, tree->root, tv.tv_sec);
+
+               DPRINTF("tree %s now has %"PRIu64" bytes\n",
+                       tree->cache->name, tree->size);
+       }
+}
+
+/*
+ * Size the tree for @sectors sectors and allocate its root node.
+ * Returns 0 on success, -ENOMEM if the root cannot be allocated.
+ */
+static inline int
+radix_tree_initialize(radix_tree_t *tree, uint64_t sectors)
+{
+       const int h = radix_tree_calculate_height(sectors);
+
+       tree->height = h;
+       tree->root   = radix_tree_allocate_node(tree, h);
+
+       return tree->root ? 0 : -ENOMEM;
+}
+
+/* Release every node and page of the tree; tree->root is NULL afterwards. */
+static inline void
+radix_tree_free(radix_tree_t *tree)
+{
+       radix_tree_delete_branch(tree, tree->root);
+       tree->root = NULL;
+}
+
+/* Timer callback: @private is the block_cache_t whose tree gets pruned. */
+static void
+block_cache_prune_event(event_id_t id, char mode, void *private)
+{
+       block_cache_t *cache = (block_cache_t *)private;
+
+       radix_tree_prune(&cache->tree);
+}
+
+/* Pop a request from the free stack; NULL when none is available. */
+static inline block_cache_request_t *
+block_cache_get_request(block_cache_t *cache)
+{
+       if (cache->requests_free == 0)
+               return NULL;
+
+       cache->requests_free--;
+       return cache->request_free_list[cache->requests_free];
+}
+
+/* Zero @breq and push it back onto the free stack. */
+static inline void
+block_cache_put_request(block_cache_t *cache, block_cache_request_t *breq)
+{
+       memset(breq, 0, sizeof(*breq));
+
+       cache->request_free_list[cache->requests_free] = breq;
+       cache->requests_free++;
+}
+
+/*
+ * Open a read-only block cache on top of another image.
+ *
+ * Requires TD_OPEN_RDONLY and a sector size equal to
+ * RADIX_TREE_NODE_SIZE.  Sets up the radix tree, the request pool and a
+ * periodic prune event, then tries to lock memory with mlockall()
+ * (failure to do so is only logged).
+ *
+ * Returns 0 on success; -EINVAL for unsupported flags or sector size;
+ * -ENOMEM if the name or the tree root cannot be allocated; or the
+ * negative value returned by tapdisk_server_register_event() if the
+ * prune event cannot be registered.  Previously that last failure left
+ * err at 0, so the function released its state and still reported
+ * success.
+ */
+static int
+block_cache_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       int i, err;
+       radix_tree_t *tree;
+       block_cache_t *cache;
+
+       if (!td_flag_test(flags, TD_OPEN_RDONLY))
+               return -EINVAL;
+
+       if (driver->info.sector_size != RADIX_TREE_NODE_SIZE)
+               return -EINVAL;
+
+       cache = (block_cache_t *)driver->data;
+       err   = tapdisk_namedup(&cache->name, (char *)name);
+       if (err)
+               return -ENOMEM;
+
+       cache->sectors = driver->info.size;
+
+       tree = &cache->tree;
+       err  = radix_tree_initialize(tree, cache->sectors);
+       if (err)
+               goto fail;
+
+       tree->cache = cache;
+       cache->requests_free = BLOCK_CACHE_REQUESTS;
+       for (i = 0; i < BLOCK_CACHE_REQUESTS; i++)
+               cache->request_free_list[i] = cache->requests + i;
+
+       cache->timeout_id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT,
+                                                         -1, /* dummy fd */
+                                                         BLOCK_CACHE_PAGE_IDLETIME << 1,
+                                                         block_cache_prune_event,
+                                                         cache);
+       if (cache->timeout_id < 0) {
+               /* propagate the failure instead of the stale err == 0 */
+               err = cache->timeout_id;
+               goto fail;
+       }
+
+       DPRINTF("opening cache for %s, sectors: %"PRIu64", "
+               "tree: %p, height: %d\n",
+               cache->name, cache->sectors, tree, tree->height);
+
+       if (mlockall(MCL_CURRENT | MCL_FUTURE))
+               DPRINTF("mlockall failed: %d\n", -errno);
+
+       return 0;
+
+fail:
+       free(cache->name);
+       cache->name = NULL; /* do not leave a dangling pointer behind */
+       radix_tree_free(&cache->tree);
+       return err;
+}
+
+/*
+ * Tear down the cache: unregister the prune event, free the tree and
+ * the stored name.  Always returns 0.
+ */
+static int
+block_cache_close(td_driver_t *driver)
+{
+       block_cache_t *cache = (block_cache_t *)driver->data;
+
+       DPRINTF("closing cache for %s\n", cache->name);
+
+       tapdisk_server_unregister_event(cache->timeout_id);
+       radix_tree_free(&cache->tree);
+       free(cache->name);
+
+       return 0;
+}
+
+/*
+ * Checksum of one cached sector for debug output.  The computation is
+ * switched off: the function returns 0 unconditionally, and the disabled
+ * implementation is kept below for reference.
+ */
+static inline uint64_t
+block_cache_hash(block_cache_t *cache, char *buf)
+{
+       return 0;
+
+#if 0
+       {
+               int i, n;
+               uint64_t cksm, *data;
+
+               cksm = 0;
+               data = (uint64_t *)buf;
+               n    = RADIX_TREE_NODE_SIZE / sizeof(uint64_t);
+
+               for (i = 0; i < n; i++)
+                       cksm += data[i];
+
+               return ~cksm;
+       }
+#endif
+}
+
+/*
+ * Serve @treq entirely from the cache: iov[i] is the cached buffer for
+ * the i-th requested sector.  Each sector is copied into the request
+ * buffer and the request is completed with status 0.
+ */
+static void
+block_cache_hit(block_cache_t *cache, td_request_t treq, char *iov[])
+{
+       int i;
+
+       cache->stats.hits += treq.secs;
+
+       for (i = 0; i < treq.secs; i++) {
+               const off_t dst_off = i << RADIX_TREE_NODE_SHIFT;
+
+               DBG("%s: block cache hit: sec 0x%08llx, hash: 0x%08llx\n",
+                   cache->name, treq.sec + i, block_cache_hash(cache, iov[i]));
+
+               memcpy(treq.buf + dst_off, iov[i], RADIX_TREE_NODE_SIZE);
+       }
+
+       td_complete_request(treq, 0);
+}
+
+/*
+ * Completion callback for a forwarded miss.  @clone.cb_data is the
+ * block_cache_request_t set up by block_cache_miss().
+ *
+ * Completions may arrive in pieces; the first non-zero error is kept and
+ * nothing else happens until all sectors are accounted for.  On overall
+ * success the data is copied into the original request's buffer and the
+ * bounce buffer is handed to the radix tree (or freed if the tree
+ * rejects it).  On error the bounce buffer is freed.  Finally the
+ * original request is completed and the tracking slot recycled.
+ */
+static void
+block_cache_populate_cache(td_request_t clone, int err)
+{
+       int i;
+       block_cache_request_t *breq = (block_cache_request_t *)clone.cb_data;
+       block_cache_t *cache = breq->cache;
+       radix_tree_t *tree = &cache->tree;
+
+       breq->secs -= clone.secs;
+       if (!breq->err)
+               breq->err = err;
+
+       /* more pieces of this request are still in flight */
+       if (breq->secs)
+               return;
+
+       if (breq->err) {
+               free(breq->buf);
+       } else {
+               for (i = 0; i < breq->treq.secs; i++) {
+                       off_t off = i << RADIX_TREE_NODE_SHIFT;
+                       DBG("%s: populating sec 0x%08llx\n",
+                           cache->name, breq->treq.sec + i);
+                       memcpy(breq->treq.buf + off,
+                              breq->buf + off, RADIX_TREE_NODE_SIZE);
+               }
+
+               /* a non-zero result means the tree did not take the buffer */
+               if (radix_tree_add_leaves(tree, breq->buf,
+                                         breq->treq.sec, breq->treq.secs))
+                       free(breq->buf);
+       }
+
+       td_complete_request(breq->treq, breq->err);
+       block_cache_put_request(cache, breq);
+}
+
+/*
+ * Handle a read that is not fully cached by forwarding it.
+ *
+ * If the cache has room, a tracking request is free and a bounce buffer
+ * can be allocated, the forwarded clone reads into the bounce buffer and
+ * completes through block_cache_populate_cache().  Otherwise the request
+ * is forwarded unchanged and bypasses the cache.
+ */
+static void
+block_cache_miss(block_cache_t *cache, td_request_t treq)
+{
+       char *buf;
+       block_cache_request_t *breq;
+       td_request_t clone = treq;
+       const size_t size  = treq.secs << RADIX_TREE_NODE_SHIFT;
+
+       DBG("%s: block cache miss: sec 0x%08llx\n", cache->name, treq.sec);
+
+       cache->stats.misses += treq.secs;
+
+       if (radix_tree_size(&cache->tree) + size < BLOCK_CACHE_MAX_SIZE) {
+               breq = block_cache_get_request(cache);
+
+               if (breq && posix_memalign((void **)&buf,
+                                          RADIX_TREE_NODE_SIZE, size)) {
+                       /* no bounce buffer: give the slot back, bypass cache */
+                       block_cache_put_request(cache, breq);
+               } else if (breq) {
+                       breq->treq    = treq;
+                       breq->secs    = treq.secs;
+                       breq->err     = 0;
+                       breq->buf     = buf;
+                       breq->cache   = cache;
+
+                       clone.buf     = buf;
+                       clone.cb      = block_cache_populate_cache;
+                       clone.cb_data = breq;
+               }
+       }
+
+       td_forward_request(clone);
+}
+
+/*
+ * Read entry point.  Requests larger than one cache page
+ * (BLOCK_CACHE_NODES_PER_PAGE sectors) are forwarded untouched.
+ * Otherwise every sector is looked up; the first sector that is not
+ * cached turns the whole request into a miss, and a fully cached
+ * request is served from memory.
+ *
+ * The helpers return void, so they are called as plain statements
+ * followed by a bare return: "return expr;" in a void function is not
+ * valid ISO C.
+ */
+static void
+block_cache_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       int i;
+       radix_tree_t *tree;
+       block_cache_t *cache;
+       char *iov[BLOCK_CACHE_NODES_PER_PAGE];
+
+       cache = (block_cache_t *)driver->data;
+       tree  = &cache->tree;
+
+       cache->stats.reads += treq.secs;
+
+       if (treq.secs > BLOCK_CACHE_NODES_PER_PAGE) {
+               td_forward_request(treq);
+               return;
+       }
+
+       for (i = 0; i < treq.secs; i++) {
+               iov[i] = radix_tree_find_leaf(tree, treq.sec + i);
+               if (!iov[i]) {
+                       block_cache_miss(cache, treq);
+                       return;
+               }
+       }
+
+       block_cache_hit(cache, treq, iov);
+}
+
+/* The cache is read-only: every write is completed with -EPERM. */
+static void
+block_cache_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       (void)driver;
+
+       td_complete_request(treq, -EPERM);
+}
+
+/* Parent lookup is not supported by this driver: always -EINVAL. */
+static int
+block_cache_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       (void)driver;
+       (void)id;
+
+       return -EINVAL;
+}
+
+/*
+ * Accept @pdriver as parent only if it is flagged TD_DRIVER_RDONLY and
+ * has the same name as @driver.  Returns 0 if acceptable, -EINVAL
+ * otherwise.  (The unused local "cache" has been removed.)
+ */
+static int
+block_cache_validate_parent(td_driver_t *driver,
+                           td_driver_t *pdriver, td_flag_t flags)
+{
+       if (!td_flag_test(pdriver->state, TD_DRIVER_RDONLY))
+               return -EINVAL;
+
+       if (strcmp(driver->name, pdriver->name))
+               return -EINVAL;
+
+       return 0;
+}
+
+/* Log the cache's name and its read/hit/miss/prune counters via WARN(). */
+static void
+block_cache_debug(td_driver_t *driver)
+{
+       block_cache_t *cache = (block_cache_t *)driver->data;
+       block_cache_stats_t *st = &cache->stats;
+
+       WARN("BLOCK CACHE %s\n", cache->name);
+       WARN("reads: %"PRIu64", hits: %"PRIu64", misses: %"PRIu64", prunes: %"PRIu64"\n",
+            st->reads, st->hits, st->misses, st->prunes);
+}
+
+/* Driver descriptor that exposes the block-cache entry points above. */
+struct tap_disk tapdisk_block_cache = {
+       /* identity and private state size */
+       .disk_type                  = "tapdisk_block_cache",
+       .private_data_size          = sizeof(block_cache_t),
+       .flags                      = 0,
+
+       /* lifecycle */
+       .td_open                    = block_cache_open,
+       .td_close                   = block_cache_close,
+
+       /* data path */
+       .td_queue_read              = block_cache_queue_read,
+       .td_queue_write             = block_cache_queue_write,
+
+       /* parent handling */
+       .td_get_parent_id           = block_cache_get_parent_id,
+       .td_validate_parent         = block_cache_validate_parent,
+
+       /* statistics dump */
+       .td_debug                   = block_cache_debug,
+};
diff --git a/tools/blktap2/drivers/block-log.c b/tools/blktap2/drivers/block-log.c
new file mode 100644 (file)
index 0000000..5330cdc
--- /dev/null
@@ -0,0 +1,665 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver to sit on top of another disk and log writes, in order
+ * to synchronize two distinct disks
+ *
+ * On receipt of a control request it can export a list of dirty
+ * sectors in the following format:
+ * struct writerange {
+ *   u64 sector;
+ *   u32 count;
+ * }
+ * terminated by { 0, 0 }
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "xc_bitops.h"
+#include "log.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+#define MAX_CONNECTIONS 1
+
+typedef struct poll_fd { /* a socket together with its scheduler registration */
+  int          fd; /* socket descriptor; set to -1 once closed */
+  event_id_t   id; /* id returned by tapdisk_server_register_event() for fd */
+} poll_fd_t;
+
+struct tdlog_state { /* private state of one tapdisk_log driver instance */
+  uint64_t     size; /* copied from driver->info.size; number of bits in writelog */
+
+  void*        writelog; /* dirty bitmap from bitmap_alloc(size) */
+
+  char*        ctlpath; /* heap-allocated path of the control socket */
+  poll_fd_t    ctl; /* listening control socket */
+
+  int          connected; /* number of live entries in connections[] */
+  poll_fd_t    connections[MAX_CONNECTIONS]; /* accepted control connections */
+
+  char*        shmpath; /* heap-allocated name passed to shm_open() */
+  void*        shm; /* SHMSIZE-byte mapping of that object, or NULL */
+
+  log_sring_t* sring; /* shared ring located at sringstart(shm) */
+  log_back_ring_t bring; /* back-end view of sring */
+};
+
+#define BDPRINTF(_f, _a...) syslog (LOG_DEBUG, "log: " _f "\n", ## _a)
+
+#define BWPRINTF(_f, _a...) syslog (LOG_WARNING, "log: " _f "\n", ## _a)
+
+static void ctl_accept(event_id_t, char, void *);
+static void ctl_request(event_id_t, char, void *);
+
+/* -- write log -- */
+
+/* large flat bitmaps don't scale particularly well either in size or scan
+ * time, but they'll do for now */
+
+/* Allocate the dirty bitmap covering s->size bits; 0 on success, -1 if not. */
+static int writelog_create(struct tdlog_state *s)
+{
+  const uint64_t nbytes = bitmap_size(s->size);
+
+  BDPRINTF("allocating %"PRIu64" bytes for dirty bitmap", nbytes);
+
+  s->writelog = bitmap_alloc(s->size);
+  if (s->writelog)
+    return 0;
+
+  /* allocation failed: report it and let the caller unwind */
+  BWPRINTF("could not allocate dirty bitmap of size %"PRIu64, nbytes);
+
+  return -1;
+}
+
+/* Release the dirty bitmap; safe to call repeatedly. Always returns 0. */
+static int writelog_free(struct tdlog_state *s)
+{
+  free(s->writelog);   /* free(NULL) is a no-op, no guard needed */
+  s->writelog = NULL;  /* a second close must not free it again */
+  return 0;
+}
+
+/* Mark 'count' consecutive bits starting at 'sector' dirty; returns 0. */
+static int writelog_set(struct tdlog_state* s, uint64_t sector, int count)
+{
+  uint64_t cur = sector;
+  int left;
+  for (left = count; left > 0; left--, cur++)
+    set_bit(cur, s->writelog);
+  return 0;
+}
+
+/* clear dirty bits in [start, end); if end is 0, clear to end of disk */
+int writelog_clear(struct tdlog_state* s, uint64_t start, uint64_t end)
+{
+  if (!end || end > s->size)
+    end = s->size;
+  /* whole disk: wipe the byte count writelog_create reports allocating */
+  if (!start && end == s->size) {
+    memset(s->writelog, 0, bitmap_size(s->size));
+    return 0;
+  }
+  /* partial range: per-bit clearing needs no alignment bookkeeping, so it
+   * cannot skip a boundary bit or underflow when start and end are close */
+  for (; start < end; start++)
+    clear_bit(start, s->writelog);
+  return 0;
+}
+
+/* Write the runs of set bits as disk_range records into s->shm. Returns the
+ * sector at which the scan stopped (s->size unless the shm region filled) */
+static uint64_t writelog_export(struct tdlog_state* s)
+{
+  struct disk_range* range = s->shm; /* records are written from the start of shm */
+  uint64_t i = 0;
+
+  BDPRINTF("sector count: %"PRIu64, s->size);
+
+  for (i = 0; i < s->size; i++) {
+    if (test_bit(i, s->writelog)) {
+      /* range start */
+      range->sector = i;
+      range->count = 1;
+      /* find end */
+      for (i++; i < s->size && test_bit(i, s->writelog); i++)
+       range->count++;
+
+      BDPRINTF("export: dirty extent %"PRIu64":%u",
+              range->sector, range->count);
+      range++;
+
+      /* out of space in shared memory region */
+      if ((void*)range >= bmend(s->shm)) {
+       BDPRINTF("out of space in shm region at sector %"PRIu64, i);
+       return i; /* note: no {0, 0} terminator is written on this path */
+      }
+
+      /* undo forloop increment */
+      i--; /* i is one past the run here; the outer i++ restores it */
+    }
+  }
+
+  /* NULL-terminate range list */
+  range->sector = 0;
+  range->count = 0;
+
+  return i;
+}
+
+/* -- communication channel -- */
+
+/* replace ':' and '/' with '_' in up to len bytes of path (stops at NUL) */
+static inline void path_escape(char* path, size_t len) {
+  size_t i; /* same type as len: no signed/unsigned comparison */
+
+  for (i = 0; i < len && path[i]; i++)
+    if (path[i] == ':' || path[i] == '/')
+      path[i] = '_';
+}
+
+/* Build BLKTAP_CTRL_DIR "/log_<file>.<ext>" from the last '/'-component of
+ * name, escaping FS special characters. Heap-allocated; NULL on error. */
+static char* ctl_makepath(const char* name, const char* ext)
+{
+  char* res;
+  const char *file = strrchr(name, '/'); /* keeps the leading '/' */
+
+  if (!file) {
+    BWPRINTF("invalid name %s", name); /* the macro already appends "\n" */
+    return NULL;
+  }
+  if (asprintf(&res, BLKTAP_CTRL_DIR "/log_%s.%s", file, ext) < 0) {
+    BWPRINTF("could not allocate path");
+    return NULL;
+  }
+
+  /* skip the directory and the 5 bytes of "/log_" */
+  path_escape(res + strlen(BLKTAP_CTRL_DIR) + 5, strlen(file));
+  return res;
+}
+
+/* Create, size and map the shared memory object "/log_<name>.wlog".
+ * Returns 0, or -1 with s->shm and s->shmpath both reset to NULL. */
+static int shmem_open(struct tdlog_state* s, const char* name)
+{
+  int fd;
+  if (asprintf(&s->shmpath, "/log_%s.wlog", name) < 0) {
+    BWPRINTF("could not allocate shm path");
+    s->shmpath = NULL; /* asprintf leaves the pointer undefined on failure */
+    return -1;
+  }
+  path_escape(s->shmpath + 5, strlen(name)); /* 5 == strlen("/log_") */
+  if ((fd = shm_open(s->shmpath, O_CREAT|O_RDWR, 0750)) < 0) {
+    BWPRINTF("could not open shared memory file %s: %s", s->shmpath,
+            strerror(errno));
+    goto err;
+  }
+  if (ftruncate(fd, SHMSIZE) < 0) {
+    BWPRINTF("error truncating shmem to size %u", SHMSIZE);
+    goto err_unlink;
+  }
+  s->shm = mmap(NULL, SHMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+  if (s->shm == MAP_FAILED) {
+    BWPRINTF("could not mmap write log shm: %s", strerror(errno));
+    goto err_unlink; /* errno is logged before close() can overwrite it */
+  }
+  close(fd); /* the mapping stays valid without the descriptor */
+  return 0;
+
+  err_unlink: /* do not leave the object behind for an open that failed */
+  close(fd);
+  shm_unlink(s->shmpath);
+  err:
+  s->shm = NULL;
+  free(s->shmpath);
+  s->shmpath = NULL;
+  return -1;
+}
+
+/* Unmap the shared region and remove its shm object; always returns 0. */
+static int shmem_close(struct tdlog_state* s)
+{
+  if (s->shm) {
+    munmap(s->shm, SHMSIZE);
+    s->shm = NULL;
+  }
+  if (s->shmpath) {
+    shm_unlink(s->shmpath);
+    free(s->shmpath);  /* asprintf()ed in shmem_open; was leaked before */
+    s->shmpath = NULL;
+  }
+  return 0;
+}
+
+/* control socket */
+
+/* Create the listening UNIX control socket for 'name'; 0 on success, -1 if not. */
+static int ctl_open(struct tdlog_state* s, const char* name)
+{
+  struct sockaddr_un saddr;
+
+  s->ctl.fd = -1; /* nothing for ctl_close() to close until socket() works */
+  if (!(s->ctlpath = ctl_makepath(name, "ctl")))
+    return -1;
+  /* sun_path is a fixed-size array: refuse paths that would overflow it */
+  if (strlen(s->ctlpath) >= sizeof(saddr.sun_path)) {
+    BWPRINTF("control socket path %s too long", s->ctlpath);
+    goto err;
+  }
+  if ((s->ctl.fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+    BWPRINTF("error opening control socket: %s", strerror(errno));
+    goto err;
+  }
+  memset(&saddr, 0, sizeof(saddr));
+  saddr.sun_family = AF_UNIX;
+  memcpy(saddr.sun_path, s->ctlpath, strlen(s->ctlpath));
+  if (unlink(s->ctlpath) && errno != ENOENT) {
+    BWPRINTF("error unlinking old socket path %s: %s", s->ctlpath,
+            strerror(errno));
+    goto err_sock;
+  }
+  if (bind(s->ctl.fd, (const struct sockaddr *)&saddr, sizeof(saddr)) < 0) {
+    BWPRINTF("error binding control socket to %s: %s", s->ctlpath,
+            strerror(errno));
+    goto err_sock;
+  }
+  if (listen(s->ctl.fd, 1) < 0) {
+    BWPRINTF("error listening on control socket: %s", strerror(errno));
+    goto err_sock;
+  }
+  s->ctl.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                           s->ctl.fd, 0, ctl_accept, s);
+  if (s->ctl.id < 0) {
+    /* NOTE(review): assumes a negative id is -errno -- confirm */
+    BWPRINTF("error register event handler: %s", strerror(-s->ctl.id));
+    goto err_sock;
+  }
+  return 0;
+
+  err_sock:
+  close(s->ctl.fd);
+  s->ctl.fd = -1;
+  err:
+  free(s->ctlpath);
+  s->ctlpath = NULL;
+  return -1;
+}
+
+static int ctl_close(struct tdlog_state* s) /* tear down control sockets; always returns 0 */
+{
+  while (s->connected) { /* drop accepted connections, last one first */
+    s->connected--;
+    tapdisk_server_unregister_event(s->connections[s->connected].id);
+    close(s->connections[s->connected].fd);
+    s->connections[s->connected].fd = -1;
+    s->connections[s->connected].id = 0;
+  }
+
+  if (s->ctl.fd >= 0) { /* listening socket; note that 0 counts as open here */
+    tapdisk_server_unregister_event(s->ctl.id);
+    close(s->ctl.fd);
+    s->ctl.fd = -1;
+    s->ctl.id = 0;
+  }
+
+  if (s->ctlpath) { /* remove the socket file and release its path */
+    unlink(s->ctlpath);
+    free(s->ctlpath);
+    s->ctlpath = NULL;
+  }
+
+  /* XXX this must be fixed once requests are actually in flight */
+  /* could just drain the existing ring here first */
+  if (s->sring) { /* reset both ring views; s->sring itself is left set */
+    SHARED_RING_INIT(s->sring);
+    BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
+  }
+
+  return 0;
+}
+
+/* walk list of open sockets, close matching fd; 0 if found, -1 if not */
+static int ctl_close_sock(struct tdlog_state* s, int fd)
+{
+  int i;
+
+  for (i = 0; i < s->connected; i++) {
+    if (s->connections[i].fd == fd) {
+      tapdisk_server_unregister_event(s->connections[i].id);
+      close(s->connections[i].fd);
+      /* keep live entries contiguous: move the last one into the hole */
+      s->connections[i] = s->connections[--s->connected];
+      s->connections[s->connected].fd = -1;
+      s->connections[s->connected].id = 0;
+      return 0;
+    }
+  }
+  BWPRINTF("requested to close unknown socket %d", fd);
+  return -1;
+}
+
+/* Listening-socket callback: accept a connection, register ctl_request on it. */
+static void ctl_accept(event_id_t id, char mode, void *private)
+{
+  struct tdlog_state* s = (struct tdlog_state *)private;
+  int fd;
+  event_id_t cid;
+
+  if ((fd = accept(s->ctl.fd, NULL, NULL)) < 0) {
+    BWPRINTF("error accepting control connection: %s", strerror(errno));
+    return;
+  }
+  if (s->connected >= MAX_CONNECTIONS) { /* connections[] is full */
+    BWPRINTF("control session in progress, closing new connection");
+    close(fd);
+    return;
+  }
+
+  cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                     fd, 0, ctl_request, s);
+  if (cid < 0) { /* NOTE(review): assumes a negative id is -errno -- confirm */
+    BWPRINTF("error registering connection event handler: %s", strerror(-cid));
+    close(fd);
+    return;
+  }
+
+  s->connections[s->connected].fd = fd;
+  s->connections[s->connected].id = cid;
+  s->connected++;
+}
+
+/* response format: 4 bytes shmsize, 0-terminated path */
+static int ctl_get_shmpath(struct tdlog_state* s, int fd)
+{
+  char msg[CTLRSPLEN_SHMP + 1];
+  uint32_t sz;
+  int rc;
+
+  BDPRINTF("ctl: sending shared memory parameters (size: %u, path: %s)",
+          SHMSIZE, s->shmpath);
+  /* TMP: sanity-check shm */
+  sz = 0xdeadbeef;
+  memcpy(s->shm, &sz, sizeof(sz));
+
+  /* zero the padding: all CTLRSPLEN_SHMP bytes go out on the socket */
+  memset(msg, 0, sizeof(msg));
+  sz = SHMSIZE;
+  memcpy(msg, &sz, sizeof(sz));
+  snprintf(msg + sizeof(sz), sizeof(msg) - sizeof(sz), "%s", s->shmpath);
+  if ((rc = write(fd, msg, CTLRSPLEN_SHMP)) < 0) {
+    BWPRINTF("error writing shmpath: %s", strerror(errno));
+    return -1;
+  }
+  return 0;
+}
+
+/* Export the dirty ranges to shm without clearing them, then ack "done". */
+static int ctl_peek_writes(struct tdlog_state* s, int fd)
+{
+  int nwritten;
+
+  BDPRINTF("ctl: peeking bitmap");
+  writelog_export(s);
+
+  nwritten = write(fd, "done", CTLRSPLEN_PEEK);
+  if (nwritten >= 0)
+    return 0;
+
+  BWPRINTF("error writing peek ack: %s", strerror(errno));
+  return -1;
+}
+
+/* Clear the whole dirty bitmap, then acknowledge with "done". */
+static int ctl_clear_writes(struct tdlog_state* s, int fd)
+{
+  int nwritten;
+
+  BDPRINTF("ctl: clearing bitmap");
+  writelog_clear(s, 0, 0);
+
+  nwritten = write(fd, "done", CTLRSPLEN_CLEAR);
+  if (nwritten >= 0)
+    return 0;
+
+  BWPRINTF("error writing clear ack: %s", strerror(errno));
+  return -1;
+}
+
+/* get dirty bitmap and clear it atomically */
+static int ctl_get_writes(struct tdlog_state* s, int fd)
+{
+  int nwritten;
+
+  BDPRINTF("ctl: getting bitmap");
+
+  /* export first, then wipe, and only then acknowledge */
+  writelog_export(s);
+  writelog_clear(s, 0, 0);
+
+  nwritten = write(fd, "done", CTLRSPLEN_GET);
+  if (nwritten >= 0)
+    return 0;
+  BWPRINTF("error writing get ack: %s", strerror(errno));
+  return -1;
+}
+
+/* get requests from ring, echo each one back as a response, then notify */
+static int ctl_kick(struct tdlog_state* s, int fd)
+{
+  RING_IDX reqstart, reqend;
+  log_request_t req;
+
+  /* XXX testing */
+  RING_IDX rspstart, rspend; /* not used below */
+  log_response_t rsp;
+  struct log_ctlmsg msg;
+  int rc;
+
+  reqstart = s->bring.req_cons; /* first request not yet consumed */
+  reqend = s->sring->req_prod; /* producer index read from the shared ring */
+
+  BDPRINTF("ctl: ring kicked (start = %u, end = %u)", reqstart, reqend);
+
+  while (reqstart != reqend) {
+    /* XXX actually submit these! */
+    memcpy(&req, RING_GET_REQUEST(&s->bring, reqstart), sizeof(req));
+    BDPRINTF("ctl: read request %"PRIu64":%u", req.sector, req.count);
+    s->bring.req_cons = ++reqstart;
+
+    rsp.sector = req.sector; /* the response mirrors the request */
+    rsp.count = req.count;
+    memcpy(RING_GET_RESPONSE(&s->bring, s->bring.rsp_prod_pvt), &rsp,
+          sizeof(rsp));
+    s->bring.rsp_prod_pvt++;
+  }
+
+  RING_PUSH_RESPONSES(&s->bring);
+  memset(&msg, 0, sizeof(msg));
+  memcpy(msg.msg, LOGCMD_KICK, 4); /* notification carries the kick tag */
+  if ((rc = write(fd, &msg, sizeof(msg))) < 0) {
+    BWPRINTF("error sending notify: %s", strerror(errno));
+    return -1;
+  } else if (rc < sizeof(msg)) { /* rc is non-negative here */
+    BWPRINTF("short notify write (%d/%zd)", rc, sizeof(msg));
+    return -1;
+  }
+
+  return 0;
+}
+
+/* Route a control message to its handler by the first 4 bytes of msg->msg. */
+static int ctl_do_request(struct tdlog_state* s, int fd, struct log_ctlmsg* msg)
+{
+  if (!strncmp(msg->msg, LOGCMD_SHMP, 4))
+    return ctl_get_shmpath(s, fd);
+  if (!strncmp(msg->msg, LOGCMD_PEEK, 4))
+    return ctl_peek_writes(s, fd);
+  if (!strncmp(msg->msg, LOGCMD_CLEAR, 4))
+    return ctl_clear_writes(s, fd);
+  if (!strncmp(msg->msg, LOGCMD_GET, 4))
+    return ctl_get_writes(s, fd);
+  if (!strncmp(msg->msg, LOGCMD_KICK, 4))
+    return ctl_kick(s, fd);
+
+  BWPRINTF("unknown control request %.4s", msg->msg);
+  return -1;
+}
+
+/* Map an event id to its connection fd; logs and returns -1 if unknown. */
+static inline int ctl_find_connection(struct tdlog_state *s, event_id_t id)
+{
+  const poll_fd_t *conn = s->connections;
+  int left;
+  for (left = s->connected; left > 0; left--, conn++)
+    if (conn->id == id)
+      return conn->fd;
+  BWPRINTF("unrecognized event callback id %d", id);
+  return -1;
+}
+
+/* Connection callback: read one log_ctlmsg and dispatch it. */
+static void ctl_request(event_id_t id, char mode, void *private)
+{
+  struct tdlog_state* s = (struct tdlog_state*)private;
+  struct log_ctlmsg msg;
+  int nread, fd = ctl_find_connection(s, id);
+
+  if (fd == -1)
+    return;
+  nread = read(fd, &msg, sizeof(msg));
+  if (nread < 0) {
+    BWPRINTF("error reading from ctl socket %d, closing: %s", fd,
+            strerror(errno));
+    ctl_close_sock(s, fd);
+    return;
+  }
+  if (nread == 0) {
+    BDPRINTF("ctl_request: EOF, closing socket");
+    ctl_close_sock(s, fd);
+    return;
+  }
+  if (nread < sizeof(msg)) { /* partial message: dropped, socket stays open */
+    BWPRINTF("short request received (%d/%zd bytes), ignoring", nread, sizeof(msg));
+    return;
+  }
+  ctl_do_request(s, fd, &msg);
+}
+
+/* -- interface -- */
+
+static int tdlog_close(td_driver_t*);
+
+/* Open the log driver: allocate the dirty bitmap, map the shared memory
+ * region, open the control socket and initialise the request ring.
+ * Returns 0 on success; on failure tdlog_close() undoes the partial setup. */
+static int tdlog_open(td_driver_t* driver, const char* name, td_flag_t flags)
+{
+  struct tdlog_state* s = (struct tdlog_state*)driver->data;
+  int rc;
+
+  memset(s, 0, sizeof(*s));
+  /* 0 is a valid descriptor: mark the socket closed so that an early
+   * tdlog_close() below does not close fd 0 */
+  s->ctl.fd = -1;
+
+  s->size = driver->info.size;
+
+  if ((rc = writelog_create(s)) ||
+      (rc = shmem_open(s, name)) ||
+      (rc = ctl_open(s, name))) {
+    tdlog_close(driver);
+    return rc;
+  }
+
+  s->sring = (log_sring_t*)sringstart(s->shm);
+  SHARED_RING_INIT(s->sring);
+  BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
+
+  BDPRINTF("opened ctl socket");
+
+  return 0;
+}
+
+static int tdlog_close(td_driver_t* driver) /* release everything tdlog_open set up; returns 0 */
+{
+  struct tdlog_state* s = (struct tdlog_state*)driver->data;
+
+  ctl_close(s); /* first: it re-initialises the ring that lives inside the shm mapping */
+  shmem_close(s);
+  writelog_free(s);
+
+  return 0;
+}
+
+static void tdlog_queue_read(td_driver_t* driver, td_request_t treq)
+{
+  td_forward_request(treq); /* reads are not logged: forwarded unchanged */
+}
+
+static void tdlog_queue_write(td_driver_t* driver, td_request_t treq)
+{
+  struct tdlog_state* s = (struct tdlog_state*)driver->data;
+
+  /* record the sectors as dirty before passing the write on */
+  writelog_set(s, treq.sec, treq.secs);
+  td_forward_request(treq);
+}
+
+static int tdlog_get_parent_id(td_driver_t* driver, td_disk_id_t* id)
+{
+  return -EINVAL; /* unconditional: neither driver nor id is consulted */
+}
+
+static int tdlog_validate_parent(td_driver_t *driver,
+                                td_driver_t *parent, td_flag_t flags)
+{
+  return 0; /* every parent is accepted; no argument is inspected */
+}
+
+struct tap_disk tapdisk_log = { /* driver table wiring the tdlog_* handlers */
+  .disk_type          = "tapdisk_log",
+  .private_data_size  = sizeof(struct tdlog_state), /* size of the per-driver tdlog_state */
+  .flags              = 0,
+  .td_open            = tdlog_open,
+  .td_close           = tdlog_close,
+  .td_queue_read      = tdlog_queue_read, /* pure pass-through */
+  .td_queue_write     = tdlog_queue_write, /* marks dirty bits, then forwards */
+  .td_get_parent_id   = tdlog_get_parent_id, /* always -EINVAL */
+  .td_validate_parent = tdlog_validate_parent, /* always 0 */
+};
diff --git a/tools/blktap2/drivers/block-qcow.c b/tools/blktap2/drivers/block-qcow.c
new file mode 100644 (file)
index 0000000..b45bcaa
--- /dev/null
@@ -0,0 +1,1501 @@
+/* block-qcow.c
+ *
+ * Asynchronous Qemu copy-on-write disk implementation.
+ * Code based on the Qemu implementation
+ * (see copyright notice below)
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ */
+
+/*
+ * Block driver for the QCOW format
+ * 
+ * Copyright (c) 2004 Fabrice Bellard
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files(the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <zlib.h>
+#include <inttypes.h>
+#include <libaio.h>
+#include <limits.h>
+#include "bswap.h"
+#include "aes.h"
+#include "md5.h"
+
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-disktype.h"
+#include "qcow.h"
+#include "blk.h"
+#include "atomicio.h"
+
+/* *BSD has no O_LARGEFILE */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE     0
+#endif
+
+#if 1 /* assertions on: a failed ASSERT logs via DPRINTF, then stores through a null pointer */
+#define ASSERT(_p) \
+    if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
+    __LINE__, __FILE__); *(int*)0=0; }
+#else /* assertions off: expands to a no-op expression */
+#define ASSERT(_p) ((void)0)
+#endif
+
+struct pending_aio { /* NOTE(review): not referenced in the visible part of this file */
+        td_callback_t cb;
+        int id;
+        void *private;
+       int nb_sectors;
+       char *buf;
+       uint64_t sector;
+};
+
+#undef IOCB_IDX
+#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)
+
+#define ZERO_TEST(_b) ((_b) | 0x00) /* fully parenthesised: safe inside larger expressions */
+
+struct qcow_request { /* one slot of the AIO request pool */
+       td_request_t         treq; /* request being served; completed in tdqcow_complete */
+       struct tiocb         tiocb; /* I/O control block prepared and queued for treq */
+       struct tdqcow_state  *state; /* owning driver state, used to return the slot */
+};
+
+static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
+
+/* Checksum len bytes at ptr: the first word of the md5_sum() output. */
+uint32_t gen_cksum(char *ptr, int len)
+{
+  uint32_t md[4]; /* 16-byte output buffer handed to md5_sum */
+
+  /* Generate checksum */
+  md5_sum((const uint8_t*)ptr, len, (uint8_t*)md);
+
+  return md[0];
+}
+
+static void free_aio_state(struct tdqcow_state* s) /* release both arrays set up by init_aio_state */
+{
+       free(s->aio_requests); /* the request pool */
+       free(s->aio_free_list); /* the array of pointers into that pool */
+}
+
+/* Size and allocate the AIO request pool and its free list, marking every
+ * slot free. Returns 0 on success, -1 if either allocation fails. */
+static int init_aio_state(td_driver_t *driver)
+{
+       int i;
+       struct tdqcow_state   *s  = (struct tdqcow_state *)driver->data;
+
+       // A segment (i.e. a page) can span multiple clusters
+       s->max_aio_reqs = ((getpagesize() / s->cluster_size) + 1) *
+         MAX_SEGMENTS_PER_REQ * MAX_REQUESTS;
+
+       s->aio_free_count = s->max_aio_reqs;
+
+       /* The free list holds pointers into aio_requests, not requests.
+        * Both fields are always assigned, so free_aio_state() stays safe. */
+       s->aio_requests  = calloc(s->max_aio_reqs, sizeof(*s->aio_requests));
+       s->aio_free_list = calloc(s->max_aio_reqs, sizeof(*s->aio_free_list));
+       if (!s->aio_requests || !s->aio_free_list) {
+           DPRINTF("Failed to allocate AIO structs (max_aio_reqs = %d)\n",
+                   s->max_aio_reqs);
+           return -1;
+       }
+
+       for (i = 0; i < s->max_aio_reqs; i++)
+               s->aio_free_list[i] = &s->aio_requests[i];
+       DPRINTF("AIO state initialised\n");
+       return 0;
+}
+
+/* Store the image size, in sectors, in *size: from the QCOW header when the
+ * file starts with QCOW_MAGIC, otherwise from blk_getimagesize() for block
+ * devices or st->st_size for other files. Returns 0, or -1 on error. */
+int get_filesize(char *filename, uint64_t *size, struct stat *st)
+{
+       int fd, rc = 0;
+       ssize_t got;
+       QCowHeader header;
+
+       /*Set to the backing file size*/
+       fd = open(filename, O_RDONLY);
+       if (fd < 0)
+               return -1;
+       /* signed comparison on purpose: "read() < sizeof" converts a -1
+        * error return to a huge unsigned value and lets it through */
+       got = read(fd, &header, sizeof(header));
+       if (got != (ssize_t)sizeof(header)) {
+               close(fd);
+               return -1;
+       }
+
+       be32_to_cpus(&header.magic);
+       be64_to_cpus(&header.size);
+       if (header.magic == QCOW_MAGIC) {
+               *size = header.size >> SECTOR_SHIFT;
+       } else if (!S_ISBLK(st->st_mode)) {
+               *size = (st->st_size >> SECTOR_SHIFT);
+       } else if (blk_getimagesize(fd, size) != 0) {
+               printf("Unable to get Block device size\n");
+               rc = -1;
+       }
+
+       close(fd); /* one descriptor serves both the header read and the ioctl */
+       return rc;
+}
+
+static int qcow_set_key(struct tdqcow_state *s, const char *key) /* returns 0, or -1 if AES key setup fails */
+{
+       uint8_t keybuf[16];
+       int len, i;
+       
+       memset(keybuf, 0, 16); /* keys shorter than 16 bytes are zero-padded */
+       len = strlen(key);
+       if (len > 16) /* longer keys are truncated to 16 bytes */
+               len = 16;
+       /* XXX: we could compress the chars to 7 bits to increase
+          entropy */
+       for (i = 0; i < len; i++) {
+               keybuf[i] = key[i];
+       }
+       s->crypt_method = s->crypt_method_header; /* set before the key schedules are built */
+       
+       if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
+               return -1;
+       if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+               return -1;
+#if 0
+       /* test */
+       {
+               uint8_t in[16];
+               uint8_t out[16];
+               uint8_t tmp[16];
+               for (i=0; i<16; i++)
+                       in[i] = i;
+               AES_encrypt(in, tmp, &s->aes_encrypt_key);
+               AES_decrypt(tmp, out, &s->aes_decrypt_key);
+               for (i = 0; i < 16; i++)
+                       DPRINTF(" %02x", tmp[i]);
+               DPRINTF("\n");
+               for (i = 0; i < 16; i++)
+                       DPRINTF(" %02x", out[i]);
+               DPRINTF("\n");
+       }
+#endif
+       return 0;
+}
+
+/* tiocb completion callback: finish the request, then recycle its slot. */
+void tdqcow_complete(void *arg, struct tiocb *tiocb, int err)
+{
+       struct qcow_request *req = arg;
+       struct tdqcow_state *state = req->state;
+       td_complete_request(req->treq, err);
+       state->aio_free_list[state->aio_free_count] = req;
+       state->aio_free_count++;
+}
+
+/* Queue an asynchronous read for treq on the image file; the request is
+ * completed with -EBUSY straight away when no AIO slot is free. */
+static void async_read(td_driver_t *driver, td_request_t treq)
+{
+       struct qcow_request *req;
+       struct tdqcow_state *state;
+       uint64_t pos;
+       int nbytes;
+
+       state  = (struct tdqcow_state *)driver->data;
+       nbytes = treq.secs * driver->info.sector_size;
+       pos    = treq.sec  * (uint64_t)driver->info.sector_size;
+
+       if (state->aio_free_count == 0) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /* pop a slot off the free list and bind it to this request */
+       req        = state->aio_free_list[--state->aio_free_count];
+       req->treq  = treq;
+       req->state = state;
+
+       td_prep_read(&req->tiocb, state->fd, treq.buf,
+                    nbytes, pos, tdqcow_complete, req);
+       td_queue_tiocb(driver, &req->tiocb);
+}
+
+/* Queue an asynchronous write for treq on the image file; the request is
+ * completed with -EBUSY straight away when no AIO slot is free. */
+static void async_write(td_driver_t *driver, td_request_t treq)
+{
+       struct qcow_request *req;
+       struct tdqcow_state *state;
+       uint64_t pos;
+       int nbytes;
+
+       state  = (struct tdqcow_state *)driver->data;
+       nbytes = treq.secs * driver->info.sector_size;
+       pos    = treq.sec  * (uint64_t)driver->info.sector_size;
+
+       if (state->aio_free_count == 0) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /* pop a slot off the free list and bind it to this request */
+       req        = state->aio_free_list[--state->aio_free_count];
+       req->treq  = treq;
+       req->state = state;
+
+       td_prep_write(&req->tiocb, state->fd, treq.buf,
+                     nbytes, pos, tdqcow_complete, req);
+       td_queue_tiocb(driver, &req->tiocb);
+}
+
+/* 
+ * The crypt function is compatible with the linux cryptoloop
+ * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
+ * supported.
+ */
+static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
+                            uint8_t *out_buf, const uint8_t *in_buf,
+                            int nb_sectors, int enc,
+                            const AES_KEY *key)
+{
+       union {
+               uint64_t ll[2];
+               uint8_t b[16];
+       } ivec; /* 16-byte IV, rebuilt for every sector */
+       int i;
+       
+       for (i = 0; i < nb_sectors; i++) { /* each 512-byte sector is a separate CBC run */
+               ivec.ll[0] = cpu_to_le64(sector_num); /* IV = little-endian sector number, upper half zero */
+               ivec.ll[1] = 0;
+               AES_cbc_encrypt(in_buf, out_buf, 512, key, 
+                               ivec.b, enc);
+               sector_num++;
+               in_buf += 512;
+               out_buf += 512;
+       }
+}
+
+/* Resize the file behind fd to 'length' rounded up to whole sectors:
+ * growing appends zeroes synchronously, shrinking is only done when
+ * 'sparse' is set. Block devices are left alone.
+ * Returns 0 on success, -1 on error. */
+int qtruncate(int fd, off_t length, int sparse)
+{
+       int ret, rem;
+       uint64_t sectors, current, i; /* 64-bit: an int overflows on very large files */
+       struct stat st;
+       char *buf;
+
+       ret = fstat(fd, &st);
+       if (ret == -1) 
+               return -1;
+       if (S_ISBLK(st.st_mode))
+               return 0;
+
+       sectors = (length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
+       current = (st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
+       rem     = st.st_size % DEFAULT_SECTOR_SIZE;
+
+       /* If we are extending this file, we write zeros to the end --
+        * this tries to ensure that the extents allocated wind up being
+        * contiguous on disk.
+        */
+       if(st.st_size < sectors * DEFAULT_SECTOR_SIZE) {
+               /*We are extending the file*/
+               if ((ret = posix_memalign((void **)&buf, 
+                                         512, DEFAULT_SECTOR_SIZE))) {
+                       DPRINTF("posix_memalign failed: %d\n", ret);
+                       return -1;
+               }
+               memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
+               if (lseek(fd, 0, SEEK_END)==-1) {
+                       DPRINTF("Lseek EOF failed (%d), internal error\n",
+                               errno);
+                       free(buf);
+                       return -1;
+               }
+               if (rem) {
+                       rem = DEFAULT_SECTOR_SIZE - rem; /* bytes missing to finish the partial last sector */
+                       ret = write(fd, buf, rem);
+                       if (ret != rem) {
+                               DPRINTF("write failed: ret = %d, err = %s\n",
+                                       ret, strerror(errno));
+                               free(buf);
+                               return -1;
+                       }
+               }
+               for (i = current; i < sectors; i++ ) {
+                       ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
+                       if (ret != DEFAULT_SECTOR_SIZE) {
+                               DPRINTF("write failed: ret = %d, err = %s\n",
+                                       ret, strerror(errno));
+                               free(buf);
+                               return -1;
+                       }
+               }
+               free(buf);
+       } else if(sparse && (st.st_size > sectors * DEFAULT_SECTOR_SIZE))
+               if (ftruncate(fd, (off_t)sectors * DEFAULT_SECTOR_SIZE)==-1) {
+                       DPRINTF("Ftruncate failed (%s)\n", strerror(errno));
+                       return -1;
+               }
+       return 0;
+}
+
+/* 'allocate' is:
+ *
+ * 0 to not allocate.
+ *
+ * 1 to allocate a normal cluster (for sector indexes 'n_start' to
+ * 'n_end')
+ *
+ * 2 to allocate a compressed cluster of size
+ * 'compressed_size'. 'compressed_size' must be > 0 and <
+ * cluster_size 
+ *
+ * return 0 if not allocated.
+ */
+static uint64_t get_cluster_offset(struct tdqcow_state *s,
+                                   uint64_t offset, int allocate,
+                                   int compressed_size,
+                                   int n_start, int n_end)
+{
+       int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
+       char *tmp_ptr2, *l2_ptr, *l1_ptr;
+       uint64_t *tmp_ptr;
+       uint64_t l2_offset, *l2_table, cluster_offset, tmp;
+       uint32_t min_count;
+       int new_l2_table;
+
+       /*Check L1 table for the extent offset*/
+       l1_index = offset >> (s->l2_bits + s->cluster_bits);
+       l2_offset = s->l1_table[l1_index];
+       new_l2_table = 0;
+       if (!l2_offset) {
+               if (!allocate)
+                       return 0;
+               /* 
+                * allocating a new l2 entry + extent 
+                * at the end of the file, we must also
+                * update the L1 entry safely.
+                */
+               l2_offset = s->fd_end;
+
+               /* round to cluster size */
+               l2_offset = (l2_offset + s->cluster_size - 1) 
+                       & ~(s->cluster_size - 1);
+
+               /* update the L1 entry */
+               s->l1_table[l1_index] = l2_offset;
+               
+               /*Truncate file for L2 table 
+                *(initialised to zero in case we crash)*/
+               if (qtruncate(s->fd, 
+                             l2_offset + (s->l2_size * sizeof(uint64_t)),
+                             s->sparse) != 0) {
+                       DPRINTF("ERROR truncating file\n");
+                       return 0;
+               }
+               s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));
+
+               /*Update the L1 table entry on disk
+                 * (for O_DIRECT we write 4KByte blocks)*/
+               l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
+               l1_ptr = (char *)s->l1_table + (l1_sector << 12);
+
+               if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
+                       DPRINTF("ERROR allocating memory for L1 table\n");
+                        return 0;
+               }
+               memcpy(tmp_ptr, l1_ptr, 4096);
+
+               /* Convert block to write to big endian */
+               for(i = 0; i < 4096 / sizeof(uint64_t); i++) {
+                       cpu_to_be64s(&tmp_ptr[i]);
+               }
+
+               /*
+                * Issue non-asynchronous L1 write.
+                * For safety, we must ensure that
+                * entry is written before blocks.
+                */
+               lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
+               if (write(s->fd, tmp_ptr, 4096) != 4096) {
+                       free(tmp_ptr);
+                       return 0;
+               }
+               free(tmp_ptr);
+
+               new_l2_table = 1;
+               goto cache_miss;
+       } else if (s->min_cluster_alloc == s->l2_size) {
+               /*Fast-track the request*/
+               cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
+               l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+               return cluster_offset + (l2_index * s->cluster_size);
+       }
+
+       /*Check to see if L2 entry is already cached*/
+       for (i = 0; i < L2_CACHE_SIZE; i++) {
+               if (l2_offset == s->l2_cache_offsets[i]) {
+                       /* increment the hit count */
+                       if (++s->l2_cache_counts[i] == 0xffffffff) {
+                               for (j = 0; j < L2_CACHE_SIZE; j++) {
+                                       s->l2_cache_counts[j] >>= 1;
+                               }
+                       }
+                       l2_table = s->l2_cache + (i << s->l2_bits);
+                       goto found;
+               }
+       }
+
+cache_miss:
+       /* not found: load a new entry in the least used one */
+       min_index = 0;
+       min_count = 0xffffffff;
+       for (i = 0; i < L2_CACHE_SIZE; i++) {
+               if (s->l2_cache_counts[i] < min_count) {
+                       min_count = s->l2_cache_counts[i];
+                       min_index = i;
+               }
+       }
+       l2_table = s->l2_cache + (min_index << s->l2_bits);
+
+       /*If extent pre-allocated, read table from disk, 
+        *otherwise write new table to disk*/
+       if (new_l2_table) {
+               /*Should we allocate the whole extent? Adjustable parameter.*/
+               if (s->cluster_alloc == s->l2_size) {
+                       cluster_offset = l2_offset + 
+                               (s->l2_size * sizeof(uint64_t));
+                       cluster_offset = (cluster_offset + s->cluster_size - 1)
+                               & ~(s->cluster_size - 1);
+                       if (qtruncate(s->fd, cluster_offset + 
+                                 (s->cluster_size * s->l2_size), 
+                                     s->sparse) != 0) {
+                               DPRINTF("ERROR truncating file\n");
+                               return 0;
+                       }
+                       s->fd_end = cluster_offset + 
+                               (s->cluster_size * s->l2_size);
+                       for (i = 0; i < s->l2_size; i++) {
+                               l2_table[i] = cpu_to_be64(cluster_offset + 
+                                                         (i*s->cluster_size));
+                       }  
+               } else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+
+               lseek(s->fd, l2_offset, SEEK_SET);
+               if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
+                  s->l2_size * sizeof(uint64_t))
+                       return 0;
+       } else {
+               lseek(s->fd, l2_offset, SEEK_SET);
+               if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) != 
+                   s->l2_size * sizeof(uint64_t))
+                       return 0;
+       }
+       
+       /*Update the cache entries*/ 
+       s->l2_cache_offsets[min_index] = l2_offset;
+       s->l2_cache_counts[min_index] = 1;
+
+found:
+       /*The extent is split into 's->l2_size' blocks of 
+        *size 's->cluster_size'*/
+       l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+       cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+       if (!cluster_offset || 
+           ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) {
+               if (!allocate)
+                       return 0;
+               
+               if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
+                   (n_end - n_start) < s->cluster_sectors) {
+                       /* cluster is already allocated but compressed, we must
+                          decompress it in the case it is not completely
+                          overwritten */
+                       if (decompress_cluster(s, cluster_offset) < 0)
+                               return 0;
+                       cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
+                       cluster_offset = (cluster_offset + s->cluster_size - 1)
+                               & ~(s->cluster_size - 1);
+                       /* write the cluster content - not asynchronous */
+                       lseek(s->fd, cluster_offset, SEEK_SET);
+                       if (write(s->fd, s->cluster_cache, s->cluster_size) != 
+                           s->cluster_size)
+                           return -1;
+               } else {
+                       /* allocate a new cluster */
+                       cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
+                       if (allocate == 1) {
+                               /* round to cluster size */
+                               cluster_offset = 
+                                       (cluster_offset + s->cluster_size - 1) 
+                                       & ~(s->cluster_size - 1);
+                               if (qtruncate(s->fd, cluster_offset + 
+                                             s->cluster_size, s->sparse)!=0) {
+                                       DPRINTF("ERROR truncating file\n");
+                                       return 0;
+                               }
+                               s->fd_end = (cluster_offset + s->cluster_size);
+                               /* if encrypted, we must initialize the cluster
+                                  content which won't be written */
+                               if (s->crypt_method && 
+                                   (n_end - n_start) < s->cluster_sectors) {
+                                       uint64_t start_sect;
+                                       start_sect = (offset & 
+                                                     ~(s->cluster_size - 1)) 
+                                                             >> 9;
+                                       memset(s->cluster_data + 512, 
+                                              0xaa, 512);
+                                       for (i = 0; i < s->cluster_sectors;i++)
+                                       {
+                                               if (i < n_start || i >= n_end) 
+                                               {
+                                                       encrypt_sectors(s, start_sect + i, 
+                                                                       s->cluster_data, 
+                                                                       s->cluster_data + 512, 1, 1,
+                                                                       &s->aes_encrypt_key);
+                                                       lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
+                                                       if (write(s->fd, s->cluster_data, 512) != 512)
+                                                               return -1;
+                                               }
+                                       }
+                               }
+                       } else {
+                               cluster_offset |= QCOW_OFLAG_COMPRESSED | 
+                                       (uint64_t)compressed_size 
+                                               << (63 - s->cluster_bits);
+                       }
+               }
+               /* update L2 table */
+               tmp = cpu_to_be64(cluster_offset);
+               l2_table[l2_index] = tmp;
+
+               /*For IO_DIRECT we write 4KByte blocks*/
+               l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
+               l2_ptr = (char *)l2_table + (l2_sector << 12);
+               
+               if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
+                       DPRINTF("ERROR allocating memory for L1 table\n");
+                        return 0;
+               }
+               memcpy(tmp_ptr2, l2_ptr, 4096);
+               lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET);
+               if (write(s->fd, tmp_ptr2, 4096) != 4096) {
+                       free(tmp_ptr2);
+                       return -1;
+               }
+               free(tmp_ptr2);
+       }
+       return cluster_offset;
+}
+
+/*
+ * Report whether the cluster containing sector_num has a non-zero cluster
+ * offset. *pnum receives the number of sectors from sector_num up to the
+ * end of that cluster, capped at nb_sectors.
+ * Returns 1 when get_cluster_offset() yields a non-zero offset, else 0.
+ */
+static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num,
+                             int nb_sectors, int *pnum)
+{
+       /* lookup only: allocate == 0, so nothing is created on a miss */
+       uint64_t where = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0);
+       int first = sector_num & (s->cluster_sectors - 1);
+       int span = s->cluster_sectors - first;
+
+       *pnum = (span > nb_sectors) ? nb_sectors : span;
+       return where ? 1 : 0;
+}
+
+/*
+ * Inflate buf (buf_size bytes, window bits -12) into out_buf.
+ * Succeeds only when inflate() ends with Z_STREAM_END or Z_BUF_ERROR and
+ * exactly out_buf_size bytes were produced.
+ * Returns 0 on success, -1 otherwise.
+ */
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+                             const uint8_t *buf, int buf_size)
+{
+       z_stream zs;
+       int rc, produced, ok;
+
+       memset(&zs, 0, sizeof(zs));
+
+       zs.next_in   = (uint8_t *)buf;
+       zs.avail_in  = buf_size;
+       zs.next_out  = out_buf;
+       zs.avail_out = out_buf_size;
+
+       if (inflateInit2(&zs, -12) != Z_OK)
+               return -1;
+
+       rc = inflate(&zs, Z_FINISH);
+       /* sample the output position before the stream is torn down */
+       produced = zs.next_out - out_buf;
+       inflateEnd(&zs);
+
+       ok = (rc == Z_STREAM_END || rc == Z_BUF_ERROR) &&
+            (produced == out_buf_size);
+       return ok ? 0 : -1;
+}
+                              
+/*
+ * Make s->cluster_cache hold the decompressed contents of the compressed
+ * cluster described by the L2 entry cluster_offset. The entry is split into
+ * a file offset (masked with s->cluster_offset_mask) and a compressed size
+ * (the bits from 63 - cluster_bits upwards, masked with cluster_size - 1).
+ * Nothing is read when the cache already holds that file offset.
+ * Returns 0 on success, -1 on a short read or a decompression failure.
+ */
+static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
+{
+       uint64_t file_pos = cluster_offset & s->cluster_offset_mask;
+       int packed_len, got;
+
+       /* cache hit: the wanted cluster is already decompressed */
+       if (s->cluster_cache_offset == file_pos)
+               return 0;
+
+       packed_len  = cluster_offset >> (63 - s->cluster_bits);
+       packed_len &= (s->cluster_size - 1);
+
+       lseek(s->fd, file_pos, SEEK_SET);
+       got = read(s->fd, s->cluster_data, packed_len);
+       if (got != packed_len)
+               return -1;
+
+       if (decompress_buffer(s->cluster_cache, s->cluster_size,
+                             s->cluster_data, packed_len) < 0)
+               return -1;
+
+       /* only remember the offset once the cache content is valid */
+       s->cluster_cache_offset = file_pos;
+       return 0;
+}
+
+/*
+ * Read the QCOW header from the start of fd into *header and convert its
+ * multi-byte fields from big endian to host byte order.
+ *
+ * The read goes through a 512-byte aligned bounce buffer whose size is
+ * sizeof(*header) rounded up to a multiple of 512. A file shorter than that
+ * is accepted as long as read() returns the whole file; the missing bytes
+ * are left zero.
+ *
+ * Returns 0 on success or a negative errno value on failure (-EIO when
+ * read() came up short without setting errno).
+ */
+static int
+tdqcow_read_header(int fd, QCowHeader *header)
+{
+       int err;
+       char *buf;
+       struct stat st;
+       ssize_t got;
+       size_t size, expected;
+
+       memset(header, 0, sizeof(*header));
+
+       err = fstat(fd, &st);
+       if (err)
+               return -errno;
+
+       /* compare the off_t result directly instead of squeezing it
+        * through an int */
+       if (lseek(fd, 0, SEEK_SET) == (off_t)-1)
+               return -errno;
+
+       size = (sizeof(*header) + 511) & ~511;
+       err = posix_memalign((void **)&buf, 512, size);
+       if (err)
+               return -err; /* posix_memalign returns a positive error number */
+
+       /* a short file must not leak uninitialised heap into the header */
+       memset(buf, 0, size);
+
+       expected = size;
+       if (st.st_size < size)
+               expected = st.st_size;
+
+       errno = 0;
+       got = read(fd, buf, size);
+       if (got < 0 || (size_t)got != expected) {
+               err = (errno ? -errno : -EIO);
+               goto out;
+       }
+
+       memcpy(header, buf, sizeof(*header));
+       be32_to_cpus(&header->magic);
+       be32_to_cpus(&header->version);
+       be64_to_cpus(&header->backing_file_offset);
+       be32_to_cpus(&header->backing_file_size);
+       be32_to_cpus(&header->mtime);
+       be64_to_cpus(&header->size);
+       be32_to_cpus(&header->crypt_method);
+       be64_to_cpus(&header->l1_table_offset);
+
+       err = 0;
+
+out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Allocate s->l1_table and fill it from the image file.
+ *
+ * The file is read from offset 0 up to the end of the L1 table (rounded up
+ * to 4 KiB) into a temporary buffer, so the Xen extended header that follows
+ * the QCOW header can be inspected in the same pass. When the extended
+ * header is present and lacks EXTHDR_L1_BIG_ENDIAN, the table is converted
+ * to big endian and the flag plus table are written back to the file. When
+ * the extended header's checksum matches, s->extended, s->sparse and
+ * s->min_cluster_alloc are taken from it. On return the in-memory table is
+ * in host byte order.
+ *
+ * Expects s->fd, s->cluster_bits and s->l2_bits to be set already.
+ *
+ * Returns 0 on success or a negative errno value; on failure s->l1_table
+ * is freed and set to NULL.
+ */
+static int
+tdqcow_load_l1_table(struct tdqcow_state *s, QCowHeader *header)
+{
+       char *buf;
+       struct stat st;
+       size_t expected, copy_len;
+       ssize_t got;
+       int i, err, shift;
+       QCowHeader_ext *exthdr;
+       uint32_t l1_table_bytes, l1_table_block, l1_table_size;
+
+       buf         = NULL;
+       s->l1_table = NULL;
+
+       shift = s->cluster_bits + s->l2_bits;
+
+       s->l1_size = (header->size + (1LL << shift) - 1) >> shift;
+       s->l1_table_offset = header->l1_table_offset;
+
+       s->min_cluster_alloc = 1; /* default */
+
+       l1_table_bytes = s->l1_size * sizeof(uint64_t);
+       l1_table_size  = (l1_table_bytes + 4095) & ~4095;
+       l1_table_block = (l1_table_bytes + s->l1_table_offset + 4095) & ~4095;
+
+       DPRINTF("L1 Table offset detected: %"PRIu64", size %d (%d)\n",
+               (uint64_t)s->l1_table_offset,
+               (int) (s->l1_size * sizeof(uint64_t)), 
+               l1_table_size);
+
+       /*
+        * The sizes above are 32 bit. Reject a header whose table end does
+        * not fit, otherwise l1_table_block is silently truncated and the
+        * copies below index past the end of buf.
+        */
+       if ((uint64_t)s->l1_table_offset > 0xffffffffULL ||
+           (uint64_t)s->l1_table_offset + l1_table_bytes + 4095 >
+           0xffffffffULL) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       err = fstat(s->fd, &st);
+       if (err) {
+               err = -errno;
+               goto out;
+       }
+
+       if (lseek(s->fd, 0, SEEK_SET) == (off_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       err = posix_memalign((void **)&buf, 512, l1_table_block);
+       if (err) {
+               /* posix_memalign returns a positive error number */
+               err = -err;
+               buf = NULL;
+               goto out;
+       }
+
+       err = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
+       if (err) {
+               err = -err;
+               s->l1_table = NULL;
+               goto out;
+       }
+
+       memset(buf, 0, l1_table_block);
+       memset(s->l1_table, 0, l1_table_size);
+
+       expected = l1_table_block;
+       if (st.st_size < l1_table_block)
+               expected = st.st_size;
+
+       errno = 0;
+       got = read(s->fd, buf, l1_table_block);
+       if (got < 0 || (size_t)got != expected) {
+               err = (errno ? -errno : -EIO);
+               goto out;
+       }
+
+       /*
+        * With an L1 offset that is not 4 KiB aligned, offset + l1_table_size
+        * extends past l1_table_block, so clamp the copy to what buf holds.
+        * The remainder of s->l1_table stays zero from the memset above.
+        */
+       copy_len = l1_table_block - s->l1_table_offset;
+       if (copy_len > l1_table_size)
+               copy_len = l1_table_size;
+       memcpy(s->l1_table, buf + s->l1_table_offset, copy_len);
+       exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
+
+       /* check for xen extended header */
+       if (s->l1_table_offset % 4096 == 0 &&
+           be32_to_cpu(exthdr->xmagic) == XEN_MAGIC) {
+               uint32_t flags = be32_to_cpu(exthdr->flags);
+               uint32_t cksum = be32_to_cpu(exthdr->cksum);
+
+               /*
+                * Try to detect old tapdisk images. They have to be fixed
+                * because they use big endian rather than native endian for
+                * the L1 table.  After this block, the l1 table will
+                * definitely be in BIG endian.
+                */
+               if (!(flags & EXTHDR_L1_BIG_ENDIAN)) {
+                       DPRINTF("qcow: converting to big endian L1 table\n");
+
+                       /* convert to big endian */
+                       for (i = 0; i < s->l1_size; i++)
+                               cpu_to_be64s(&s->l1_table[i]);
+
+                       flags |= EXTHDR_L1_BIG_ENDIAN;
+                       exthdr->flags = cpu_to_be32(flags);
+
+                       /* offset is 4 KiB aligned here, so this fits buf */
+                       memcpy(buf + s->l1_table_offset,
+                              s->l1_table, l1_table_size);
+
+                       if (lseek(s->fd, 0, SEEK_SET) == (off_t)-1) {
+                               err = -errno;
+                               goto out;
+                       }
+
+                       errno = 0;
+                       err = atomicio(vwrite, s->fd, buf, l1_table_block);
+                       if (err != l1_table_block) {
+                               /* never report success for a short write */
+                               err = (errno ? -errno : -EIO);
+                               goto out;
+                       }
+               }
+
+               /* check the L1 table checksum */
+               if (cksum != gen_cksum((char *)s->l1_table,
+                                      s->l1_size * sizeof(uint64_t)))
+                       DPRINTF("qcow: bad L1 checksum\n");
+               else {
+                       s->extended = 1;
+                       s->sparse = (be32_to_cpu(exthdr->flags) & SPARSE_FILE);
+                       s->min_cluster_alloc =
+                               be32_to_cpu(exthdr->min_cluster_alloc);
+               }
+       }
+
+       /* convert L1 table to native endian for operation */
+       for (i = 0; i < s->l1_size; i++)
+               be64_to_cpus(&s->l1_table[i]);
+
+       err = 0;
+
+out:
+       if (err) {
+               free(buf);
+               free(s->l1_table);
+               s->l1_table = NULL;
+       }
+       return err;
+}
+
+/*
+ * Open the disk file and initialize qcow state.
+ *
+ * driver: tapdisk driver; driver->data is the struct tdqcow_state to fill
+ *         and driver->info receives size and sector size.
+ * name:   path of the image, opened with O_DIRECT | O_LARGEFILE.
+ * flags:  TD_OPEN_RDONLY opens read-only, anything else read-write.
+ *
+ * Only QCOW_VERSION images are accepted; version 2 is rejected.
+ * Returns 0 on success, -1 on failure. On failure the file descriptor and
+ * everything allocated here (name, tables, caches) are released.
+ */
+int tdqcow_open (td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       int fd, i, ret, size, o_flags;
+       td_disk_info_t *bs = &(driver->info);
+       struct tdqcow_state   *s  = (struct tdqcow_state *)driver->data;
+       QCowHeader header;
+       uint64_t final_cluster = 0;
+
+       DPRINTF("QCOW: Opening %s\n", name);
+
+       o_flags = O_DIRECT | O_LARGEFILE | 
+               ((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+       fd = open(name, o_flags);
+       if (fd < 0) {
+               DPRINTF("Unable to open %s (%d)\n", name, -errno);
+               return -1;
+       }
+
+       /*
+        * The fail path frees these unconditionally, so give them a defined
+        * value before the first goto fail can be taken.
+        */
+       s->l1_table      = NULL;
+       s->l2_cache      = NULL;
+       s->cluster_cache = NULL;
+       s->cluster_data  = NULL;
+
+       s->fd = fd;
+       s->name = strdup(name);
+       if (!s->name)
+               goto fail;
+
+       if (tdqcow_read_header(fd, &header))
+               goto fail;
+
+       if (header.magic != QCOW_MAGIC)
+               goto fail;
+
+       switch (header.version) {
+       case QCOW_VERSION:
+               break;
+       case 2:
+         //TODO: Port qcow2 to new blktap framework.
+         //            close(fd);
+         //            dd->drv = &tapdisk_qcow2;
+         //            return dd->drv->td_open(dd, name, flags);
+         goto fail;
+       default:
+               goto fail;
+       }
+
+       /* NOTE(review): cluster_bits and l2_bits have no upper bound here,
+        * yet both are used as shift counts below - confirm whether a limit
+        * should be enforced for untrusted images. */
+       if (header.size <= 1 || header.cluster_bits < 9)
+               goto fail;
+       if (header.crypt_method > QCOW_CRYPT_AES)
+               goto fail;
+       s->crypt_method_header = header.crypt_method;
+       if (s->crypt_method_header)
+               s->encrypted = 1;
+       s->cluster_bits = header.cluster_bits;
+       s->cluster_size = 1 << s->cluster_bits;
+       s->cluster_sectors = 1 << (s->cluster_bits - 9);
+       s->l2_bits = header.l2_bits;
+       s->l2_size = 1 << s->l2_bits;
+       s->cluster_alloc = s->l2_size;
+       bs->size = header.size / 512;
+       s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
+       s->backing_file_offset = header.backing_file_offset;
+       s->backing_file_size   = header.backing_file_size;
+
+       /* allocate and load l1 table */
+       if (tdqcow_load_l1_table(s, &header))
+               goto fail;
+
+       /* alloc L2 cache */
+       size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
+       ret = posix_memalign((void **)&s->l2_cache, 4096, size);
+       if(ret != 0) goto fail;
+
+       size = s->cluster_size;
+       ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
+       if(ret != 0) goto fail;
+
+       ret = posix_memalign((void **)&s->cluster_data, 4096, size);
+       if(ret != 0) goto fail;
+       s->cluster_cache_offset = -1;
+
+       if (s->backing_file_offset != 0)
+               s->cluster_alloc = 1; /*Cannot use pre-alloc*/
+
+       bs->sector_size = 512;
+       bs->info = 0;
+
+       /* highest L1 entry; zero means no L2 table has been allocated yet */
+       for(i = 0; i < s->l1_size; i++)
+               if (s->l1_table[i] > final_cluster)
+                       final_cluster = s->l1_table[i];
+
+       if (init_aio_state(driver)!=0) {
+               DPRINTF("Unable to initialise AIO state\n");
+               /* the fail path releases the AIO state; releasing it here
+                * as well would do so twice */
+               goto fail;
+       }
+
+       if (!final_cluster)
+               s->fd_end = s->l1_table_offset +
+                       ((s->l1_size * sizeof(uint64_t) + 4095) & ~4095);
+       else {
+               s->fd_end = lseek(fd, 0, SEEK_END);
+               if (s->fd_end == (off_t)-1)
+                       goto fail;
+       }
+
+       return 0;
+
+fail:
+       DPRINTF("QCOW Open failed\n");
+
+       free_aio_state(s);
+       free(s->name);  /* strdup'ed above; was leaked on every failure */
+       s->name = NULL;
+       free(s->l1_table);
+       free(s->l2_cache);
+       free(s->cluster_cache);
+       free(s->cluster_data);
+       close(fd);
+       return -1;
+}
+
+/*
+ * Queue a read request, splitting it at cluster boundaries.
+ *
+ * For each piece the cluster offset is looked up without allocating:
+ *   - offset 0 (nothing stored in this image): the piece is passed to
+ *     td_forward_request(); if no remaining sector of the request has a
+ *     cluster either, the whole remainder is forwarded in one go;
+ *   - QCOW_OFLAG_COMPRESSED set: the cluster is decompressed into
+ *     s->cluster_cache, copied out and completed immediately;
+ *   - otherwise: an asynchronous read of the on-disk sectors is issued.
+ *
+ * The request is completed with -EBUSY when s->aio_free_count is 0 and
+ * with -EIO when decompression fails.
+ */
+void tdqcow_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       struct tdqcow_state   *s  = (struct tdqcow_state *)driver->data;
+       int ret = 0, index_in_cluster, n, i;
+       uint64_t cluster_offset, sector, nb_sectors;
+       struct qcow_prv* prv;
+       td_request_t clone = treq;
+       char* buf = treq.buf;
+
+       sector     = treq.sec;
+       nb_sectors = treq.secs;
+
+       /*We store a local record of the request*/
+       while (nb_sectors > 0) {
+               cluster_offset = 
+                       get_cluster_offset(s, sector << 9, 0, 0, 0, 0);
+               /* n = sectors left in this cluster, capped at the request */
+               index_in_cluster = sector & (s->cluster_sectors - 1);
+               n = s->cluster_sectors - index_in_cluster;
+               if (n > nb_sectors)
+                       n = nb_sectors;
+
+               /* NOTE(review): treq may already have been narrowed by an
+                * earlier iteration when this fires - confirm that completing
+                * it in that state is intended. */
+               if (s->aio_free_count == 0) {
+                       td_complete_request(treq, -EBUSY);
+                       return;
+               }
+               
+               if(!cluster_offset) {
+            int i;
+            /* Forward entire request if possible. */
+            for(i=0; i<nb_sectors; i++)
+                if(get_cluster_offset(s, (sector+i) << 9, 0, 0, 0, 0))
+                    goto coalesce_failed;
+            treq.buf  = buf;
+            treq.sec  = sector;
+            treq.secs = nb_sectors;
+                       td_forward_request(treq);
+            return;
+coalesce_failed:            
+                       /* a later sector has a cluster here: forward only
+                        * the piece inside the current cluster */
+                       treq.buf  = buf;
+                       treq.sec  = sector;
+                       treq.secs = n;
+                       td_forward_request(treq);
+
+               } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+                       if (decompress_cluster(s, cluster_offset) < 0) {
+                               td_complete_request(treq, -EIO);
+                               goto done;
+                       }
+                       /* served synchronously from the decompressed cache */
+                       memcpy(buf, s->cluster_cache + index_in_cluster * 512, 
+                              512 * n);
+                       
+                       treq.buf  = buf;
+                       treq.sec  = sector;
+                       treq.secs = n;
+                       td_complete_request(treq, 0);
+               } else {
+                 /* clone.sec is a sector position within the image file,
+                  * not a virtual disk sector */
+                 clone.buf  = buf;
+                 clone.sec  = (cluster_offset>>9)+index_in_cluster;
+                 clone.secs = n;
+                 async_read(driver, clone);
+               }
+               nb_sectors -= n;
+               sector += n;
+               buf += n * 512;
+       }
+done:
+       return;
+}
+
+/*
+ * Queue a write request, splitting it at cluster boundaries.
+ *
+ * Every piece gets a cluster via get_cluster_offset() with allocate == 1
+ * and is then submitted with async_write(). The request is completed with
+ * -EBUSY when s->aio_free_count is 0 and with -EIO when no cluster offset
+ * could be obtained. After all pieces have been submitted the compressed
+ * cluster cache is invalidated.
+ */
+void tdqcow_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+       td_request_t piece = treq;
+       uint64_t where, cur, left;
+       char *data = treq.buf;
+       int first, count;
+
+       cur  = treq.sec;
+       left = treq.secs;
+
+       /*We store a local record of the request*/
+       for (; left > 0; left -= count, cur += count, data += count * 512) {
+               /* sectors from cur to the end of its cluster, capped */
+               first = cur & (s->cluster_sectors - 1);
+               count = s->cluster_sectors - first;
+               if (count > left)
+                       count = left;
+
+               if (s->aio_free_count == 0) {
+                       td_complete_request(treq, -EBUSY);
+                       return;
+               }
+
+               where = get_cluster_offset(s, cur << 9, 1, 0,
+                                          first, first + count);
+               if (!where) {
+                       DPRINTF("Ooops, no write cluster offset!\n");
+                       td_complete_request(treq, -EIO);
+                       return;
+               }
+
+               /* NOTE(review): the encrypted sectors land in
+                * s->cluster_data while the queued write below still points
+                * at the caller's buffer - confirm this is intended. */
+               if (s->crypt_method)
+                       encrypt_sectors(s, cur, s->cluster_data,
+                                       (unsigned char *)data, count, 1,
+                                       &s->aes_encrypt_key);
+
+               piece.buf  = data;
+               piece.sec  = (where >> 9) + first;
+               piece.secs = count;
+               async_write(driver, piece);
+       }
+
+       s->cluster_cache_offset = -1; /* disable compressed cache */
+}
+
+/*
+ * Recompute the checksum of the L1 table and store it in the cksum field
+ * of the extended header. Does nothing for images without the extended
+ * header (s->extended == 0).
+ *
+ * The image is reopened by name without O_DIRECT for the small write. The
+ * in-memory table is switched to big endian for the checksum and switched
+ * back afterwards.
+ *
+ * Returns 0 on success, otherwise the errno value of the failing call.
+ */
+static int
+tdqcow_update_checksum(struct tdqcow_state *s)
+{
+       int idx, fd, err;
+       uint32_t pos, sum, be_sum;
+
+       if (!s->extended)
+               return 0;
+
+       fd = open(s->name, O_WRONLY | O_LARGEFILE); /* open without O_DIRECT */
+       if (fd == -1) {
+               err = errno;
+               goto done;
+       }
+
+       /* the extended header sits right behind the QCOW header */
+       pos = sizeof(QCowHeader) + offsetof(QCowHeader_ext, cksum);
+       if (lseek(fd, pos, SEEK_SET) == (off_t)-1) {
+               err = errno;
+               goto done;
+       }
+
+       /* convert to big endian for checksum */
+       for (idx = 0; idx < s->l1_size; idx++)
+               cpu_to_be64s(&s->l1_table[idx]);
+
+       sum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t));
+
+       /* and back again... */
+       for (idx = 0; idx < s->l1_size; idx++)
+               be64_to_cpus(&s->l1_table[idx]);
+
+       DPRINTF("Writing cksum: %d", sum);
+
+       be_sum = cpu_to_be32(sum);
+       if (write(fd, &be_sum, sizeof(be_sum)) != sizeof(be_sum)) {
+               err = errno;
+               goto done;
+       }
+
+       err = 0;
+
+done:
+       if (err)
+               DPRINTF("failed to update checksum: %d\n", err);
+       if (fd != -1)
+               close(fd);
+       return err;
+}
+               
+/*
+ * Tear down a qcow driver instance: refresh the checksum in the extended
+ * header, then release the AIO state, the name, the tables and caches, and
+ * finally the file descriptor. The checksum result is not examined.
+ * Always returns 0.
+ */
+int tdqcow_close(td_driver_t *driver)
+{
+       struct tdqcow_state *state = (struct tdqcow_state *)driver->data;
+
+       /* needs state->name and state->l1_table, so it runs before the frees */
+       tdqcow_update_checksum(state);
+
+       free_aio_state(state);
+
+       free(state->cluster_data);
+       free(state->cluster_cache);
+       free(state->l2_cache);
+       free(state->l1_table);
+       free(state->name);
+
+       close(state->fd);
+       return 0;
+}
+
+/*
+ * Create a new qcow image with a Xen extended header.
+ *
+ * filename:     image to create (truncated if it already exists).
+ * total_size:   requested size; shifted right by SECTOR_SHIFT to obtain the
+ *               sector count used unless a backing file determines it.
+ * backing_file: optional backing image. "fat:" is treated as no backing
+ *               file; a name with a ':' at index 2 or later is stored
+ *               verbatim, anything else is resolved with realpath(). The
+ *               image size is then taken from get_filesize().
+ * sparse:       0 extends the file to its full length up front, any other
+ *               value records SPARSE_FILE in the extended header instead.
+ *
+ * Returns 0 on success, -1 on failure. The file descriptor is closed on
+ * every path.
+ */
+int qcow_create(const char *filename, uint64_t total_size,
+               const char *backing_file, int sparse)
+{
+       int fd, header_size, backing_filename_len, l1_size, i;
+       int shift, length, flags = 0, rc = -1;
+       QCowHeader header;
+       QCowHeader_ext exthdr;
+       char backing_filename[PATH_MAX], *ptr;
+       uint64_t tmp, size, total_length;
+       struct stat st;
+
+       DPRINTF("Qcow_create: size %"PRIu64"\n",total_size);
+
+       fd = open(filename, 
+                 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+                 0644);
+       if (fd < 0)
+               return -1;
+
+       /*
+        * st is only filled on some backing-file paths but st_mtime is read
+        * on all of them, and exthdr goes to disk in full: neither may
+        * carry stack garbage.
+        */
+       memset(&st, 0, sizeof(st));
+       memset(&header, 0, sizeof(header));
+       memset(&exthdr, 0, sizeof(exthdr));
+
+       header.magic = cpu_to_be32(QCOW_MAGIC);
+       header.version = cpu_to_be32(QCOW_VERSION);
+
+       /*Create extended header fields*/
+       exthdr.xmagic = cpu_to_be32(XEN_MAGIC);
+
+       header_size = sizeof(header) + sizeof(QCowHeader_ext);
+       backing_filename_len = 0;
+       size = (total_size >> SECTOR_SHIFT);
+       if (backing_file) {
+               if (strcmp(backing_file, "fat:")) {
+                       const char *p;
+                       /* XXX: this is a hack: we do not attempt to 
+                        *check for URL like syntax */
+                       p = strchr(backing_file, ':');
+                       if (p && (p - backing_file) >= 2) {
+                               /* URL like but exclude "c:" like filenames */
+                               int copied;
+
+                               /* snprintf always terminates; refuse names
+                                * that do not fit instead of truncating */
+                               copied = snprintf(backing_filename,
+                                                 sizeof(backing_filename),
+                                                 "%s", backing_file);
+                               if (copied < 0 ||
+                                   (size_t)copied >= sizeof(backing_filename))
+                                       goto out;
+                       } else {
+                               if (realpath(backing_file, backing_filename) == NULL ||
+                                   stat(backing_filename, &st) != 0)
+                                       goto out;
+                       }
+                       header.backing_file_offset = cpu_to_be64(header_size);
+                       backing_filename_len = strlen(backing_filename);
+                       header.backing_file_size = cpu_to_be32(
+                               backing_filename_len);
+                       header_size += backing_filename_len;
+
+                       /*Set to the backing file size*/
+                       if(get_filesize(backing_filename, &size, &st))
+                               goto out;
+                       DPRINTF("Backing file size detected: %"PRId64" sectors" 
+                               "(total %"PRId64" [%"PRId64" MB])\n", 
+                               size, 
+                               (uint64_t)(size << SECTOR_SHIFT), 
+                               (uint64_t)(size >> 11));
+               } else {
+                       backing_file = NULL;
+                       DPRINTF("Setting file size: %"PRId64" (total %"PRId64")\n", 
+                               total_size, 
+                               (uint64_t) (total_size << SECTOR_SHIFT));
+               }
+               header.mtime = cpu_to_be32(st.st_mtime);
+               header.cluster_bits = 9; /* 512 byte cluster to avoid copying
+                                           unmodifyed sectors */
+               header.l2_bits = 12; /* 32 KB L2 tables */
+               exthdr.min_cluster_alloc = cpu_to_be32(1);
+       } else {
+               DPRINTF("Setting file size: %"PRId64" sectors" 
+                       "(total %"PRId64" [%"PRId64" MB])\n", 
+                       size, 
+                       (uint64_t) (size << SECTOR_SHIFT), 
+                       (uint64_t) (size >> 11));
+               header.cluster_bits = 12; /* 4 KB clusters */
+               header.l2_bits = 9; /* 4 KB L2 tables */
+               exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
+       }
+       /*Set the header size value*/
+       header.size = cpu_to_be64(size * 512);
+
+       /* the L1 table starts on the next 4 KiB boundary after the headers */
+       header_size = (header_size + 7) & ~7;
+       if (header_size % 4096 > 0) {
+               header_size = ((header_size >> 12) + 1) << 12;
+       }
+
+       shift = header.cluster_bits + header.l2_bits;
+       l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;
+
+       header.l1_table_offset = cpu_to_be64(header_size);
+       DPRINTF("L1 Table offset: %d, size %d\n",
+               header_size,
+               (int)(l1_size * sizeof(uint64_t)));
+       header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+
+       /* checksum of an all-zero L1 table */
+       ptr = calloc(1, l1_size * sizeof(uint64_t));
+       if (ptr == NULL && l1_size != 0)
+               goto out;
+       exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
+       printf("Created cksum: %d\n",exthdr.cksum);
+       free(ptr);
+
+       /*adjust file length to system page size boundary*/
+       length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)),
+               getpagesize());
+       if (qtruncate(fd, length, 0)!=0) {
+               DPRINTF("ERROR truncating file\n");
+               goto out;
+       }
+
+       if (sparse == 0) {
+               /*Filesize is length+l1_size*(1 << s->l2_bits)+(size*512)*/
+               total_length = length + (l1_size * (1 << 9)) + (size * 512);
+               if (qtruncate(fd, total_length, 0)!=0) {
+                       DPRINTF("ERROR truncating file\n");
+                       goto out;
+               }
+               printf("File truncated to length %"PRIu64"\n",total_length);
+       } else
+               flags = SPARSE_FILE;
+
+       flags |= EXTHDR_L1_BIG_ENDIAN;
+       exthdr.flags = cpu_to_be32(flags);
+
+       /* write all the data; a short or failed write is an error */
+       if (lseek(fd, 0, SEEK_SET) == (off_t)-1)
+               goto out;
+       if (write(fd, &header, sizeof(header)) != (ssize_t)sizeof(header))
+               goto out;
+       if (write(fd, &exthdr, sizeof(exthdr)) != (ssize_t)sizeof(exthdr))
+               goto out;
+       if (backing_file &&
+           write(fd, backing_filename, backing_filename_len) !=
+           backing_filename_len)
+               goto out;
+
+       if (lseek(fd, header_size, SEEK_SET) == (off_t)-1)
+               goto out;
+       tmp = 0;
+       for (i = 0;i < l1_size; i++) {
+               if (write(fd, &tmp, sizeof(tmp)) != (ssize_t)sizeof(tmp))
+                       goto out;
+       }
+
+       rc = 0;
+
+out:
+       close(fd);
+       return rc;
+}
+
+/*
+ * Reset the image to an empty state: zero the in-memory L1 table, write it
+ * at s->l1_table_offset, truncate the file right behind it and clear the
+ * L2 cache together with its offset and hit-count arrays.
+ * Returns 0 on success, -1 when the write or the truncate fails.
+ */
+static int qcow_make_empty(struct tdqcow_state *s)
+{
+       uint32_t table_bytes = s->l1_size * sizeof(uint64_t);
+
+       memset(s->l1_table, 0, table_bytes);
+
+       lseek(s->fd, s->l1_table_offset, SEEK_SET);
+       if (write(s->fd, s->l1_table, table_bytes) < 0)
+               return -1;
+
+       /* everything behind the L1 table is dropped */
+       if (qtruncate(s->fd, s->l1_table_offset + table_bytes,
+                     s->sparse) != 0) {
+               DPRINTF("ERROR truncating file\n");
+               return -1;
+       }
+
+       /* forget every cached L2 table */
+       memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
+       memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
+       memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+
+       return 0;
+}
+
+/* Accessor: report the cluster size recorded in the driver state. */
+static int qcow_get_cluster_size(struct tdqcow_state *s)
+{
+       int cluster_size = s->cluster_size;
+
+       return cluster_size;
+}
+
+/*
+ * Deflate one cluster taken from buf and, when the result is strictly
+ * smaller than a cluster, write it at the offset handed out by
+ * get_cluster_offset(). When the data does not compress, nothing is written
+ * (the uncompressed fallback write is not implemented here) and 0 is still
+ * returned.
+ *
+ * Returns 0 on success, -1 on allocation, zlib or write failure.
+ *
+ * XXX: put compressed sectors first, then all the cluster aligned
+ * tables to avoid losing bytes in alignment
+ */
+static int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num, 
+                          const uint8_t *buf)
+{
+       z_stream zs;
+       uint8_t *packed;
+       uint64_t where;
+       int zret, packed_len;
+       int rc = -1;
+
+       packed = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+       if (!packed)
+               return -1;
+
+       /* best compression, small window, no zlib header */
+       memset(&zs, 0, sizeof(zs));
+       zret = deflateInit2(&zs, Z_DEFAULT_COMPRESSION,
+                           Z_DEFLATED, -12,
+                           9, Z_DEFAULT_STRATEGY);
+       if (zret != 0)
+               goto out;
+
+       zs.avail_in  = s->cluster_size;
+       zs.next_in   = (uint8_t *)buf;
+       zs.avail_out = s->cluster_size;
+       zs.next_out  = packed;
+
+       zret = deflate(&zs, Z_FINISH);
+       packed_len = zs.next_out - packed;
+       deflateEnd(&zs);
+
+       if (zret != Z_STREAM_END && zret != Z_OK)
+               goto out;
+
+       /* only store the cluster when compression finished and saved space */
+       if (zret == Z_STREAM_END && packed_len < s->cluster_size) {
+               where = get_cluster_offset(s, sector_num << 9, 2,
+                                          packed_len, 0, 0);
+               where &= s->cluster_offset_mask;
+               lseek(s->fd, where, SEEK_SET);
+               if (write(s->fd, packed, packed_len) != packed_len)
+                       goto out;
+       }
+
+       rc = 0;
+out:
+       free(packed);
+       return rc;
+}
+
+/*
+ * Probe the image type of 'file' by reading a QCowHeader from its start.
+ *
+ * On success returns 0 and stores DISK_TYPE_QCOW in *type when the header
+ * magic matches QCOW_MAGIC, DISK_TYPE_AIO otherwise.
+ * On failure returns a negative errno value: -errno from open()/read(), or
+ * -EIO when the file is shorter than a header.
+ */
+static int
+tdqcow_get_image_type(const char *file, int *type)
+{
+       int fd, err;
+       ssize_t size;
+       QCowHeader header;
+
+       fd = open(file, O_RDONLY);
+       if (fd == -1)
+               return -errno;
+
+       size = read(fd, &header, sizeof(header));
+       /* errno is only meaningful when read() failed, and must be captured
+        * before close() gets a chance to overwrite it */
+       err = (size < 0) ? errno : EIO;
+       close(fd);
+       if (size != (ssize_t)sizeof(header))
+               return -err;
+
+       be32_to_cpus(&header.magic);
+       if (header.magic == QCOW_MAGIC)
+               *type = DISK_TYPE_QCOW;
+       else
+               *type = DISK_TYPE_AIO;
+
+       return 0;
+}
+
+/*
+ * Look up the backing (parent) image of a qcow disk.
+ *
+ * Reads the backing file name stored in the child image at
+ * backing_file_offset, probes that file's type and fills in *id.
+ *
+ * Returns TD_NO_PARENT when the image has no backing file, 0 on success
+ * (id->name is a strdup()ed string), -1 if the read buffer cannot be
+ * allocated, -ENOMEM if the name cannot be duplicated, and -EINVAL on any
+ * other failure.
+ */
+int tdqcow_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       off_t off;
+       char *buf, *filename;
+       int len, secs, type = 0, err = -EINVAL;
+       struct tdqcow_state *child  = (struct tdqcow_state *)driver->data;
+
+       if (!child->backing_file_offset)
+               return TD_NO_PARENT;
+
+       /* read the backing file name */
+       len  = child->backing_file_size;
+       /* reads are done on whole 512-byte sectors starting at 'off' */
+       off  = child->backing_file_offset - (child->backing_file_offset % 512);
+       secs = (len + (child->backing_file_offset - off) + 511) >> 9;
+
+       /* one spare byte: when the name ends exactly on a sector boundary the
+        * terminating NUL lands just past the last sector read */
+       if (posix_memalign((void **)&buf, 512, (secs << 9) + 1)) 
+               return -1;
+
+       if (lseek(child->fd, off, SEEK_SET) == (off_t)-1)
+               goto out;
+
+       if (read(child->fd, buf, secs << 9) != secs << 9)
+               goto out;
+       filename       = buf + (child->backing_file_offset - off);
+       filename[len]  = '\0';
+
+       if (tdqcow_get_image_type(filename, &type))
+               goto out;
+
+       id->name       = strdup(filename);
+       if (!id->name) {
+               /* do not report success with a NULL parent name */
+               err = -ENOMEM;
+               goto out;
+       }
+       id->drivertype = type;
+       err            = 0;
+ out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Check that a parent image may back this child: both image files must be
+ * stat()able and get_filesize() must report the same size for each.
+ * 'flags' is not consulted.
+ *
+ * Returns 0 when the sizes match, -EINVAL otherwise.
+ */
+int tdqcow_validate_parent(td_driver_t *driver,
+                         td_driver_t *pdriver, td_flag_t flags)
+{
+       struct tdqcow_state *child  = (struct tdqcow_state *)driver->data;
+       struct tdqcow_state *parent = (struct tdqcow_state *)pdriver->data;
+       uint64_t parent_size, child_size;
+       struct stat st;
+
+       if (stat(parent->name, &st) ||
+           get_filesize(parent->name, &parent_size, &st))
+               return -EINVAL;
+
+       if (stat(child->name, &st) ||
+           get_filesize(child->name, &child_size, &st))
+               return -EINVAL;
+
+       return (child_size == parent_size) ? 0 : -EINVAL;
+}
+
+/*
+ * Driver descriptor for the qcow disk type: names the driver, gives the size
+ * of its per-disk private state (struct tdqcow_state) and wires up the
+ * tdqcow_* entry points. No debug hook is provided.
+ */
+struct tap_disk tapdisk_qcow = {
+       .disk_type           = "tapdisk_qcow",
+       .flags              = 0,
+       .private_data_size   = sizeof(struct tdqcow_state),
+       .td_open             = tdqcow_open,
+       .td_close            = tdqcow_close,
+       .td_queue_read       = tdqcow_queue_read,
+       .td_queue_write      = tdqcow_queue_write,
+       .td_get_parent_id    = tdqcow_get_parent_id,
+       .td_validate_parent  = tdqcow_validate_parent,
+       .td_debug           = NULL,
+};
diff --git a/tools/blktap2/drivers/block-ram.c b/tools/blktap2/drivers/block-ram.c
new file mode 100644 (file)
index 0000000..a859481
--- /dev/null
@@ -0,0 +1,256 @@
+/* 
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+
+#include "blk.h"
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/*
+ * File-scope state shared by every open of the ram disk:
+ *  img              - in-memory copy of the image, filled by tdram_open()
+ *  disksector_size,
+ *  disksize,
+ *  diskinfo         - geometry cached by get_image_info() and handed back
+ *                     to second and later opens
+ *  connections      - number of opens currently outstanding
+ */
+char *img;
+long int   disksector_size;
+long int   disksize;
+long int   diskinfo;
+static int connections = 0;
+
+/* Per-open private state: the image file descriptor, or -1 for opens that
+ * reuse the already loaded image (see tdram_open()). */
+struct tdram_state {
+        int fd;
+};
+
+/*
+ * Determine the image size (in sectors) and sector size for fd and store
+ * them in *info; info->info is set to 0. Block devices are queried through
+ * blk_getimagesize()/blk_getsectorsize(), anything else through fstat().
+ * A zero size is replaced by MAX_RAMDISK_SIZE. The results are also cached
+ * in the file-scope disksector_size/disksize/diskinfo variables.
+ *
+ * Returns 0 on success, -EINVAL if the image cannot be examined.
+ */
+static int get_image_info(int fd, td_disk_info_t *info)
+{
+       struct stat st;
+
+       if (fstat(fd, &st) != 0) {
+               DPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (!S_ISBLK(st.st_mode)) {
+               /* regular file: size comes straight from fstat */
+               info->size = (st.st_size >> SECTOR_SHIFT);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+       } else {
+               /* block device: ask the device itself */
+               info->size = 0;
+               if (blk_getimagesize(fd, &info->size) != 0)
+                       return -EINVAL;
+
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+
+               /* fall back to the default when the device will not say */
+               if (blk_getsectorsize(fd, &info->sector_size) != 0)
+                       info->sector_size = DEFAULT_SECTOR_SIZE;
+       }
+
+       if (info->size == 0) {
+               info->size = ((uint64_t) MAX_RAMDISK_SIZE);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+       }
+       info->info = 0;
+
+       /* remember the geometry for later opens of the same image */
+       disksector_size = info->sector_size;
+       disksize        = info->size;
+       diskinfo        = info->info;
+       DPRINTF("Image sector_size: \n\t[%"PRIu64"]\n",
+               info->sector_size);
+
+       return 0;
+}
+
+/*
+ * Open the disk file and initialize ram state.
+ *
+ * The first open reads the whole image into the file-scope 'img' buffer;
+ * later opens only copy the cached geometry into driver->info and set
+ * prv->fd to -1.
+ *
+ * Returns 0 on success or a negative value on failure. On failure every
+ * resource acquired here is released and the connection count is restored,
+ * so that a later open starts from scratch instead of treating a half
+ * initialised image as already loaded.
+ */
+int tdram_open (td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       char *p;
+       void *mem = NULL;
+       uint64_t size, total;
+       int i, fd = -1, ret = 0, count = 0, o_flags;
+       struct tdram_state *prv = (struct tdram_state *)driver->data;
+
+       connections++;
+
+       if (connections > 1) {
+               driver->info.sector_size = disksector_size;
+               driver->info.size        = disksize;
+               driver->info.info        = diskinfo; 
+               DPRINTF("Image already open, returning parameters:\n");
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(driver->info.size << SECTOR_SHIFT),
+                       (long long unsigned)driver->info.size);
+               DPRINTF("Image sector_size: \n\t[%"PRIu64"]\n",
+                       driver->info.sector_size);
+
+               prv->fd = -1;
+               return 0;
+       }
+
+       /* Open the file */
+       o_flags = O_DIRECT | O_LARGEFILE | 
+               ((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+       fd = open(name, o_flags);
+
+       if ((fd == -1) && (errno == EINVAL)) {
+               /* Maybe O_DIRECT isn't supported. */
+               o_flags &= ~O_DIRECT;
+               fd = open(name, o_flags);
+               if (fd != -1) {
+                       DPRINTF("WARNING: Accessing image without"
+                               "O_DIRECT! (%s)\n", name);
+               }
+       } else if (fd != -1) {
+               DPRINTF("open(%s) with O_DIRECT\n", name);
+       }
+
+       if (fd == -1) {
+               /* save errno before logging can disturb it */
+               ret = 0 - errno;
+               DPRINTF("Unable to open [%s]!\n",name);
+               goto fail;
+       }
+
+       prv->fd = fd;
+
+       /* without valid geometry the size checks below are meaningless */
+       ret = get_image_info(fd, &driver->info);
+       if (ret)
+               goto fail;
+
+       size = MAX_RAMDISK_SIZE;
+       if (driver->info.size > size) {
+               DPRINTF("Disk exceeds limit, must be less than [%d]MB",
+                       (MAX_RAMDISK_SIZE<<SECTOR_SHIFT)>>20);
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       /*Read the image into memory*/
+       total = driver->info.size << SECTOR_SHIFT;
+       /* posix_memalign() reports failure through its return value and
+        * does not set errno */
+       ret = posix_memalign(&mem, DEFAULT_SECTOR_SIZE, total);
+       if (ret) {
+               DPRINTF("Mem malloc failed\n");
+               mem = NULL;
+               ret = -ret;
+               goto fail;
+       }
+       img = mem;
+       p = img;
+       DPRINTF("Reading %llu bytes.......",
+               (long long unsigned)driver->info.size << SECTOR_SHIFT);
+
+       for (i = 0; i < driver->info.size; i++) {
+               /* the buffer holds size << SECTOR_SHIFT bytes; stop before a
+                * sector_size larger than that unit could overrun it */
+               if ((uint64_t)count + driver->info.sector_size > total)
+                       break;
+               ret = read(prv->fd, p, driver->info.sector_size);
+               if (ret != driver->info.sector_size) {
+                       DPRINTF("ret = %d, errno = %d\n", ret, errno);
+                       break;
+               }
+               count += ret;
+               p = img + count;
+       }
+       DPRINTF("[%d]\n",count);
+       if (count != total) {
+               ret = -1;
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       if (mem) {
+               free(mem);
+               img = NULL;
+       }
+       if (fd != -1)
+               close(fd);
+       prv->fd = -1;
+       /* this open did not happen as far as later callers are concerned */
+       connections--;
+       return ret;
+}
+
+/*
+ * Serve a read straight from the in-memory image: copy treq.secs sectors
+ * starting at sector treq.sec into treq.buf and complete the request with
+ * status 0. No range checking is done here.
+ */
+void tdram_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       uint64_t byte_off = (uint64_t)driver->info.sector_size * treq.sec;
+       int      nbytes   = treq.secs * driver->info.sector_size;
+       char    *src      = img + byte_off;
+
+       memcpy(treq.buf, src, nbytes);
+
+       td_complete_request(treq, 0);
+}
+
+/*
+ * Apply a write to the in-memory image: copy treq.secs sectors from
+ * treq.buf to sector treq.sec and complete the request with status 0.
+ * No range checking is done here.
+ */
+void tdram_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       uint64_t byte_off = (uint64_t)driver->info.sector_size * treq.sec;
+       int      nbytes   = treq.secs * driver->info.sector_size;
+       char    *dst      = img + byte_off;
+
+       /* We assume that write access is controlled
+        * at a higher level for multiple disks */
+       memcpy(dst, treq.buf, nbytes);
+
+       td_complete_request(treq, 0);
+}
+
+/*
+ * Drop one reference on the shared ram image. Only the connection count is
+ * updated; the image buffer and file descriptor are left untouched.
+ * Always returns 0.
+ */
+int tdram_close(td_driver_t *driver)
+{
+       (void)driver;
+
+       --connections;
+
+       return 0;
+}
+
+/* Ram disks never have a backing image: always reports TD_NO_PARENT. */
+int tdram_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       (void)driver;
+       (void)id;
+
+       return TD_NO_PARENT;
+}
+
+/* No image is an acceptable parent for a ram disk: always -EINVAL. */
+int tdram_validate_parent(td_driver_t *driver,
+                         td_driver_t *pdriver, td_flag_t flags)
+{
+       (void)driver;
+       (void)pdriver;
+       (void)flags;
+
+       return -EINVAL;
+}
+
+/*
+ * Driver descriptor for the ram disk type: names the driver, gives the size
+ * of its per-disk private state (struct tdram_state) and wires up the
+ * tdram_* entry points. No debug hook is provided.
+ */
+struct tap_disk tapdisk_ram = {
+       .disk_type          = "tapdisk_ram",
+       .flags              = 0,
+       .private_data_size  = sizeof(struct tdram_state),
+       .td_open            = tdram_open,
+       .td_close           = tdram_close,
+       .td_queue_read      = tdram_queue_read,
+       .td_queue_write     = tdram_queue_write,
+       .td_get_parent_id   = tdram_get_parent_id,
+       .td_validate_parent = tdram_validate_parent,
+       .td_debug           = NULL,
+};
diff --git a/tools/blktap2/drivers/block-remus.c b/tools/blktap2/drivers/block-remus.c
new file mode 100644 (file)
index 0000000..079588d
--- /dev/null
@@ -0,0 +1,1733 @@
+/* block-remus.c
+ *
+ * This disk sends all writes to a backup via a network interface before
+ * passing them to an underlying device.
+ * The backup is a bit more complicated:
+ *  1. It applies all incoming writes to a ramdisk.
+ *  2. When a checkpoint request arrives, it moves the ramdisk to
+ *     a committing state and uses a new ramdisk for subsequent writes.
+ *     It also acknowledges the request, to let the sender know it can
+ *     release output.
+ *  3. The ramdisk flushes its contents to the underlying driver.
+ *  4. At failover, the backup waits for the in-flight ramdisk (if any) to
+ *     drain before letting the domain be activated.
+ *
+ * The driver determines whether it is the client or server by attempting
+ * to bind to the replication address. If the address is not local,
+ * the driver acts as client.
+ *
+ * The following messages are defined for the replication stream:
+ * 1. write request
+ *    "wreq"      4
+ *    num_sectors 4
+ *    sector      8
+ *    buffer      (num_sectors * sector_size)
+ * 2. submit request (may be used as a barrier)
+ *    "sreq"      4
+ * 3. commit request
+ *    "creq"      4
+ * After a commit request, the client must wait for a completion message:
+ * 4. completion
+ *    "done"      4
+ */
+
+/* due to architectural choices in tapdisk, block-buffer is forced to
+ * reimplement some code which is meant to be private */
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "hashtable.h"
+#include "hashtable_itr.h"
+#include "hashtable_utility.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+/* timeout for reads and writes in ms */
+#define HEARTBEAT_MS 1000
+/* initial size passed to create_hashtable() for the ramdisk hash tables */
+#define RAMDISK_HASHSIZE 128
+
+/* connect retry timeout (seconds) */
+#define REMUS_CONNRETRY_TIMEOUT 10
+
+/* printf-style debug logging to syslog at LOG_DEBUG, prefixed "remus: " */
+#define RPRINTF(_f, _a...) syslog (LOG_DEBUG, "remus: " _f, ## _a)
+
+/*
+ * Operating mode of the driver, stored in struct tdremus_state.mode.
+ * NOTE(review): presumably mode_primary is the client side and mode_backup
+ * the server side described in the file header, with mode_unprotected
+ * meaning no replication -- confirm against switch_mode().
+ */
+enum tdremus_mode {
+       mode_invalid = 0,
+       mode_unprotected,
+       mode_primary,
+       mode_backup
+};
+
+/* One entry of the request ring: start sector, sector count and an inline
+ * 4096-byte data buffer. */
+struct tdremus_req {
+       uint64_t sector;
+       int nb_sectors;
+       char buf[4096];
+};
+
+/* Fixed-size circular buffer of requests indexed by head and tail; see
+ * ring_next(), ring_isempty() and ring_isfull(). */
+struct req_ring {
+       /* waste one slot to distinguish between empty and full */
+       struct tdremus_req requests[MAX_REQUESTS * 2 + 1];
+       unsigned int head;
+       unsigned int tail;
+};
+
+/* TODO: This isn't very pretty, but to properly generate our own treqs (needed
+ * by the backup) we need to know our td_vbd_t and td_image_t (blktap2
+ * internals). As a proper fix, we should consider extending the tapdisk
+ * interface with a td_create_request() function, or something similar.
+ *
+ * For now, we just grab the vbd in the td_open() command, and the td_image_t
+ * from the first read request.
+ */
+td_vbd_t *device_vbd = NULL;
+td_image_t *remus_image = NULL;
+/* tentative definition of the driver descriptor so it can be referenced
+ * before it is initialised further down the file */
+struct tap_disk tapdisk_remus;
+
+/* In-memory write buffer. Each hash table is keyed by sector number
+ * (uint64_t). */
+struct ramdisk {
+       size_t sector_size;
+       /* sectors written since the last flush was started */
+       struct hashtable* h;
+       /* when a ramdisk is flushed, h is given a new empty hash for writes
+        * while the old ramdisk (prev) is drained asynchronously.
+        */
+       struct hashtable* prev;
+       /* count of outstanding requests to the base driver */
+       size_t inflight;
+       /* prev holds the requests to be flushed, while inprogress holds
+        * requests being flushed. When requests complete, they are removed
+        * from inprogress.
+        * Whenever a new flush is merged with an ongoing flush (i.e., prev),
+        * we have to make sure that none of the new requests overlap with
+        * ones in "inprogress". If one does, keep it back in prev and don't
+        * issue IO until the current one finishes. If we allowed this IO to
+        * proceed, we might end up with two "overlapping" requests in the
+        * disk's queue and the disk may not offer any guarantee on which one
+        * is written first.
+        * IOW, make sure we don't create a write-after-write time ordering
+        * constraint.
+        */
+       struct hashtable* inprogress;
+};
+
+/* the ramdisk intercepts the original callback for reads and writes.
+ * This holds the original data. */
+/* Might be worth making this a static array in struct ramdisk to avoid
+ * a malloc per request */
+
+/* forward declaration: the callback data below points back at the state */
+struct tdremus_state;
+
+/* saved callback, its private pointer, the data buffer and owning state */
+struct ramdisk_cbdata {
+       td_callback_t cb;
+       void* private;
+       char* buf;
+       struct tdremus_state* state;
+};
+
+/* callback data for writes: owning state and the data buffer */
+struct ramdisk_write_cbdata {
+       struct tdremus_state* state;
+       char* buf;
+};
+
+/* signature shared by the per-mode queue_read/queue_write handlers */
+typedef void (*queue_rw_t) (td_driver_t *driver, td_request_t treq);
+
+/* poll_fd type for blktap2 fd system. taken from block_log.c */
+/* pairs a file descriptor with its event id */
+typedef struct poll_fd {
+       int        fd;
+       event_id_t id;
+} poll_fd_t;
+
+/* Per-disk private state of the remus driver. */
+struct tdremus_state {
+//  struct tap_disk* driver;
+       void* driver_data;
+
+  /* XXX: this is needed so that the server can perform operations on
+   * the driver from the stream_fd event handler. fix this. */
+       td_driver_t *tdremus_driver;
+
+       /* TODO: we may wish to replace these two FIFOs with a unix socket */
+       char*     ctl_path; /* receive flush instruction here */
+       poll_fd_t ctl_fd;     /* io_fd slot for control FIFO */
+       char*     msg_path; /* output completion message here */
+       poll_fd_t msg_fd;     /* io_fd slot for the message FIFO */
+
+  /* replication host */
+       struct sockaddr_in sa;
+       poll_fd_t server_fd;    /* server listen port */
+       poll_fd_t stream_fd;     /* replication channel */
+
+       /* queue write requests, batch-replicate at submit */
+       struct req_ring write_ring;
+
+       /* ramdisk data*/
+       struct ramdisk ramdisk;
+
+       /* mode methods */
+       enum tdremus_mode mode;
+       int (*queue_flush)(td_driver_t *driver);
+};
+
+/*
+ * Request header record: operation tag, request id, start sector and sector
+ * count.
+ * NOTE(review): presumably the header exchanged on the replication stream;
+ * its layout does not match the message description in the file header
+ * field for field -- confirm against the send/receive code.
+ */
+typedef struct tdremus_wire {
+       uint32_t op;
+       uint64_t id;
+       uint64_t sec;
+       uint32_t secs;
+} tdremus_wire_t;
+
+/* Four-character message tags. "wreq", "sreq", "creq" and "done" are the
+ * replication stream messages described in the file header. */
+#define TDREMUS_READ "rreq"
+#define TDREMUS_WRITE "wreq"
+#define TDREMUS_SUBMIT "sreq"
+#define TDREMUS_COMMIT "creq"
+#define TDREMUS_DONE "done"
+#define TDREMUS_FAIL "fail"
+
+/* Forward declarations for the per-mode request handlers and helpers. */
+
+/* primary read/write functions */
+static void primary_queue_read(td_driver_t *driver, td_request_t treq);
+static void primary_queue_write(td_driver_t *driver, td_request_t treq);
+
+/* backup read/write functions */
+static void backup_queue_read(td_driver_t *driver, td_request_t treq);
+static void backup_queue_write(td_driver_t *driver, td_request_t treq);
+
+/* unprotected read/write functions */
+static void unprotected_queue_read(td_driver_t *driver, td_request_t treq);
+static void unprotected_queue_write(td_driver_t *driver, td_request_t treq);
+
+static int tdremus_close(td_driver_t *driver);
+
+/* mode switching and control-channel reply helpers */
+static int switch_mode(td_driver_t *driver, enum tdremus_mode mode);
+static int ctl_respond(struct tdremus_state *s, const char *response);
+
+/* ring functions */
+
+/* Index following pos in the ring, wrapping to 0 past the last slot.
+ * The ring argument is not consulted. */
+static inline unsigned int ring_next(struct req_ring* ring, unsigned int pos)
+{
+       unsigned int next = pos + 1;
+
+       return (next >= MAX_REQUESTS * 2 + 1) ? 0 : next;
+}
+
+/* Non-zero when the ring holds no requests (head has caught up with tail). */
+static inline int ring_isempty(struct req_ring* ring)
+{
+       int empty = (ring->head == ring->tail);
+
+       return empty;
+}
+
+/* Non-zero when advancing tail would run into head, i.e. no free slot. */
+static inline int ring_isfull(struct req_ring* ring)
+{
+       unsigned int after_tail = ring_next(ring, ring->tail);
+
+       return after_tail == ring->head;
+}
+/* Prototype declarations */
+/* ramdisk_flush() is defined further down, after the helpers it uses */
+static int ramdisk_flush(td_driver_t *driver, struct tdremus_state* s);
+
+/* functions to create and submit treq's */
+
+/*
+ * Completion callback for writes issued by create_write_request().
+ * A failed write terminates the process. Otherwise the vbd request is
+ * unlinked and freed, the in-flight count drops by one, the written sectors
+ * are removed from the inprogress table and the data buffer is freed.
+ */
+static void
+replicated_write_callback(td_request_t treq, int err)
+{
+       struct tdremus_state *state = (struct tdremus_state *) treq.cb_data;
+       td_vbd_request_t *req = (td_vbd_request_t *) treq.private;
+       uint64_t sec;
+       int done;
+
+       /* the write failed for now, lets panic. this is very bad */
+       if (err) {
+               RPRINTF("ramdisk write failed, disk image is not consistent\n");
+               exit(-1);
+       }
+
+       /* The write succeeded: take the request off whatever list it is
+        * on and release it */
+       list_del(&req->next);
+       free(req);
+
+       state->ramdisk.inflight--;
+
+       sec = treq.sec;
+       for (done = 0; done < treq.secs; done++, sec++)
+               hashtable_remove(state->ramdisk.inprogress, &sec);
+
+       free(treq.buf);
+
+       /* TODO: when inflight is zero and prev is NULL the ramdisk has been
+        * flushed; nothing acts on that yet */
+}
+
+/*
+ * Build a write request for 'secs' sectors starting at 'sec' with data
+ * 'buf', queue its vbd request on device_vbd->pending_requests and forward
+ * it down the driver chain. Completion is reported to
+ * replicated_write_callback(), which frees the vbd request and 'buf'.
+ *
+ * Returns 0 once the request has been forwarded, -1 if the vbd request
+ * cannot be allocated (nothing is forwarded and 'buf' stays with the
+ * caller).
+ */
+static inline int
+create_write_request(struct tdremus_state *state, td_sector_t sec, int secs, char *buf)
+{
+       td_request_t treq;
+       td_vbd_request_t *vreq;
+
+       vreq         = calloc(1, sizeof(td_vbd_request_t));
+       if(!vreq)
+               return -1;
+
+       /* the request is passed by value to lower layers: make sure fields
+        * not set below are zero rather than stack garbage */
+       memset(&treq, 0, sizeof(treq));
+       treq.op      = TD_OP_WRITE;
+       treq.buf     = buf;
+       treq.sec     = sec;
+       treq.secs    = secs;
+       treq.image   = remus_image;
+       treq.cb      = replicated_write_callback;
+       treq.cb_data = state;
+       treq.id      = 0;
+       treq.sidx    = 0;
+       treq.private = vreq;
+
+       vreq->submitting = 1;
+       INIT_LIST_HEAD(&vreq->next);
+       tapdisk_vbd_move_request(treq.private, &device_vbd->pending_requests);
+
+       /* TODO:
+        * we should probably leave it up to the caller to forward the request */
+       td_forward_request(treq);
+
+       /* NOTE(review): if the request can complete synchronously inside
+        * td_forward_request(), the callback has already freed vreq at this
+        * point -- confirm that completion is always deferred */
+       vreq->submitting--;
+
+       return 0;
+}
+
+
+/*
+ * Hash callback for tables keyed by uint64_t sector numbers: k points at the
+ * key; the mixed value is truncated to unsigned int.
+ * Mixing steps from http://www.concentric.net/~Ttwang/tech/inthash.htm
+ */
+static unsigned int uint64_hash(void* k)
+{
+       uint64_t x = *(uint64_t*)k;
+
+       x  = ~x + (x << 18);
+       x ^= x >> 31;
+       x *= 21;
+       x ^= x >> 11;
+       x += x << 6;
+       x ^= x >> 22;
+
+       return (unsigned int)x;
+}
+
+/* Key-equality callback: k1 and k2 point at uint64_t keys; returns non-zero
+ * when the two keys are equal. */
+static int rd_hash_equal(void* k1, void* k2)
+{
+       const uint64_t *lhs = k1;
+       const uint64_t *rhs = k2;
+
+       return *lhs == *rhs;
+}
+
+/*
+ * Copy nb_sectors sectors starting at 'sector' into buf from the ramdisk.
+ * Each sector is looked up in the pending flush table (prev) first and in
+ * the inprogress table second.
+ *
+ * Returns 0 when every sector was found, -1 as soon as one is missing (buf
+ * may then be partially filled).
+ */
+static int ramdisk_read(struct ramdisk* ramdisk, uint64_t sector,
+                       int nb_sectors, char* buf)
+{
+       char* data;
+       uint64_t key;
+       int n;
+
+       for (n = 0; n < nb_sectors; n++) {
+               key  = sector + n;
+               data = NULL;
+
+               /* check whether it is queued in a previous flush request */
+               if (ramdisk->prev)
+                       data = hashtable_search(ramdisk->prev, &key);
+               /* check whether it is an ongoing flush */
+               if (!data && ramdisk->inprogress)
+                       data = hashtable_search(ramdisk->inprogress, &key);
+               if (!data)
+                       return -1;
+
+               memcpy(buf + n * ramdisk->sector_size, data, ramdisk->sector_size);
+       }
+
+       return 0;
+}
+
+/*
+ * Store len bytes from buf as the contents of 'sector' in hash table h.
+ * An existing entry is overwritten in place; otherwise a copy of the data
+ * and a heap-allocated key are inserted.
+ *
+ * Returns 0 on success, -1 on allocation or insertion failure.
+ */
+static int ramdisk_write_hash(struct hashtable* h, uint64_t sector, char* buf,
+                             size_t len)
+{
+       char* data;
+       uint64_t* key;
+
+       data = hashtable_search(h, &sector);
+       if (data) {
+               memcpy(data, buf, len);
+               return 0;
+       }
+
+       data = malloc(len);
+       if (!data) {
+               DPRINTF("ramdisk_write_hash: malloc failed\n");
+               return -1;
+       }
+       memcpy(data, buf, len);
+
+       key = malloc(sizeof(*key));
+       if (!key) {
+               DPRINTF("ramdisk_write_hash: error allocating key\n");
+               goto drop_data;
+       }
+       *key = sector;
+
+       if (hashtable_insert(h, key, data))
+               return 0;
+
+       DPRINTF("ramdisk_write_hash failed on sector %" PRIu64 "\n", sector);
+       free(key);
+drop_data:
+       free(data);
+       return -1;
+}
+
+/*
+ * Store nb_sectors consecutive sectors from buf into the current write
+ * table (ramdisk->h), one sector at a time.
+ *
+ * Returns 0 on success or the first non-zero ramdisk_write_hash() result;
+ * sectors stored before the failure stay in the table.
+ */
+static inline int ramdisk_write(struct ramdisk* ramdisk, uint64_t sector,
+                               int nb_sectors, char* buf)
+{
+       char* src = buf;
+       int n, rc = 0;
+
+       for (n = 0; n < nb_sectors && !rc; n++) {
+               rc = ramdisk_write_hash(ramdisk->h, sector + n, src,
+                                       ramdisk->sector_size);
+               src += ramdisk->sector_size;
+       }
+
+       return rc;
+}
+
+/* qsort() comparator for uint64_t values in ascending order. The result is
+ * built from comparisons because the unsigned difference cannot carry a
+ * sign. */
+static int uint64_compare(const void* k1, const void* k2)
+{
+       const uint64_t lhs = *(const uint64_t*)k1;
+       const uint64_t rhs = *(const uint64_t*)k2;
+
+       return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Collect the keys (sector numbers) stored in hash table h.
+ *
+ * On success *psectors points at a malloc()ed array the caller must free
+ * and the number of entries is returned. Returns 0 without touching
+ * *psectors when the table is empty, and -1 with *psectors set to NULL when
+ * an allocation fails.
+ */
+static int ramdisk_get_sectors(struct hashtable* h, uint64_t** psectors)
+{
+       struct hashtable_itr* itr;
+       uint64_t* sectors;
+       int count;
+
+       if (!(count = hashtable_count(h)))
+               return 0;
+
+       if (!(sectors = malloc(count * sizeof(uint64_t)))) {
+               DPRINTF("ramdisk_get_sectors: error allocating sector map\n");
+               *psectors = NULL;
+               return -1;
+       }
+
+       /* presumably hashtable_iterator() allocates the iterator and returns
+        * NULL on failure -- it must not be dereferenced unchecked */
+       if (!(itr = hashtable_iterator(h))) {
+               DPRINTF("ramdisk_get_sectors: error allocating iterator\n");
+               free(sectors);
+               *psectors = NULL;
+               return -1;
+       }
+
+       /* the table is known to be non-empty here, so the iterator starts on
+        * a valid entry */
+       count = 0;
+       do {
+               sectors[count++] = *(uint64_t*)hashtable_iterator_key(itr);
+       } while (hashtable_iterator_advance(itr));
+       free(itr);
+
+       *psectors = sectors;
+       return count;
+}
+
+/*
+ * Gather 'count' consecutive sectors starting at 'start' from ramdisk->prev
+ * into one freshly allocated buffer and mark each of them in
+ * ramdisk->inprogress. On success the buffer is handed back through
+ * *mergedbuf. On failure the buffer is freed and the inprogress marks added
+ * by this call are removed again.
+ *
+ * return -1 for OOM
+ * return -2 for merge lookup failure
+ * return -3 for WAW race
+ * return 0 on success.
+ *
+ * NOTE(review): entries are inserted into inprogress with a NULL value while
+ * the race check below tests the value returned by hashtable_search();
+ * confirm that the lookup can actually tell "present" from "absent".
+ */
+static int merge_requests(struct ramdisk* ramdisk, uint64_t start,
+                       size_t count, char **mergedbuf)
+{
+       char* merged;
+       char* data;
+       uint64_t *key;
+       int i;
+       int rc;
+
+       merged = valloc(count * ramdisk->sector_size);
+       if (!merged) {
+               DPRINTF("merge_request: allocation failed\n");
+               return -1;
+       }
+
+       for (i = 0; i < count; i++, start++) {
+               data = hashtable_search(ramdisk->prev, &start);
+               if (!data) {
+                       DPRINTF("merge_request: lookup failed on %"PRIu64"\n", start);
+                       rc = -2;
+                       goto undo;
+               }
+
+               /* Check inprogress requests to avoid waw non-determinism */
+               if (hashtable_search(ramdisk->inprogress, &start)) {
+                       DPRINTF("merge_request: WAR RACE on %"PRIu64"\n", start);
+                       rc = -3;
+                       goto undo;
+               }
+
+               /* Insert req into inprogress (brief period of duplication of
+                * hash entries until they are removed from prev. Read tracking
+                * would not be reading wrong entries)
+                */
+               key = malloc(sizeof(*key));
+               if (!key) {
+                       DPRINTF("%s: error allocating key\n", __FUNCTION__);
+                       rc = -1;
+                       goto undo;
+               }
+               *key = start;
+               if (!hashtable_insert(ramdisk->inprogress, key, NULL)) {
+                       DPRINTF("%s failed to insert sector %" PRIu64 " into inprogress hash\n", 
+                               __FUNCTION__, start);
+                       free(key);
+                       rc = -1;
+                       goto undo;
+               }
+
+               memcpy(merged + i * ramdisk->sector_size, data, ramdisk->sector_size);
+       }
+
+       *mergedbuf = merged;
+       return 0;
+
+undo:
+       free(merged);
+       /* walk back over the i sectors already marked by this call */
+       while (i-- > 0) {
+               start--;
+               hashtable_remove(ramdisk->inprogress, &start);
+       }
+       return rc;
+}
+
+/* The underlying driver may not handle having the whole ramdisk queued at
+ * once. We queue what we can and let the callbacks attempt to queue more. */
+/* NOTE: may be called from callback, while dd->private still belongs to
+ * the underlying driver */
+/*
+ * Issue write requests for the sectors held in s->ramdisk.prev. Sector
+ * numbers are sorted and runs of consecutive sectors are merged into one
+ * request each. Runs that overlap an in-flight write (merge_requests()
+ * returning -3) are left in prev for a later call. When prev becomes empty
+ * it is destroyed and set to NULL. The driver argument is not used.
+ *
+ * Returns 0 on success (or when there is nothing to flush), -1 on failure.
+ */
+static int ramdisk_flush(td_driver_t *driver, struct tdremus_state* s)
+{
+       uint64_t* sectors;
+       char* buf = NULL;
+       uint64_t base, batchlen, sec;
+       int i, j, count = 0;
+
+       // RPRINTF("ramdisk flush\n");
+
+       if ((count = ramdisk_get_sectors(s->ramdisk.prev, &sectors)) <= 0)
+               return count;
+
+       /* Create the inprogress table if empty */
+       if (!s->ramdisk.inprogress) {
+               s->ramdisk.inprogress = create_hashtable(RAMDISK_HASHSIZE,
+                                                       uint64_hash,
+                                                       rd_hash_equal);
+               /* merge_requests() searches this table unconditionally */
+               if (!s->ramdisk.inprogress) {
+                       RPRINTF("ramdisk_flush: failed to create inprogress table\n");
+                       free(sectors);
+                       return -1;
+               }
+       }
+
+       /*
+         RPRINTF("ramdisk: flushing %d sectors\n", count);
+       */
+
+       /* sort and merge sectors to improve disk performance */
+       qsort(sectors, count, sizeof(*sectors), uint64_compare);
+
+       for (i = 0; i < count;) {
+               /* [base, base + batchlen) is one run of consecutive sectors */
+               base = sectors[i++];
+               while (i < count && sectors[i] == sectors[i-1] + 1)
+                       i++;
+               batchlen = sectors[i-1] - base + 1;
+
+               j = merge_requests(&s->ramdisk, base, batchlen, &buf);
+
+               if (j) {
+                       RPRINTF("ramdisk_flush: merge_requests failed:%s\n",
+                               j == -1? "OOM": (j==-2? "missing sector" : "WAW race"));
+                       if (j == -3) continue;
+                       free(sectors);
+                       return -1;
+               }
+
+               /* count the request before forwarding it so that a completion
+                * running straight away never sees the counter at zero */
+               s->ramdisk.inflight++;
+
+               /* NOTE: create_write_request() creates a treq AND forwards it down
+                * the driver chain */
+               if (create_write_request(s, base, batchlen, buf)) {
+                       /* nothing was forwarded: undo the inprogress marks made
+                        * by merge_requests(), release the merged buffer and
+                        * leave the sectors in prev */
+                       RPRINTF("ramdisk_flush: failed to create write request\n");
+                       s->ramdisk.inflight--;
+                       for (sec = base; sec < base + batchlen; sec++)
+                               hashtable_remove(s->ramdisk.inprogress, &sec);
+                       free(buf);
+                       free(sectors);
+                       return -1;
+               }
+
+               /* the merged copy is now owned by the request: drop the
+                * per-sector originals from prev */
+               for (j = 0; j < batchlen; j++) {
+                       buf = hashtable_search(s->ramdisk.prev, &base);
+                       free(buf);
+                       hashtable_remove(s->ramdisk.prev, &base);
+                       base++;
+               }
+       }
+
+       if (!hashtable_count(s->ramdisk.prev)) {
+               /* everything is in flight */
+               hashtable_destroy(s->ramdisk.prev, 0);
+               s->ramdisk.prev = NULL;
+       }
+
+       free(sectors);
+
+       // RPRINTF("ramdisk flush done\n");
+       return 0;
+}
+
+/*
+ * flush ramdisk contents to disk
+ *
+ * Moves the current write buffer (ramdisk.h) into ramdisk.prev, installs an
+ * empty buffer for new writes and starts flushing prev via ramdisk_flush().
+ *
+ * Returns 0 when there is nothing to flush, a negative value when the
+ * replacement table cannot be allocated or the sector list cannot be
+ * obtained, and otherwise the result of ramdisk_flush().
+ */
+static int ramdisk_start_flush(td_driver_t *driver)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+       struct hashtable *fresh;
+       uint64_t* sectors;
+       char* buf;
+       int i, count;
+
+       if (!hashtable_count(s->ramdisk.h))
+               return 0;
+
+       /* Allocate the replacement table before touching any state, so that
+        * an allocation failure leaves ramdisk.h and ramdisk.prev intact. */
+       fresh = create_hashtable(RAMDISK_HASHSIZE, uint64_hash,
+                                rd_hash_equal);
+       if (!fresh) {
+               RPRINTF("ramdisk_start_flush: could not create hashtable\n");
+               return -1;
+       }
+
+       if (s->ramdisk.prev) {
+               /* a flush request issued while a previous flush is still in progress
+                * will merge with the previous request. If you want the previous
+                * request to be consistent, wait for it to complete. */
+               if ((count = ramdisk_get_sectors(s->ramdisk.h, &sectors)) < 0) {
+                       hashtable_destroy(fresh, 0);
+                       return count;
+               }
+
+               for (i = 0; i < count; i++) {
+                       buf = hashtable_search(s->ramdisk.h, sectors + i);
+                       ramdisk_write_hash(s->ramdisk.prev, sectors[i], buf,
+                                          s->ramdisk.sector_size);
+               }
+               free(sectors);
+
+               /* NOTE(review): the second argument 0 presumably leaves the
+                * stored values unfreed; if ramdisk_write_hash() copies buf
+                * rather than adopting it, those buffers leak - confirm. */
+               hashtable_destroy (s->ramdisk.h, 0);
+       } else
+               s->ramdisk.prev = s->ramdisk.h;
+
+       /* We install a new hashtable so that new writes can be performed before
+        * the old hashtable is completely drained. */
+       s->ramdisk.h = fresh;
+
+       return ramdisk_flush(driver, s);
+}
+
+
+/*
+ * Set up the in-memory write buffer: record the sector size reported by the
+ * driver and create the hashtable that holds buffered sectors.
+ *
+ * Returns 0 on success or when the buffer already exists, -1 when the
+ * hashtable cannot be created.
+ */
+static int ramdisk_start(td_driver_t *driver)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+       if (s->ramdisk.h) {
+               RPRINTF("ramdisk already allocated\n");
+               return 0;
+       }
+
+       s->ramdisk.sector_size = driver->info.sector_size;
+       s->ramdisk.h = create_hashtable(RAMDISK_HASHSIZE, uint64_hash,
+                                       rd_hash_equal);
+       if (!s->ramdisk.h) {
+               /* later code calls hashtable_count() on this table, so a
+                * failed allocation must not be reported as success */
+               RPRINTF("ramdisk_start: could not create hashtable\n");
+               return -1;
+       }
+
+       DPRINTF("Ramdisk started, %zu bytes/sector\n", s->ramdisk.sector_size);
+
+       return 0;
+}
+
+/* common client/server functions */
+/* mayberead: read exactly len bytes from fd into buf, giving up when
+ * select() times out.
+ *
+ * The timeout is initialised once from HEARTBEAT_MS and the same struct is
+ * handed to every select() call. A read or select interrupted by a signal
+ * (EINTR) is retried instead of being reported as a failure.
+ *
+ * Returns 0 once len bytes have been read (or len is 0), -1 on end-of-file,
+ * timeout or any other error. */
+static int mread(int fd, void* buf, size_t len)
+{
+       char *dst = buf;        /* byte pointer: no arithmetic on void* */
+       fd_set rfds;
+       ssize_t rc;
+       int ready;
+       size_t cur = 0;
+       struct timeval tv = {
+               .tv_sec = HEARTBEAT_MS / 1000,
+               .tv_usec = (HEARTBEAT_MS % 1000) * 1000
+       };
+
+       if (!len)
+               return 0;
+
+       /* read first. Only select if read is incomplete. */
+       rc = read(fd, dst, len);
+       for (;;) {
+               if (rc > 0) {
+                       cur += rc;
+                       if (cur >= len)
+                               break;
+               } else if (!rc) {
+                       RPRINTF("end-of-file");
+                       return -1;
+               } else if (errno != EAGAIN && errno != EINTR) {
+                       RPRINTF("error during read: %s\n", strerror(errno));
+                       return -1;
+               }
+
+               /* wait for more data; the fd set is rebuilt on every
+                * attempt because select() overwrites it */
+               do {
+                       FD_ZERO(&rfds);
+                       FD_SET(fd, &rfds);
+                       ready = select(fd + 1, &rfds, NULL, NULL, &tv);
+               } while (ready < 0 && errno == EINTR);
+
+               if (!ready) {
+                       RPRINTF("time out during read\n");
+                       return -1;
+               } else if (ready < 0) {
+                       RPRINTF("error during select: %d\n", errno);
+                       return -1;
+               }
+               rc = read(fd, dst + cur, len - cur);
+       }
+
+       return 0;
+}
+
+/* maybewrite: write exactly len bytes from buf to fd, giving up when
+ * select() times out.
+ *
+ * The timeout is initialised once from HEARTBEAT_MS and the same struct is
+ * handed to every select() call. A write or select interrupted by a signal
+ * (EINTR) is retried instead of being reported as a failure.
+ *
+ * Returns 0 once len bytes have been written (or len is 0), -1 on a
+ * zero-byte write, timeout or any other error. */
+static int mwrite(int fd, void* buf, size_t len)
+{
+       const char *src = buf;  /* byte pointer: no arithmetic on void* */
+       fd_set wfds;
+       size_t cur = 0;
+       ssize_t rc;
+       int ready;
+       struct timeval tv = {
+               .tv_sec = HEARTBEAT_MS / 1000,
+               .tv_usec = (HEARTBEAT_MS % 1000) * 1000
+       };
+
+       if (!len)
+               return 0;
+
+       /* write first. Only select if write is incomplete. */
+       rc = write(fd, src, len);
+       for (;;) {
+               if (rc > 0) {
+                       cur += rc;
+                       if (cur >= len)
+                               break;
+               } else if (!rc) {
+                       RPRINTF("end-of-file");
+                       return -1;
+               } else if (errno != EAGAIN && errno != EINTR) {
+                       RPRINTF("error during write: %s\n", strerror(errno));
+                       return -1;
+               }
+
+               /* wait until the fd accepts more data; the fd set is
+                * rebuilt on every attempt because select() overwrites it */
+               do {
+                       FD_ZERO(&wfds);
+                       FD_SET(fd, &wfds);
+                       ready = select(fd + 1, NULL, &wfds, NULL, &tv);
+               } while (ready < 0 && errno == EINTR);
+
+               if (!ready) {
+                       RPRINTF("time out during write\n");
+                       return -1;
+               } else if (ready < 0) {
+                       RPRINTF("error during select: %d\n", errno);
+                       return -1;
+               }
+               rc = write(fd, src + cur, len - cur);
+       }
+
+       return 0;
+}
+
+
+/* Tear down the replication stream: unregister its scheduler event, close
+ * the descriptor and mark the slot with -2. */
+static inline void close_stream_fd(struct tdremus_state *s)
+{
+       tapdisk_server_unregister_event(s->stream_fd.id);
+       close(s->stream_fd.fd);
+
+       /* XXX: -2 is magic. replace with macro perhaps? */
+       s->stream_fd.fd = -2;
+}
+
+/* primary functions */
+/* Scheduler callbacks for the primary side. They are declared up front
+ * because code below registers them before their definitions appear. */
+static void remus_client_event(event_id_t, char mode, void *private);
+static void remus_connect_event(event_id_t id, char mode, void *private);
+static void remus_retry_connect_event(event_id_t id, char mode, void *private);
+
+/*
+ * Start a non-blocking connection attempt to the address in state->sa.
+ *
+ * Creates the socket, makes it non-blocking and registers a timeout event
+ * whose handler, remus_retry_connect_event(), issues the actual connect().
+ * On success the socket and event id are stored in state->stream_fd.
+ *
+ * Returns 0 on success, -1 on failure; the socket is closed on every
+ * failure path.
+ */
+static int primary_do_connect(struct tdremus_state *state)
+{
+       event_id_t id;
+       int fd;
+       int flags;
+
+       RPRINTF("client connecting to %s:%d...\n", inet_ntoa(state->sa.sin_addr), ntohs(state->sa.sin_port));
+
+       if ((fd = socket(PF_INET, SOCK_STREAM, 0)) < 0) {
+               RPRINTF("could not create client socket: %d\n", errno);
+               return -1;
+       }
+
+       /* make socket nonblocking */
+       if ((flags = fcntl(fd, F_GETFL, 0)) == -1)
+               flags = 0;
+       if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) {
+               close(fd);
+               return -1;
+       }
+
+       /* once we have created the socket and populated the address, we can now start
+        * our non-blocking connect. rather than duplicating code we trigger a timeout
+        * on the socket fd, which calls our nonblocking connect code
+        */
+       if((id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, fd, 0, remus_retry_connect_event, state)) < 0) {
+               RPRINTF("error registering timeout client connection event handler: %s\n", strerror(id));
+               /* nothing was registered for fd, so it is ours to close */
+               close(fd);
+               return -1;
+       }
+       state->stream_fd.fd = fd;
+       state->stream_fd.id = id;
+       return 0;
+}
+
+/*
+ * Connect to the address in state->sa, blocking until the peer accepts.
+ *
+ * A refused connection is retried once per second without limit; any other
+ * connect() error aborts. After connecting, the socket is made non-blocking
+ * and remus_client_event() is registered as its read handler. On success
+ * the socket and event id are stored in state->stream_fd.
+ *
+ * Returns 0 on success, -1 on failure (the socket is closed).
+ */
+static int primary_blocking_connect(struct tdremus_state *state)
+{
+       int fd;
+       int id;
+       int rc;
+       int flags;
+
+       RPRINTF("client connecting to %s:%d...\n", inet_ntoa(state->sa.sin_addr), ntohs(state->sa.sin_port));
+
+       fd = socket(PF_INET, SOCK_STREAM, 0);
+       if (fd < 0) {
+               RPRINTF("could not create client socket: %d\n", errno);
+               return -1;
+       }
+
+       for (;;) {
+               rc = connect(fd, (struct sockaddr *)&state->sa,
+                            sizeof(state->sa));
+               if (rc >= 0)
+                       break;
+
+               if (errno != ECONNREFUSED) {
+                       RPRINTF("connection failed: %d\n", errno);
+                       goto fail;
+               }
+
+               RPRINTF("connection refused -- retrying in 1 second\n");
+               sleep(1);
+       }
+
+       RPRINTF("client connected\n");
+
+       /* make socket nonblocking */
+       flags = fcntl(fd, F_GETFL, 0);
+       if (flags == -1)
+               flags = 0;
+       if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) {
+               RPRINTF("error making socket nonblocking\n");
+               goto fail;
+       }
+
+       id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, fd, 0, remus_client_event, state);
+       if (id < 0) {
+               RPRINTF("error registering client event handler: %s\n", strerror(id));
+               goto fail;
+       }
+
+       state->stream_fd.fd = fd;
+       state->stream_fd.id = id;
+       return 0;
+
+ fail:
+       close(fd);
+       return -1;
+}
+
+/*
+ * Read handler installed by primary_start(): the request is handed on
+ * unchanged with td_forward_request(). The driver argument is not used.
+ */
+static void primary_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       td_forward_request(treq);
+}
+
+/* TODO:
+ * The primary uses mwrite() to write the contents of a write request to the
+ * backup. This effectively blocks until all data has been copied into a system
+ * buffer or a timeout has occurred. We may wish to instead use tapdisk's
+ * nonblocking i/o interface, tapdisk_server_register_event(), to set timeouts
+ * and write data in an asynchronous fashion.
+ */
+/*
+ * Write handler installed by primary_start().
+ *
+ * Sends TDREMUS_WRITE, a header (32-bit sector count followed by the 64-bit
+ * starting sector, both in host byte order) and the request payload over the
+ * replication stream, then forwards the request. If the stream cannot be
+ * established or written, the driver switches to unprotected mode and the
+ * request is completed with -EBUSY.
+ */
+static void primary_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+       char header[sizeof(uint32_t) + sizeof(uint64_t)];
+       uint32_t nsecs = treq.secs;
+       uint64_t first = treq.sec;
+
+       /* -1 means we haven't connected yet, -2 means the connection was lost */
+       if(s->stream_fd.fd == -1) {
+               RPRINTF("connecting to backup...\n");
+               if (primary_blocking_connect(s) < 0)
+                       goto fail;
+       }
+
+       /* Build the header with memcpy: the 64-bit field sits at offset 4
+        * of a char array, so storing through a uint64_t pointer would be
+        * a misaligned access. */
+       memcpy(header, &nsecs, sizeof(nsecs));
+       memcpy(header + sizeof(nsecs), &first, sizeof(first));
+
+       if (mwrite(s->stream_fd.fd, TDREMUS_WRITE, strlen(TDREMUS_WRITE)) < 0)
+               goto fail;
+       if (mwrite(s->stream_fd.fd, header, sizeof(header)) < 0)
+               goto fail;
+
+       if (mwrite(s->stream_fd.fd, treq.buf, treq.secs * driver->info.sector_size) < 0)
+               goto fail;
+
+       td_forward_request(treq);
+
+       return;
+
+ fail:
+       /* switch to unprotected mode and tell tapdisk to retry */
+       RPRINTF("write request replication failed, switching to unprotected mode");
+       switch_mode(s->tdremus_driver, mode_unprotected);
+       td_complete_request(treq, -EBUSY);
+}
+
+
+/*
+ * Flush hook installed by primary_start(): sends TDREMUS_COMMIT over the
+ * replication stream.
+ *
+ * Returns 0 when no connection has been made yet (fd == -1) or the message
+ * was written; on a write failure the stream is closed and -1 is returned.
+ */
+static int client_flush(td_driver_t *driver)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+       int rc = 0;
+
+       /* connection not yet established, nothing to flush */
+       if (s->stream_fd.fd != -1 &&
+           mwrite(s->stream_fd.fd, TDREMUS_COMMIT, strlen(TDREMUS_COMMIT)) < 0) {
+               RPRINTF("error flushing output");
+               close_stream_fd(s);
+               rc = -1;
+       }
+
+       return rc;
+}
+
+/*
+ * Flush hook installed by backup_start(): retries flushing whatever is
+ * still held in ramdisk.prev. Returns 0 when prev is unset, otherwise the
+ * result of ramdisk_flush().
+ */
+static int server_flush(td_driver_t *driver)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+       /* Nothing to flush in beginning; otherwise try to flush any
+        * remaining requests. */
+       return s->ramdisk.prev ? ramdisk_flush(driver, s) : 0;
+}
+
+/*
+ * Enter primary (client) mode: mark the replication stream as not yet
+ * connected and install the primary read/write/flush handlers.
+ * Always returns 0.
+ */
+static int primary_start(td_driver_t *driver)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+       RPRINTF("activating client mode\n");
+
+       /* -1: no connection has been attempted yet */
+       s->stream_fd.fd = -1;
+       s->stream_fd.id = -1;
+
+       s->queue_flush = client_flush;
+       tapdisk_remus.td_queue_read = primary_queue_read;
+       tapdisk_remus.td_queue_write = primary_queue_write;
+
+       return 0;
+}
+
+/* timeout callback */
+/*
+ * Issues a non-blocking connect() on the stream socket. If the connect
+ * succeeded or is in progress, the socket is handed to
+ * remus_connect_event() once it becomes writable. For a set of transient
+ * errors the attempt is rescheduled after REMUS_CONNRETRY_TIMEOUT; any
+ * other error is logged and nothing is rescheduled.
+ */
+static void remus_retry_connect_event(event_id_t id, char mode, void *private)
+{
+       struct tdremus_state *s = (struct tdremus_state *)private;
+       int rc, err;
+
+       /* do a non-blocking connect */
+       rc = connect(s->stream_fd.fd, (struct sockaddr *)&s->sa, sizeof(s->sa));
+       err = errno;
+
+       if (!rc || err == EINPROGRESS) {
+               /* the connect returned EINPROGRESS (nonblocking connect) we must
+                * wait for the fd to be writeable to determine if the connect worked */
+               tapdisk_server_unregister_event(s->stream_fd.id);
+               id = tapdisk_server_register_event(SCHEDULER_POLL_WRITE_FD, s->stream_fd.fd, 0, remus_connect_event, s);
+               if (id < 0) {
+                       RPRINTF("error registering client connection event handler: %s\n", strerror(id));
+                       return;
+               }
+               s->stream_fd.id = id;
+               return;
+       }
+
+       switch (err) {
+       case ECONNREFUSED:
+       case ENETUNREACH:
+       case EAGAIN:
+       case ECONNABORTED:
+               break;
+       default:
+               /* not recoverable */
+               RPRINTF("error connection to server %s\n", strerror(err));
+               return;
+       }
+
+       /* try again in a second */
+       tapdisk_server_unregister_event(s->stream_fd.id);
+       id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, s->stream_fd.fd, REMUS_CONNRETRY_TIMEOUT, remus_retry_connect_event, s);
+       if (id < 0) {
+               RPRINTF("error registering timeout client connection event handler: %s\n", strerror(id));
+               return;
+       }
+       s->stream_fd.id = id;
+}
+
+/* callback when nonblocking connect() is finished */
+/* called only by primary in unprotected state */
+/*
+ * Reads SO_ERROR from the stream socket to learn the outcome of the
+ * connect. On success the socket is re-registered with
+ * remus_client_event() and the driver switches to primary mode. On a
+ * transient error a retry is scheduled after REMUS_CONNRETRY_TIMEOUT; any
+ * other error is only logged.
+ */
+static void remus_connect_event(event_id_t id, char mode, void *private)
+{
+       struct tdremus_state *s = (struct tdremus_state *)private;
+       int socket_errno;
+       socklen_t socket_errno_size = sizeof(socket_errno);
+
+       /* check to see if the connect succeeded */
+       if (getsockopt(s->stream_fd.fd, SOL_SOCKET, SO_ERROR, &socket_errno, &socket_errno_size)) {
+               RPRINTF("error getting socket errno\n");
+               return;
+       }
+
+       RPRINTF("socket connect returned %d\n", socket_errno);
+
+       if (!socket_errno) {
+               /* the connect succeeded: unregister this function and
+                * register a new event handler */
+               tapdisk_server_unregister_event(s->stream_fd.id);
+               id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, s->stream_fd.fd, 0, remus_client_event, s);
+               if (id < 0) {
+                       RPRINTF("error registering client event handler: %s\n", strerror(id));
+                       return;
+               }
+               s->stream_fd.id = id;
+
+               /* switch from unprotected to protected client */
+               switch_mode(s->tdremus_driver, mode_primary);
+               return;
+       }
+
+       /* the connect did not succeed */
+       switch (socket_errno) {
+       case ECONNREFUSED:
+       case ENETUNREACH:
+       case ETIMEDOUT:
+       case ECONNABORTED:
+       case EAGAIN:
+               break;
+       default:
+               RPRINTF("socket connect returned %d, giving up\n", socket_errno);
+               return;
+       }
+
+       /* we can probably assume that the backup is down. just try again later */
+       tapdisk_server_unregister_event(s->stream_fd.id);
+       id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, s->stream_fd.fd, REMUS_CONNRETRY_TIMEOUT, remus_retry_connect_event, s);
+       if (id < 0) {
+               RPRINTF("error registering timeout client connection event handler: %s\n", strerror(id));
+               return;
+       }
+       s->stream_fd.id = id;
+}
+
+
+/* we install this event handler on the primary once we have connected to the backup */
+/*
+ * Reads a 4-byte message from the backup. TDREMUS_DONE is relayed to the
+ * control channel with ctl_respond(); a read failure or any other message
+ * closes the replication stream.
+ */
+static void remus_client_event(event_id_t id, char mode, void *private)
+{
+       struct tdremus_state *s = (struct tdremus_state *)private;
+       char req[5];
+
+       if (mread(s->stream_fd.fd, req, sizeof(req) - 1) < 0) {
+               /* replication stream closed or otherwise broken (timeout, reset, &c) */
+               RPRINTF("error reading from backup\n");
+               close_stream_fd(s);
+               return;
+       }
+
+       req[sizeof(req) - 1] = '\0';
+
+       if (strcmp(req, TDREMUS_DONE) != 0) {
+               RPRINTF("received unknown message: %s\n", req);
+               close_stream_fd(s);
+               return;
+       }
+
+       /* checkpoint committed, inform msg_fd */
+       ctl_respond(s, TDREMUS_DONE);
+}
+
+/* backup functions */
+/* Handler for data arriving on the replication stream; declared here
+ * because remus_server_accept() registers it before its definition. */
+static void remus_server_event(event_id_t id, char mode, void *private);
+
+/*
+ * Event callback for the listening socket: accepts one connection and
+ * registers remus_server_event() as its read handler. On success the new
+ * socket and its event id are stored in s->stream_fd; on failure nothing
+ * is stored.
+ */
+static void remus_server_accept(event_id_t id, char mode, void* private)
+{
+       struct tdremus_state* s = (struct tdremus_state *) private;
+       event_id_t cid;
+       int stream_fd;
+
+       /* XXX: add address-based black/white list */
+       stream_fd = accept(s->server_fd.fd, NULL, NULL);
+       if (stream_fd < 0) {
+               RPRINTF("error accepting connection: %d\n", errno);
+               return;
+       }
+
+       /* TODO: check to see if we are already replicating. if so just close the
+        * connection (or do something smarter) */
+       RPRINTF("server accepted connection\n");
+
+       /* add tapdisk event for replication stream */
+       cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, stream_fd, 0,
+                                           remus_server_event, s);
+       if (cid < 0) {
+               RPRINTF("error registering connection event handler: %s\n", strerror(errno));
+               close(stream_fd);
+               return;
+       }
+
+       /* store replication file descriptor */
+       s->stream_fd.id = cid;
+       s->stream_fd.fd = stream_fd;
+}
+
+/*
+ * Create the listening socket for s->sa, bind it, start listening and
+ * register remus_server_accept() as its read handler.
+ *
+ * Returns 0 on success, -2 when bind() fails with any error other than
+ * EADDRINUSE, and -1 for every other failure. On failure the socket is
+ * closed and s->server_fd.fd is reset to -1.
+ *
+ * NOTE(review): this comment used to read "returns -2 if EADDRNOTAVAIL",
+ * which is narrower than what the code does - confirm which is intended.
+ */
+static int remus_bind(struct tdremus_state* s)
+{
+       int opt;
+       int rc = -1;
+
+       if ((s->server_fd.fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+               RPRINTF("could not create server socket: %d\n", errno);
+               return rc;
+       }
+       opt = 1;
+       if (setsockopt(s->server_fd.fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0)
+               RPRINTF("Error setting REUSEADDR on %d: %d\n", s->server_fd.fd, errno);
+
+       if (bind(s->server_fd.fd, (struct sockaddr *)&s->sa, sizeof(s->sa)) < 0) {
+               /* capture errno before logging: the logging call is free
+                * to overwrite it */
+               int err = errno;
+
+               RPRINTF("could not bind server socket %d to %s:%d: %d %s\n", s->server_fd.fd,
+                       inet_ntoa(s->sa.sin_addr), ntohs(s->sa.sin_port), err, strerror(err));
+               if (err != EADDRINUSE)
+                       rc = -2;
+               goto err_sfd;
+       }
+       if (listen(s->server_fd.fd, 10)) {
+               RPRINTF("could not listen on socket: %d\n", errno);
+               goto err_sfd;
+       }
+
+       /* The socket is now bound to the address and listening so we may now
+        * register the fd with tapdisk */
+
+       if((s->server_fd.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                                           s->server_fd.fd, 0,
+                                                           remus_server_accept, s)) < 0) {
+               RPRINTF("error registering server connection event handler: %s",
+                       strerror(s->server_fd.id));
+               goto err_sfd;
+       }
+
+       return 0;
+
+ err_sfd:
+       close(s->server_fd.fd);
+       s->server_fd.fd = -1;
+
+       return rc;
+}
+
+/*
+ * Report whether buffered writes are still outstanding: returns 1 while
+ * ramdisk.inflight is non-zero or ramdisk.prev is set, 0 otherwise.
+ * Callers use this to decide whether the latest checkpoint has been
+ * applied.
+ */
+static inline int server_writes_inflight(td_driver_t *driver)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+       return (s->ramdisk.inflight || s->ramdisk.prev) ? 1 : 0;
+}
+
+/* Due to block device prefetching this code may be called on the server side
+ * during normal replication.
+ *
+ * The read is first tried against the ramdisk: when ramdisk_read() returns
+ * zero the request is completed with status 0, otherwise it is forwarded
+ * down the driver chain. The first request seen also records its image in
+ * remus_image.
+ */
+void backup_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+       if (!remus_image)
+               remus_image = treq.image;
+
+       /* check if this read is queued in any currently ongoing flush */
+       if (!ramdisk_read(&s->ramdisk, treq.sec, treq.secs, treq.buf)) {
+               /* complete the request */
+               td_complete_request(treq, 0);
+               return;
+       }
+
+       /* TODO: Add to pending read hash */
+       td_forward_request(treq);
+}
+
+/*
+ * Write handler installed by backup_start(). On a server write, we know the
+ * domain has failed over, so the driver is switched to unprotected mode.
+ * The request itself is completed with -EBUSY rather than being written.
+ */
+void backup_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       switch_mode(driver, mode_unprotected);
+
+       /* TODO: call the appropriate write function rather than return EBUSY */
+       td_complete_request(treq, -EBUSY);
+}
+
+/*
+ * Enter backup (server) mode: make sure the ramdisk exists, then install
+ * the backup read/write/flush handlers. Returns -1 when ramdisk_start()
+ * fails, 0 otherwise.
+ */
+static int backup_start(td_driver_t *driver)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+       if (ramdisk_start(driver) < 0)
+               return -1;
+
+       /* TODO set flush function */
+       s->queue_flush = server_flush;
+       tapdisk_remus.td_queue_write = backup_queue_write;
+       tapdisk_remus.td_queue_read = backup_queue_read;
+
+       return 0;
+}
+
+/*
+ * Handle a write request received on the replication stream.
+ *
+ * Reads the header (32-bit sector count followed by the 64-bit starting
+ * sector), then the payload, and stores the payload in the ramdisk.
+ *
+ * Returns 0 on success and -1 on failure. When the announced payload does
+ * not fit the local buffer the request is rejected; on read or ramdisk
+ * errors the replication stream is closed as well.
+ */
+static int server_do_wreq(td_driver_t *driver)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+       char buf[4096];
+       char header[sizeof(uint32_t) + sizeof(uint64_t)];
+       uint32_t nsecs;
+       uint64_t first;
+       uint64_t len;
+
+       if (mread(s->stream_fd.fd, header, sizeof(header)) < 0)
+               goto err;
+
+       /* Decode with memcpy: the 64-bit field sits at offset 4 of a char
+        * array, so reading it through a uint64_t pointer would be a
+        * misaligned access. */
+       memcpy(&nsecs, header, sizeof(nsecs));
+       memcpy(&first, header + sizeof(nsecs), sizeof(first));
+
+       /* The sector count comes from the peer. Compute the byte length in
+        * 64 bits so that a large count cannot wrap to a small value and
+        * slip past the size check while ramdisk_write() is still handed
+        * the full count. */
+       len = (uint64_t)nsecs * (uint64_t)driver->info.sector_size;
+
+       if (len > sizeof(buf)) {
+               /* freak out! */
+               /* NOTE(review): the payload is not drained and the stream
+                * stays open on this path - confirm this is intended. */
+               RPRINTF("write request too large: %" PRIu64 "/%u\n", len, (unsigned)sizeof(buf));
+               return -1;
+       }
+
+       if (mread(s->stream_fd.fd, buf, (size_t)len) < 0)
+               goto err;
+
+       if (ramdisk_write(&s->ramdisk, first, nsecs, buf) < 0)
+               goto err;
+
+       return 0;
+
+ err:
+       /* should start failover */
+       RPRINTF("backup write request error\n");
+       close_stream_fd(s);
+
+       return -1;
+}
+
+/*
+ * Handle a TDREMUS_SUBMIT message. Nothing needs to be done for it at
+ * present; always returns 0.
+ */
+static int server_do_sreq(td_driver_t *driver)
+{
+       (void)driver;
+
+       return 0;
+}
+
+/* at this point, the server can start applying the most recent
+ * ramdisk.
+ *
+ * Starts flushing the buffered checkpoint and acknowledges it to the peer
+ * with TDREMUS_DONE. Returns 0 when the whole acknowledgement was written,
+ * -1 otherwise. A failure to start the flush is logged but the
+ * acknowledgement is still sent. */
+static int server_do_creq(td_driver_t *driver)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+       const size_t donelen = strlen(TDREMUS_DONE);
+
+       if (ramdisk_start_flush(driver) < 0)
+               RPRINTF("server_do_creq: error starting ramdisk flush\n");
+
+       /* XXX this message should not be sent until flush completes! */
+       if (write(s->stream_fd.fd, TDREMUS_DONE, donelen) != (ssize_t)donelen)
+               return -1;
+
+       return 0;
+}
+
+
+/*
+ * Event callback for the replication stream on the backup: reads a 4-byte
+ * request tag from s->stream_fd and dispatches to the matching handler.
+ * A read failure switches the driver to unprotected mode.
+ */
+static void remus_server_event(event_id_t id, char mode, void *private)
+{
+       struct tdremus_state *s = (struct tdremus_state *)private;
+       td_driver_t *driver = s->tdremus_driver;
+       char req[5];
+
+       /* TODO: add a get_connection_by_event_id() function.
+        * for now we can assume that the fd is s->stream_fd */
+
+       if (mread(s->stream_fd.fd, req, sizeof(req) - 1) < 0) {
+               RPRINTF("error reading server event, activating backup\n");
+               switch_mode(driver, mode_unprotected);
+               return;
+       }
+
+       req[sizeof(req) - 1] = '\0';
+
+       if (strcmp(req, TDREMUS_WRITE) == 0) {
+               server_do_wreq(driver);
+               return;
+       }
+       if (strcmp(req, TDREMUS_SUBMIT) == 0) {
+               server_do_sreq(driver);
+               return;
+       }
+       if (strcmp(req, TDREMUS_COMMIT) == 0) {
+               server_do_creq(driver);
+               return;
+       }
+
+       RPRINTF("unknown request received: %s\n", req);
+}
+
+/* unprotected */
+
+/*
+ * Read handler for unprotected mode. While buffered writes are still
+ * outstanding the request is completed with -EBUSY (kicking the flush
+ * again if nothing is in flight); otherwise the read is forwarded.
+ */
+void unprotected_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+       if (!server_writes_inflight(driver)) {
+               /* here we just pass reads through */
+               td_forward_request(treq);
+               return;
+       }
+
+       /* wait for previous ramdisk to flush before servicing reads.
+        * for now lets just return EBUSY.
+        * if there are any left-over requests in prev,
+        * kick em again.
+        */
+       if (!s->ramdisk.inflight) /* nothing in inprogress */
+               ramdisk_flush(driver, s);
+
+       td_complete_request(treq, -EBUSY);
+}
+
+/* For a recoverable remus solution we need to log unprotected writes here */
+/*
+ * Write handler for unprotected mode. While buffered writes are still
+ * outstanding the request is completed with -EBUSY (kicking the flush
+ * again if nothing is in flight); otherwise the write is forwarded.
+ */
+void unprotected_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+       if (!server_writes_inflight(driver)) {
+               /* NOTE: DRBD style bitmap tracking could go here */
+               td_forward_request(treq);
+               return;
+       }
+
+       /* wait for previous ramdisk to flush */
+       RPRINTF("queue_write: waiting for queue to drain");
+       if (!s->ramdisk.inflight) /* nothing in inprogress. Kick prev */
+               ramdisk_flush(driver, s);
+
+       td_complete_request(treq, -EBUSY);
+}
+
+/*
+ * Enter unprotected (passthrough) mode: shut down the replication stream
+ * and the listening socket, then install the unprotected read/write
+ * handlers. Always returns 0.
+ */
+static int unprotected_start(td_driver_t *driver)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+       RPRINTF("failure detected, activating passthrough\n");
+
+       /* close the replication stream */
+       close_stream_fd(s);
+
+       /* unregister and close the server socket */
+       tapdisk_server_unregister_event(s->server_fd.id);
+       close(s->server_fd.fd);
+       s->server_fd.fd = -1;
+
+       /* install the unprotected read/write handlers */
+       tapdisk_remus.td_queue_read = unprotected_queue_read;
+       tapdisk_remus.td_queue_write = unprotected_queue_write;
+
+       return 0;
+}
+
+
+/* control */
+
+/*
+ * Resolve addr with gethostbyname() and store the first IPv4 address, in
+ * network byte order, in ia.
+ *
+ * Returns 0 on success, -1 when the lookup fails or does not yield an
+ * IPv4 address.
+ */
+static inline int resolve_address(const char* addr, struct in_addr* ia)
+{
+       struct hostent* he;
+
+       if (!(he = gethostbyname(addr))) {
+               RPRINTF("error resolving %s: %d\n", addr, h_errno);
+               return -1;
+       }
+
+       /* only a 4-byte AF_INET address can be copied into ia */
+       if (!he->h_addr_list[0] || he->h_addrtype != AF_INET ||
+           he->h_length != (int)sizeof(ia->s_addr)) {
+               RPRINTF("no address found for %s\n", addr);
+               return -1;
+       }
+
+       /* network byte order; memcpy avoids reading the char buffer through
+        * a uint32_t pointer */
+       memcpy(&ia->s_addr, he->h_addr_list[0], sizeof(ia->s_addr));
+
+       return 0;
+}
+
+/*
+ * Parse name as "host:port", resolve it with getaddrinfo() and store the
+ * first IPv4 result in state->sa.
+ *
+ * Returns 0 on success, -ENOENT when the ':' separator is missing, the
+ * lookup fails or no IPv4 address is found, and -ENOMEM when the host part
+ * cannot be duplicated.
+ */
+static int get_args(td_driver_t *driver, const char* name)
+{
+       struct tdremus_state *state = (struct tdremus_state *)driver->data;
+       char* host;
+       char* port;
+
+       int gai_status;
+       int valid_addr;
+       struct addrinfo gai_hints;
+       struct addrinfo *servinfo, *servinfo_itr;
+
+       memset(&gai_hints, 0, sizeof gai_hints);
+       gai_hints.ai_family = AF_UNSPEC;
+       gai_hints.ai_socktype = SOCK_STREAM;
+
+       port = strchr(name, ':');
+       if (!port) {
+               RPRINTF("missing host in %s\n", name);
+               return -ENOENT;
+       }
+       if (!(host = strndup(name, port - name))) {
+               RPRINTF("unable to allocate host\n");
+               return -ENOMEM;
+       }
+       port++;
+
+       gai_status = getaddrinfo(host, port, &gai_hints, &servinfo);
+       /* host is only needed for the lookup; release it on every path */
+       free(host);
+       if (gai_status != 0) {
+               RPRINTF("getaddrinfo error: %s\n", gai_strerror(gai_status));
+               return -ENOENT;
+       }
+
+       /* TODO: do something smarter here */
+       valid_addr = 0;
+       for(servinfo_itr = servinfo; servinfo_itr != NULL; servinfo_itr = servinfo_itr->ai_next) {
+               /* state->sa is a sockaddr_in, so only IPv4 results fit */
+               if (servinfo_itr->ai_family == AF_INET) {
+                       valid_addr = 1;
+                       memset(&state->sa, 0, sizeof(state->sa));
+                       state->sa = *(struct sockaddr_in *)servinfo_itr->ai_addr;
+                       break;
+               }
+       }
+       freeaddrinfo(servinfo);
+
+       if (!valid_addr)
+               return -ENOENT;
+
+       RPRINTF("host: %s, port: %d\n", inet_ntoa(state->sa.sin_addr), ntohs(state->sa.sin_port));
+
+       return 0;
+}
+
+/*
+ * Change the driver's operating mode.
+ *
+ * Does nothing when the requested mode is already active. Otherwise the
+ * current flush hook, if any, is run first; if it fails the target mode is
+ * replaced by mode_unprotected. The matching *_start() routine is then
+ * called and s->mode is updated only when it returns 0.
+ *
+ * Returns 0 on success or no-op, otherwise the start routine's result
+ * (-1 for an unknown mode).
+ */
+static int switch_mode(td_driver_t *driver, enum tdremus_mode mode)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+       int rc;
+
+       if (s->mode == mode)
+               return 0;
+
+       if (s->queue_flush && (rc = s->queue_flush(driver)) < 0) {
+               // fall back to unprotected mode on error
+               RPRINTF("switch_mode: error flushing queue (old: %d, new: %d)", s->mode, mode);
+               mode = mode_unprotected;
+       }
+
+       switch (mode) {
+       case mode_unprotected:
+               rc = unprotected_start(driver);
+               break;
+       case mode_primary:
+               rc = primary_start(driver);
+               break;
+       case mode_backup:
+               rc = backup_start(driver);
+               break;
+       default:
+               RPRINTF("unknown mode requested: %d\n", mode);
+               rc = -1;
+               break;
+       }
+
+       if (rc == 0)
+               s->mode = mode;
+
+       return rc;
+}
+
+/*
+ * Event callback for the control FIFO. Reads one message; a message
+ * starting with "flush" runs the current flush hook and reports
+ * TDREMUS_FAIL through ctl_respond() if the hook returns non-zero. A
+ * zero-byte read closes and reopens the FIFO.
+ */
+static void ctl_request(event_id_t id, char mode, void *private)
+{
+       struct tdremus_state *s = (struct tdremus_state *)private;
+       td_driver_t *driver = s->tdremus_driver;
+       char msg[80];
+       int rc;
+
+       rc = read(s->ctl_fd.fd, msg, sizeof(msg) - 1 /* append nul */);
+
+       if (rc < 0) {
+               RPRINTF("error reading from FIFO: %d\n", errno);
+               return;
+       }
+
+       if (rc == 0) {
+               RPRINTF("0-byte read received, reopening FIFO\n");
+               /*TODO: we may have to unregister/re-register with tapdisk_server */
+               close(s->ctl_fd.fd);
+               RPRINTF("FIFO closed\n");
+               s->ctl_fd.fd = open(s->ctl_path, O_RDWR);
+               if (s->ctl_fd.fd < 0)
+                       RPRINTF("error reopening FIFO: %d\n", errno);
+               return;
+       }
+
+       /* TODO: need to get driver somehow */
+       msg[rc] = '\0';
+
+       if (strncmp(msg, "flush", 5) != 0) {
+               RPRINTF("unknown command: %s\n", msg);
+               return;
+       }
+
+       if (s->queue_flush && s->queue_flush(driver)) {
+               RPRINTF("error passing flush request to backup");
+               ctl_respond(s, TDREMUS_FAIL);
+       }
+}
+
+/*
+ * Write @response to the message FIFO.
+ *
+ * Returns the result of write(): the byte count on success, negative on
+ * failure.  After a failed write the FIFO is closed and reopened so that a
+ * later response can be attempted.
+ */
+static int ctl_respond(struct tdremus_state *s, const char *response)
+{
+       int written = write(s->msg_fd.fd, response, strlen(response));
+
+       if (written >= 0)
+               return written;
+
+       RPRINTF("error writing notification: %d\n", errno);
+       close(s->msg_fd.fd);
+       s->msg_fd.fd = open(s->msg_path, O_RDWR);
+       if (s->msg_fd.fd < 0)
+               RPRINTF("error reopening FIFO: %d\n", errno);
+
+       return written;
+}
+
+/*
+ * Create and open the control and message FIFOs for this device.
+ *
+ * Must be called after the underlying driver has been initialized.
+ *
+ * The control FIFO path is BLKTAP_CTRL_DIR "/remus_<name>" with every ':'
+ * and '/' of <name> replaced by '_'; the message FIFO is that path with
+ * ".msg" appended.  On success s->ctl_path, s->msg_path, s->ctl_fd.fd and
+ * s->msg_fd.fd are valid and 0 is returned.  On failure -1 is returned,
+ * both path pointers are NULL and s->ctl_fd.fd is -1.
+ */
+static int ctl_open(td_driver_t *driver, const char* name)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+       int i, l;
+
+       /* first we must ensure that BLKTAP_CTRL_DIR exists */
+       if (mkdir(BLKTAP_CTRL_DIR, 0755) && errno != EEXIST)
+       {
+               DPRINTF("error creating directory %s: %d\n", BLKTAP_CTRL_DIR, errno);
+               return -1;
+       }
+
+       /* use the device name to create the control fifo path */
+       if (asprintf(&s->ctl_path, BLKTAP_CTRL_DIR "/remus_%s", name) < 0) {
+               /* asprintf leaves the pointer undefined on failure */
+               s->ctl_path = NULL;
+               return -1;
+       }
+       /* scrub fifo pathname  */
+       for (i = strlen(BLKTAP_CTRL_DIR) + 1, l = strlen(s->ctl_path); i < l; i++) {
+               if (strchr(":/", s->ctl_path[i]))
+                       s->ctl_path[i] = '_';
+       }
+       if (asprintf(&s->msg_path, "%s.msg", s->ctl_path) < 0) {
+               /* do not let a later cleanup free an undefined pointer */
+               s->msg_path = NULL;
+               goto err_ctlfifo;
+       }
+
+       if (mkfifo(s->ctl_path, S_IRWXU|S_IRWXG|S_IRWXO) && errno != EEXIST) {
+               RPRINTF("error creating control FIFO %s: %d\n", s->ctl_path, errno);
+               goto err_msgfifo;
+       }
+
+       if (mkfifo(s->msg_path, S_IRWXU|S_IRWXG|S_IRWXO) && errno != EEXIST) {
+               RPRINTF("error creating message FIFO %s: %d\n", s->msg_path, errno);
+               goto err_msgfifo;
+       }
+
+       /* RDWR so that fd doesn't block select when no writer is present */
+       if ((s->ctl_fd.fd = open(s->ctl_path, O_RDWR)) < 0) {
+               RPRINTF("error opening control FIFO %s: %d\n", s->ctl_path, errno);
+               goto err_msgfifo;
+       }
+
+       if ((s->msg_fd.fd = open(s->msg_path, O_RDWR)) < 0) {
+               RPRINTF("error opening message FIFO %s: %d\n", s->msg_path, errno);
+               goto err_openctlfifo;
+       }
+
+       RPRINTF("control FIFO %s\n", s->ctl_path);
+       RPRINTF("message FIFO %s\n", s->msg_path);
+
+       return 0;
+
+ err_openctlfifo:
+       close(s->ctl_fd.fd);
+       /* mark as closed so nobody closes the descriptor a second time */
+       s->ctl_fd.fd = -1;
+ err_msgfifo:
+       free(s->msg_path);
+       s->msg_path = NULL;
+ err_ctlfifo:
+       free(s->ctl_path);
+       s->ctl_path = NULL;
+       return -1;
+}
+
+/*
+ * Tear down the control channel set up by ctl_open(): close both FIFO
+ * descriptors, remove the FIFOs from the filesystem and release the path
+ * strings.  Safe to call when some or all of these were never set up,
+ * provided the descriptors hold -1 and the paths hold NULL in that case.
+ *
+ * NOTE(review): the event registered by ctl_register() is not unregistered
+ * here -- confirm whether the caller or the scheduler takes care of that.
+ */
+static void ctl_close(td_driver_t *driver)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+       /* TODO: close *all* connections */
+
+       /* -1, not 0, is the "not open" value (see tdremus_open) */
+       if (s->ctl_fd.fd >= 0) {
+               close(s->ctl_fd.fd);
+               s->ctl_fd.fd = -1;
+       }
+
+       /* the message FIFO is opened alongside the control FIFO */
+       if (s->msg_fd.fd >= 0) {
+               close(s->msg_fd.fd);
+               s->msg_fd.fd = -1;
+       }
+
+       if (s->ctl_path) {
+               unlink(s->ctl_path);
+               free(s->ctl_path);
+               s->ctl_path = NULL;
+       }
+       if (s->msg_path) {
+               unlink(s->msg_path);
+               free(s->msg_path);
+               s->msg_path = NULL;
+       }
+}
+
+/*
+ * Register the control FIFO descriptor with the tapdisk server so that
+ * ctl_request() is invoked when it becomes readable.  The event id is kept
+ * in s->ctl_fd.id.  Returns 0 on success, -1 if registration failed.
+ */
+static int ctl_register(struct tdremus_state *s)
+{
+       RPRINTF("registering ctl fifo\n");
+
+       s->ctl_fd.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                                    s->ctl_fd.fd, 0,
+                                                    ctl_request, s);
+       if (s->ctl_fd.id >= 0)
+               return 0;
+
+       RPRINTF("error registering ctrl FIFO %s: %d\n", s->ctl_path, s->ctl_fd.id);
+       return -1;
+}
+
+/* interface */
+
+/*
+ * td_open hook of the remus driver.
+ *
+ * Resets the private state, parses @name via get_args(), sets up and
+ * registers the control FIFOs, then tries to bind as backup; a remus_bind()
+ * result of -2 selects primary mode instead.
+ *
+ * Returns 0 on success.  Failures of get_args(), ctl_open() or
+ * ctl_register() are returned as-is; any later failure tears the driver
+ * down with tdremus_close() and returns -EIO.  @flags is unused.
+ */
+static int tdremus_open(td_driver_t *driver, const char *name,
+                       td_flag_t flags)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+       int rc;
+
+       RPRINTF("opening %s\n", name);
+
+       /* first we need to get the underlying vbd for this driver stack. To do so we
+        * need to know the vbd's id. Fortunately, for tapdisk2 this is hard-coded as
+        * 0 (see tapdisk2.c)
+        */
+       device_vbd = tapdisk_server_get_vbd(0);
+
+       memset(s, 0, sizeof(*s));
+       s->server_fd.fd = -1;
+       s->stream_fd.fd = -1;
+       s->ctl_fd.fd = -1;
+       s->msg_fd.fd = -1;
+
+       /* TODO: this is only needed so that the server can send writes down
+        * the driver stack from the stream_fd event handler */
+       s->tdremus_driver = driver;
+
+       /* parse name to get info etc */
+       if ((rc = get_args(driver, name)))
+               return rc;
+
+       if ((rc = ctl_open(driver, name))) {
+               RPRINTF("error setting up control channel\n");
+               free(s->driver_data);
+               s->driver_data = NULL;
+               return rc;
+       }
+
+       if ((rc = ctl_register(s))) {
+               RPRINTF("error registering control channel\n");
+               /* ctl_open() succeeded: release its FIFOs, fds and paths */
+               ctl_close(driver);
+               free(s->driver_data);
+               s->driver_data = NULL;
+               return rc;
+       }
+
+       if (!(rc = remus_bind(s)))
+               rc = switch_mode(driver, mode_backup);
+       else if (rc == -2)
+               rc = switch_mode(driver, mode_primary);
+
+       if (!rc)
+               return 0;
+
+       tdremus_close(driver);
+       return -EIO;
+}
+
+/*
+ * td_close hook of the remus driver: release the in-progress table, the
+ * driver data, the server and stream sockets and the control channel.
+ * Every released resource is reset (NULL / -1) so that a repeated call
+ * does not release it twice.  Always returns 0.
+ */
+static int tdremus_close(td_driver_t *driver)
+{
+       struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+       RPRINTF("closing\n");
+       if (s->ramdisk.inprogress) {
+               hashtable_destroy(s->ramdisk.inprogress, 0);
+               /* do not keep a pointer to the destroyed table */
+               s->ramdisk.inprogress = NULL;
+       }
+
+       if (s->driver_data) {
+               free(s->driver_data);
+               s->driver_data = NULL;
+       }
+       if (s->server_fd.fd >= 0) {
+               close(s->server_fd.fd);
+               s->server_fd.fd = -1;
+       }
+       if (s->stream_fd.fd >= 0)
+               close_stream_fd(s);
+
+       ctl_close(driver);
+
+       return 0;
+}
+
+/*
+ * td_get_parent_id hook.  A remus disk is not expected to have a parent
+ * (for now), so the request is always rejected with -EINVAL and @id is
+ * left untouched.
+ */
+static int tdremus_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       (void)driver;
+       (void)id;
+
+       return -EINVAL;
+}
+
+/*
+ * td_validate_parent hook.  Performs no checks: every parent is reported
+ * as acceptable (0) regardless of the arguments.
+ */
+static int tdremus_validate_parent(td_driver_t *driver,
+                                  td_driver_t *pdriver, td_flag_t flags)
+{
+       (void)driver;
+       (void)pdriver;
+       (void)flags;
+
+       return 0;
+}
+
+struct tap_disk tapdisk_remus = {      /* hook table describing the remus disk type */
+       .disk_type          = "tapdisk_remus",
+       .private_data_size  = sizeof(struct tdremus_state),     /* the hooks cast driver->data to this type */
+       .td_open            = tdremus_open,
+       .td_queue_read      = unprotected_queue_read,   /* initial I/O handlers are the unprotected-mode ones */
+       .td_queue_write     = unprotected_queue_write,
+       .td_close           = tdremus_close,
+       .td_get_parent_id   = tdremus_get_parent_id,    /* always fails with -EINVAL */
+       .td_validate_parent = tdremus_validate_parent,  /* accepts any parent */
+       .td_debug           = NULL,                     /* no debug hook provided */
+};
diff --git a/tools/blktap2/drivers/block-vhd.c b/tools/blktap2/drivers/block-vhd.c
new file mode 100644 (file)
index 0000000..f7853f9
--- /dev/null
@@ -0,0 +1,2322 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * A note on write transactions:
+ * Writes that require updating the BAT or bitmaps cannot be signaled
+ * as complete until all updates have reached disk.  Transactions are
+ * used to ensure proper ordering in these cases.  The two types of
+ * transactions are as follows:
+ *   - Bitmap updates only: data writes that require updates to the same
+ *     bitmap are grouped in a transaction.  Only after all data writes
+ *     in a transaction complete does the bitmap write commence.  Only
+ *     after the bitmap write finishes are the data writes signalled as
+ *     complete.
+ *   - BAT and bitmap updates: data writes are grouped in transactions
+ *     as above, but a special extra write is included in the transaction,
+ *     which zeros out the newly allocated bitmap on disk.  When the data
+ *     writes and the zero-bitmap write complete, the BAT and bitmap writes
+ *     are started in parallel.  The transaction is completed only after both
+ *     the BAT and bitmap writes successfully return.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>    /* for memset.                                 */
+#include <libaio.h>
+#include <sys/mman.h>
+
+#include "libvhd.h"
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-disktype.h"
+
+unsigned int SPB;      /* sectors per block; overwritten by every __vhd_open() */
+/*
+ * Build-time switches.  DEBUGGING selects the backend of DBG/ERR/TRACE
+ * below (1: DPRINTF, 2: tlog, anything else: compiled out); ASSERTING == 1
+ * enables ASSERT().
+ */
+#define DEBUGGING   2
+#define ASSERTING   1
+#define MICROSOFT_COMPAT
+
+#define VHD_BATMAP_MAX_RETRIES 10      /* attempts at reading the batmap in vhd_initialize_bat() */
+/* Log the request counters of vhd_state *s through DBG at TLOG_DBG level. */
+#define __TRACE(s)                                                     \
+       do {                                                            \
+               DBG(TLOG_DBG, "%s: QUEUED: %" PRIu64 ", COMPLETED: %"   \
+                   PRIu64", RETURNED: %" PRIu64 ", DATA_ALLOCATED: "   \
+                   "%lu, BBLK: 0x%04x\n",                              \
+                   s->vhd.file, s->queued, s->completed, s->returned,  \
+                   VHD_REQS_DATA - s->vreq_free_count,                 \
+                   s->bat.pbw_blk);                                    \
+       } while(0)
+/*
+ * Report a failed assertion through DPRINTF and DBG, flush the log and
+ * abort().  NOTE(review): this expands to a bare if-block rather than
+ * do { } while (0), so it must not be used as the unbraced body of an
+ * if/else.
+ */
+#define __ASSERT(_p)                                                   \
+       if (!(_p)) {                                                    \
+               DPRINTF("%s:%d: FAILED ASSERTION: '%s'\n",              \
+                       __FILE__, __LINE__, #_p);                       \
+               DBG(TLOG_WARN, "%s:%d: FAILED ASSERTION: '%s'\n",       \
+                   __FILE__, __LINE__, #_p);                           \
+               tlog_flush();                                           \
+               abort();                                                \
+       }
+/* DBG(level, fmt, ...), ERR(err, fmt, ...) and TRACE(state), per DEBUGGING */
+#if (DEBUGGING == 1)
+  #define DBG(level, _f, _a...)      DPRINTF(_f, ##_a)
+  #define ERR(err, _f, _a...)        DPRINTF("ERROR: %d: " _f, err, ##_a)
+  #define TRACE(s)                   ((void)0)
+#elif (DEBUGGING == 2)
+  #define DBG(level, _f, _a...)      tlog_write(level, _f, ##_a)
+  #define ERR(err, _f, _a...)       tlog_error(err, _f, ##_a)
+  #define TRACE(s)                   __TRACE(s)
+#else
+  #define DBG(level, _f, _a...)      ((void)0)
+  #define ERR(err, _f, _a...)        ((void)0)
+  #define TRACE(s)                   ((void)0)
+#endif
+/* ASSERT(p): __ASSERT when ASSERTING == 1, otherwise a no-op expression */
+#if (ASSERTING == 1)
+  #define ASSERT(_p)                 __ASSERT(_p)
+#else
+  #define ASSERT(_p)                 ((void)0)
+#endif
+
+/******VHD DEFINES******/
+#define VHD_CACHE_SIZE               32        /* number of struct vhd_bitmap slots per vhd_state */
+/*
+ * Request pool sizes.  VHD_REQS_DATA sizes vhd_state.vreq_list.
+ * VHD_REQS_META is presumably one request per cached bitmap plus the two
+ * requests embedded in vhd_bat_state -- TODO confirm.
+ */
+#define VHD_REQS_DATA                TAPDISK_DATA_REQUESTS
+#define VHD_REQS_META                (VHD_CACHE_SIZE + 2)
+#define VHD_REQS_TOTAL               (VHD_REQS_DATA + VHD_REQS_META)
+/* operation codes; presumably the values of vhd_request.op -- confirm */
+#define VHD_OP_BAT_WRITE             0
+#define VHD_OP_DATA_READ             1
+#define VHD_OP_DATA_WRITE            2
+#define VHD_OP_BITMAP_READ           3
+#define VHD_OP_BITMAP_WRITE          4
+#define VHD_OP_ZERO_BM_WRITE         5
+/* block/bitmap lookup states; their users are not in this part of the file */
+#define VHD_BM_BAT_LOCKED            0
+#define VHD_BM_BAT_CLEAR             1
+#define VHD_BM_BIT_CLEAR             2
+#define VHD_BM_BIT_SET               3
+#define VHD_BM_NOT_CACHED            4
+#define VHD_BM_READ_PENDING          5
+/* open flags kept in vhd_state.flags; derived from TD_OPEN_* by _vhd_open() */
+#define VHD_FLAG_OPEN_RDONLY         1
+#define VHD_FLAG_OPEN_NO_CACHE       2
+#define VHD_FLAG_OPEN_QUIET          4
+#define VHD_FLAG_OPEN_STRICT         8
+#define VHD_FLAG_OPEN_QUERY          16
+#define VHD_FLAG_OPEN_PREALLOCATE    32
+/* presumably bits of vhd_bat_state.status -- confirm */
+#define VHD_FLAG_BAT_LOCKED          1
+#define VHD_FLAG_BAT_WRITE_STARTED   2
+/* presumably bits of vhd_bitmap.status -- confirm */
+#define VHD_FLAG_BM_UPDATE_BAT       1
+#define VHD_FLAG_BM_WRITE_PENDING    2
+#define VHD_FLAG_BM_READ_PENDING     4
+#define VHD_FLAG_BM_LOCKED           8
+/* presumably bits of vhd_request.flags -- confirm */
+#define VHD_FLAG_REQ_UPDATE_BAT      1
+#define VHD_FLAG_REQ_UPDATE_BITMAP   2
+#define VHD_FLAG_REQ_QUEUED          4
+#define VHD_FLAG_REQ_FINISHED        8
+/* presumably bits of vhd_transaction.status -- confirm */
+#define VHD_FLAG_TX_LIVE             1
+#define VHD_FLAG_TX_UPDATE_BAT       2
+/* all flag words are 8 bits wide, so flag values must stay below 256 */
+typedef uint8_t vhd_flag_t;
+
+struct vhd_state;
+struct vhd_request;
+
+struct vhd_req_list {                  /* head/tail pair for a list of requests;
+                                        * presumably chained through
+                                        * vhd_request.next -- confirm */
+       struct vhd_request       *head;
+       struct vhd_request       *tail;
+};
+
+struct vhd_transaction {               /* group of writes completed together; see
+                                        * "A note on write transactions" in the
+                                        * file header */
+       int                       error;
+       int                       closed;
+       int                       started;     /* presumably a count of member requests
+                                               * issued -- confirm */
+       int                       finished;    /* presumably a count of member requests
+                                               * completed -- confirm */
+       vhd_flag_t                status;      /* presumably VHD_FLAG_TX_* bits -- confirm */
+       struct vhd_req_list       requests;
+};
+
+struct vhd_request {                   /* one request tracked by the VHD driver */
+       int                       error;
+       uint8_t                   op;          /* presumably a VHD_OP_* code -- confirm */
+       vhd_flag_t                flags;       /* presumably VHD_FLAG_REQ_* bits -- confirm */
+       td_request_t              treq;
+       struct tiocb              tiocb;
+       struct vhd_state         *state;
+       struct vhd_request       *next;       /* link for struct vhd_req_list */
+       struct vhd_transaction   *tx;
+};
+
+struct vhd_bat_state {                 /* NOTE: not the same type as the vhd_bat_t
+                                        * embedded below; size it with
+                                        * sizeof(struct vhd_bat_state) */
+       vhd_bat_t                 bat;
+       vhd_batmap_t              batmap;
+       vhd_flag_t                status;      /* presumably VHD_FLAG_BAT_* bits -- confirm */
+       uint32_t                  pbw_blk;     /* blk num of pending write */
+       uint64_t                  pbw_offset;  /* file offset of same */
+       struct vhd_request        req;         /* for writing bat table */
+       struct vhd_request        zero_req;    /* for initializing bitmaps */
+       char                     *bat_buf;     /* VHD_SECTOR_SIZE-aligned buffer of one
+                                               * sector (see vhd_initialize_bat) */
+};
+
+struct vhd_bitmap {                    /* one slot of the per-state bitmap cache */
+       u32                       blk;
+       u64                       seqno;       /* lru sequence number */
+       vhd_flag_t                status;      /* presumably VHD_FLAG_BM_* bits -- confirm */
+
+       char                     *map;         /* map should only be modified
+                                               * in finish_bitmap_write */
+       char                     *shadow;      /* in-memory bitmap changes are 
+                                               * made to shadow and copied to
+                                               * map only after having been
+                                               * flushed to disk */
+       struct vhd_transaction    tx;          /* transaction data structure
+                                               * encapsulating data, bitmap, 
+                                               * and bat writes */
+       struct vhd_req_list       queue;       /* data writes waiting for next
+                                               * transaction */
+       struct vhd_req_list       waiting;     /* pending requests that cannot
+                                               * be serviced until this bitmap
+                                               * is read from disk */
+       struct vhd_request        req;
+};
+
+struct vhd_state {                     /* private data of one open VHD driver */
+       vhd_flag_t                flags;       /* VHD_FLAG_OPEN_* bits given to __vhd_open */
+
+        /* VHD stuff */
+       vhd_context_t             vhd;
+       u32                       spp;         /* sectors per page */
+        u32                       spb;         /* sectors per block */
+        u64                       next_db;     /* pointer to the next 
+                                               * (unallocated) datablock */
+
+       struct vhd_bat_state      bat;
+
+       u64                       bm_lru;      /* lru sequence number */
+       u32                       bm_secs;     /* size of bitmap, in sectors */
+       struct vhd_bitmap        *bitmap[VHD_CACHE_SIZE];
+
+       int                       bm_free_count;
+       struct vhd_bitmap        *bitmap_free[VHD_CACHE_SIZE]; /* points into bitmap_list */
+       struct vhd_bitmap         bitmap_list[VHD_CACHE_SIZE]; /* backing storage of the cache */
+
+       int                       vreq_free_count;
+       struct vhd_request       *vreq_free[VHD_REQS_DATA];    /* points into vreq_list */
+       struct vhd_request        vreq_list[VHD_REQS_DATA];    /* backing storage of data requests */
+
+       td_driver_t              *driver;      /* owning driver, set in __vhd_open */
+
+       uint64_t                  queued;      /* counters reported by __TRACE */
+       uint64_t                  completed;
+       uint64_t                  returned;
+       uint64_t                  reads;
+       uint64_t                  read_size;
+       uint64_t                  writes;      /* non-zero makes _vhd_close rewrite the footer */
+       uint64_t                  write_size;
+};
+
+#define test_vhd_flag(word, flag)  ((word) & (flag))   /* non-zero if any bit of flag is set */
+#define set_vhd_flag(word, flag)   ((word) |= (flag))  /* modifies word in place */
+#define clear_vhd_flag(word, flag) ((word) &= ~(flag)) /* modifies word in place */
+/* entry for block blk in the in-memory copy of the BAT of state s */
+#define bat_entry(s, blk)          ((s)->bat.bat.bat[(blk)])
+/* forward declarations */
+static void vhd_complete(void *, struct tiocb *, int);
+static void finish_data_transaction(struct vhd_state *, struct vhd_bitmap *);
+/*
+ * Process-wide read-only mapping handed out by vhd_zeros().  It is created
+ * by the first vhd_initialize() call; _vhd_master records the state that
+ * created it and is the only state for which vhd_free() unmaps it.
+ */
+static struct vhd_state  *_vhd_master;
+static unsigned long      _vhd_zsize;  /* size of the mapping in bytes */
+static char              *_vhd_zeros;  /* start of the mapping, NULL when absent */
+
+/*
+ * Create the shared zero mapping if it does not exist yet and record @s as
+ * its owner.  The mapping is two pages, plus VHD_BLOCK_SIZE when @s was
+ * opened with VHD_FLAG_OPEN_PREALLOCATE.
+ *
+ * Returns 0 on success (including when the mapping already exists) or
+ * -errno from mmap().
+ *
+ * NOTE(review): an existing mapping is reused as-is, so it is not grown
+ * when a later state asks for preallocation and the first one did not.
+ */
+static int
+vhd_initialize(struct vhd_state *s)
+{
+       int err;
+
+       if (_vhd_zeros)
+               return 0;
+
+       _vhd_zsize = 2 * getpagesize();
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
+               _vhd_zsize += VHD_BLOCK_SIZE;
+
+       _vhd_zeros = mmap(0, _vhd_zsize, PROT_READ,
+                         MAP_SHARED | MAP_ANON, -1, 0);
+       if (_vhd_zeros == MAP_FAILED) {
+               /* capture errno before logging can overwrite it */
+               err = -errno;
+               EPRINTF("vhd_initialize failed: %d\n", err);
+               _vhd_zeros = NULL;
+               _vhd_zsize = 0;
+               return err;
+       }
+
+       _vhd_master = s;
+       return 0;
+}
+
+/*
+ * Drop the shared zero mapping, but only when @s is the state that created
+ * it and the mapping still exists; otherwise do nothing.
+ */
+static void
+vhd_free(struct vhd_state *s)
+{
+       if (_vhd_master == s && _vhd_zeros) {
+               munmap(_vhd_zeros, _vhd_zsize);
+               _vhd_master = NULL;
+               _vhd_zeros  = NULL;
+               _vhd_zsize  = 0;
+       }
+}
+
+static char *
+_get_vhd_zeros(const char *func, unsigned long size)
+{
+       if (!_vhd_zeros || _vhd_zsize < size) {
+               EPRINTF("invalid zero request from %s: %lu, %lu, %p\n",
+                       func, size, _vhd_zsize, _vhd_zeros);
+               ASSERT(0);
+       }
+
+       return _vhd_zeros;
+}
+
+#define vhd_zeros(size)        _get_vhd_zeros(__func__, size)
+
+/*
+ * Mark block @blk in the batmap and log it as completely full.  Does
+ * nothing when no batmap is loaded.
+ */
+static inline void
+set_batmap(struct vhd_state *s, uint32_t blk)
+{
+       if (!s->bat.batmap.map)
+               return;
+
+       vhd_batmap_set(&s->vhd, &s->bat.batmap, blk);
+       DBG(TLOG_DBG, "block 0x%x completely full\n", blk);
+}
+
+/*
+ * Return the batmap bit of block @blk as reported by vhd_batmap_test(), or
+ * 0 when no batmap is loaded.
+ */
+static inline int
+test_batmap(struct vhd_state *s, uint32_t blk)
+{
+       return s->bat.batmap.map
+               ? vhd_batmap_test(&s->vhd, &s->bat.batmap, blk)
+               : 0;
+}
+
+/*
+ * Overwrite the last 512 bytes of the file (the footer) with the byte
+ * pattern 0xc7.  Fixed-size disks are left alone.
+ *
+ * Returns 0 on success or when nothing had to be done, a negative errno
+ * value when allocation, seeking or writing fails, and -EIO for a short
+ * write.
+ */
+static int
+vhd_kill_footer(struct vhd_state *s)
+{
+       int err;
+       off_t end;
+       ssize_t n;
+       char *zeros;
+
+       if (s->vhd.footer.type == HD_TYPE_FIXED)
+               return 0;
+
+       /* presumably aligned because the file may be opened for direct
+        * I/O -- confirm */
+       err = posix_memalign((void **)&zeros, 512, 512);
+       if (err)
+               return -err;
+
+       /* despite its name the buffer holds a recognisable non-zero pattern */
+       memset(zeros, 0xc7, 512);
+
+       end = lseek(s->vhd.fd, 0, SEEK_END);
+       if (end == (off_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       if (lseek(s->vhd.fd, (end - 512), SEEK_SET) == (off_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       n = write(s->vhd.fd, zeros, 512);
+       if (n != 512) {
+               /* errno is only meaningful when write() itself failed */
+               err = (n < 0 && errno) ? -errno : -EIO;
+               goto out;
+       }
+
+       err = 0;
+
+ out:
+       free(zeros);
+       return err;
+}
+
+/*
+ * Compute s->next_db, the sector at which the next data block would be
+ * placed: the rounded-up end of the headers, or just past the allocated
+ * block with the highest offset (its bitmap plus its data sectors),
+ * whichever is larger.
+ *
+ * Returns 0 on success or the error from vhd_end_of_headers().
+ */
+static inline int
+find_next_free_block(struct vhd_state *s)
+{
+       int err;
+       off_t eom;
+       uint32_t i, entry;
+
+       err = vhd_end_of_headers(&s->vhd, &eom);
+       if (err)
+               return err;
+
+       s->next_db = secs_round_up(eom);
+
+       for (i = 0; i < s->bat.bat.entries; i++) {
+               entry = bat_entry(s, i);
+               /* widen first: the sum of three 32-bit values can exceed
+                * 32 bits for blocks near the top of the offset range */
+               if (entry != DD_BLK_UNUSED && entry >= s->next_db)
+                       s->next_db = (u64)entry + s->spb + s->bm_secs;
+       }
+
+       return 0;
+}
+
+/*
+ * Release the BAT, the batmap and the BAT write buffer and reset the whole
+ * BAT state, so that calling this function again is harmless.
+ */
+static void
+vhd_free_bat(struct vhd_state *s)
+{
+       free(s->bat.bat.bat);
+       free(s->bat.batmap.map);
+       free(s->bat.bat_buf);
+       /* clear all of struct vhd_bat_state, not just its leading member,
+        * so no freed pointer survives */
+       memset(&s->bat, 0, sizeof(s->bat));
+}
+
+/*
+ * Load the BAT (and the batmap, if the image has one) and allocate the
+ * one-sector buffer used for BAT writes.  For writable opens the position
+ * of the next free data block is computed as well.
+ *
+ * A batmap read failure is fatal for writable opens.  For read-only opens
+ * the read is retried up to VHD_BATMAP_MAX_RETRIES times and a persistent
+ * failure is only logged.
+ *
+ * Returns 0 on success or a negative error code; on failure everything
+ * allocated here has been released.
+ */
+static int
+vhd_initialize_bat(struct vhd_state *s)
+{
+       int err, batmap_required, i;
+
+       memset(&s->bat, 0, sizeof(s->bat));
+
+       err = vhd_read_bat(&s->vhd, &s->bat.bat);
+       if (err) {
+               EPRINTF("%s: reading bat: %d\n", s->vhd.file, err);
+               return err;
+       }
+
+       batmap_required = 1;
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) {
+               batmap_required = 0;
+       } else {
+               err = find_next_free_block(s);
+               if (err)
+                       goto fail;
+       }
+
+       if (vhd_has_batmap(&s->vhd)) {
+               for (i = 0; i < VHD_BATMAP_MAX_RETRIES; i++) {
+                       err = vhd_read_batmap(&s->vhd, &s->bat.batmap);
+                       if (err) {
+                               EPRINTF("%s: reading batmap: %d\n",
+                                               s->vhd.file, err);
+                               if (batmap_required)
+                                       goto fail;
+                       } else {
+                               break;
+                       }
+               }
+               if (err)
+                       EPRINTF("%s: ignoring non-critical batmap error\n",
+                                       s->vhd.file);
+       }
+
+       err = posix_memalign((void **)&s->bat.bat_buf,
+                            VHD_SECTOR_SIZE, VHD_SECTOR_SIZE);
+       if (err) {
+               s->bat.bat_buf = NULL;
+               /* posix_memalign reports a positive errno value */
+               err = -err;
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       vhd_free_bat(s);
+       return err;
+}
+
+/*
+ * Release the map and shadow buffers of every cache slot, empty the free
+ * list and zero the slot array.
+ */
+static void
+vhd_free_bitmap_cache(struct vhd_state *s)
+{
+       int slot;
+
+       for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+               free(s->bitmap_list[slot].map);
+               free(s->bitmap_list[slot].shadow);
+               s->bitmap_free[slot] = NULL;
+       }
+
+       memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);
+}
+
+/*
+ * Set up the bitmap cache: give every slot a zeroed, 512-byte aligned map
+ * and shadow buffer of s->bm_secs sectors and put all slots on the free
+ * list.
+ *
+ * Returns 0 on success or a negative errno value when an allocation
+ * fails, in which case all buffers allocated so far have been released.
+ */
+static int
+vhd_initialize_bitmap_cache(struct vhd_state *s)
+{
+       int i, err, map_size;
+       struct vhd_bitmap *bm;
+
+       memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);
+
+       s->bm_lru        = 0;
+       map_size         = vhd_sectors_to_bytes(s->bm_secs);
+       s->bm_free_count = VHD_CACHE_SIZE;
+
+       for (i = 0; i < VHD_CACHE_SIZE; i++) {
+               bm = s->bitmap_list + i;
+
+               err = posix_memalign((void **)&bm->map, 512, map_size);
+               if (err) {
+                       bm->map = NULL;
+                       goto fail;
+               }
+
+               err = posix_memalign((void **)&bm->shadow, 512, map_size);
+               if (err) {
+                       bm->shadow = NULL;
+                       goto fail;
+               }
+
+               memset(bm->map, 0, map_size);
+               memset(bm->shadow, 0, map_size);
+               s->bitmap_free[i] = bm;
+       }
+
+       return 0;
+
+fail:
+       vhd_free_bitmap_cache(s);
+       /* posix_memalign reports a positive errno value */
+       return -err;
+}
+
+/*
+ * Prepare the state of a dynamic disk: read and check the header, derive
+ * the geometry (sectors per page, sectors per block, bitmap size) and,
+ * unless the disk was opened with VHD_FLAG_OPEN_NO_CACHE, load the BAT and
+ * set up the bitmap cache.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported header version, or the
+ * error of the step that failed.
+ */
+static int
+vhd_initialize_dynamic_disk(struct vhd_state *s)
+{
+       int rc = vhd_get_header(&s->vhd);
+
+       if (rc) {
+               if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+                       EPRINTF("Error reading VHD DD header.\n");
+               return rc;
+       }
+
+       if (s->vhd.header.hdr_ver != 0x00010000) {
+               EPRINTF("unsupported header version! (0x%x)\n",
+                       s->vhd.header.hdr_ver);
+               return -EINVAL;
+       }
+
+       s->spp     = getpagesize() >> VHD_SECTOR_SHIFT;
+       s->spb     = s->vhd.header.block_size >> VHD_SECTOR_SHIFT;
+       s->bm_secs = secs_round_up_no_zero(s->spb >> 3);
+
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_NO_CACHE))
+               return 0;
+
+       rc = vhd_initialize_bat(s);
+       if (rc)
+               return rc;
+
+       /* the BAT is of no use without the bitmap cache */
+       rc = vhd_initialize_bitmap_cache(s);
+       if (rc)
+               vhd_free_bat(s);
+
+       return rc;
+}
+
+/*
+ * Refuse images whose creator application starts with "tap" and whose
+ * creator version is newer than VHD_CURRENT_VERSION.  Images from other
+ * creators are accepted without a version check.
+ *
+ * Returns 0 when the image is acceptable, -EINVAL otherwise.
+ */
+static int
+vhd_check_version(struct vhd_state *s)
+{
+       if (strncmp(s->vhd.footer.crtr_app, "tap", 3) != 0)
+               return 0;
+
+       if (s->vhd.footer.crtr_ver <= VHD_CURRENT_VERSION)
+               return 0;
+
+       if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+               EPRINTF("WARNING: %s vhd creator version 0x%08x, "
+                       "but only versions up to 0x%08x are "
+                       "supported for IO\n", s->vhd.file,
+                       s->vhd.footer.crtr_ver, VHD_CURRENT_VERSION);
+
+       return -EINVAL;
+}
+
+/*
+ * Log a one-line summary of the image just opened: creator application and
+ * version and, for dynamic disks, the number of BAT entries (b), allocated
+ * blocks (a), blocks marked in the batmap (f) and the next free data block
+ * (n).  Silent when the disk was opened with VHD_FLAG_OPEN_QUIET.
+ */
+static void
+vhd_log_open(struct vhd_state *s)
+{
+       char buf[5];
+       uint32_t i, allocated, full;
+
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+               return;
+
+       /* cap the read at what fits in buf: the creator field is not
+        * guaranteed to carry a terminating nul */
+       snprintf(buf, sizeof(buf), "%.*s", (int)(sizeof(buf) - 1),
+                s->vhd.footer.crtr_app);
+       if (!vhd_type_dynamic(&s->vhd)) {
+               DPRINTF("%s version: %s 0x%08x\n",
+                       s->vhd.file, buf, s->vhd.footer.crtr_ver);
+               return;
+       }
+
+       allocated = 0;
+       full      = 0;
+
+       for (i = 0; i < s->bat.bat.entries; i++) {
+               if (bat_entry(s, i) != DD_BLK_UNUSED)
+                       allocated++;
+               if (test_batmap(s, i))
+                       full++;
+       }
+
+       DPRINTF("%s version: %s 0x%08x, b: %u, a: %u, f: %u, n: %"PRIu64"\n",
+               s->vhd.file, buf, s->vhd.footer.crtr_ver, s->bat.bat.entries,
+               allocated, full, s->next_db);
+}
+
+/*
+ * Open VHD image @name for @driver with the VHD_FLAG_OPEN_* bits in @flags.
+ *
+ * Resets the private state, makes sure the shared zero mapping exists,
+ * opens the image (retrying once with libvhd logging enabled), checks the
+ * creator version, initialises the dynamic-disk structures if needed, sets
+ * up the free list of data requests and fills in driver->info.  For strict
+ * writable opens the on-disk footer is overwritten and counted as a write,
+ * which makes _vhd_close() rewrite it.
+ *
+ * Returns 0 on success or a negative error code; on failure everything
+ * acquired here has been released again.
+ */
+static int
+__vhd_open(td_driver_t *driver, const char *name, vhd_flag_t flags)
+{
+       int i, o_flags, err;
+       struct vhd_state *s;
+
+       DBG(TLOG_INFO, "vhd_open: %s\n", name);
+       if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT))
+               libvhd_set_log_level(1);
+
+       s = (struct vhd_state *)driver->data;
+       memset(s, 0, sizeof(struct vhd_state));
+
+       s->flags  = flags;
+       s->driver = driver;
+
+       err = vhd_initialize(s);
+       if (err)
+               return err;
+
+       o_flags = ((test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) ?
+                  VHD_OPEN_RDONLY : VHD_OPEN_RDWR);
+
+       err = vhd_open(&s->vhd, name, o_flags);
+       if (err) {
+               /* try again with logging enabled to get diagnostics */
+               libvhd_set_log_level(1);
+               err = vhd_open(&s->vhd, name, o_flags);
+               if (err) {
+                       EPRINTF("Unable to open [%s] (%d)!\n", name, err);
+                       /* give the zero mapping back if this state
+                        * created it */
+                       vhd_free(s);
+                       return err;
+               }
+       }
+
+       err = vhd_check_version(s);
+       if (err)
+               goto fail;
+
+       s->spb = s->spp = 1;
+
+       if (vhd_type_dynamic(&s->vhd)) {
+               err = vhd_initialize_dynamic_disk(s);
+               if (err)
+                       goto fail;
+       }
+
+       vhd_log_open(s);
+
+       SPB = s->spb;
+
+       s->vreq_free_count = VHD_REQS_DATA;
+       for (i = 0; i < VHD_REQS_DATA; i++)
+               s->vreq_free[i] = s->vreq_list + i;
+
+       driver->info.size        = s->vhd.footer.curr_size >> VHD_SECTOR_SHIFT;
+       driver->info.sector_size = VHD_SECTOR_SIZE;
+       driver->info.info        = 0;
+
+       DBG(TLOG_INFO, "vhd_open: done (sz:%"PRIu64", sct:%"PRIu64
+           ", inf:%u)\n",
+           driver->info.size, driver->info.sector_size, driver->info.info);
+
+       if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT) &&
+           !test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) {
+               err = vhd_kill_footer(s);
+               if (err) {
+                       DPRINTF("ERROR killing footer: %d\n", err);
+                       goto fail;
+               }
+               s->writes++;
+       }
+
+       return 0;
+
+ fail:
+       vhd_free_bat(s);
+       vhd_free_bitmap_cache(s);
+       vhd_close(&s->vhd);
+       vhd_free(s);
+       return err;
+}
+
+/*
+ * td_open entry point: translate the generic TD_OPEN_* flags into
+ * VHD_FLAG_OPEN_* bits and hand over to __vhd_open().  A query open implies
+ * quiet, read-only and no-cache.  Preallocation is requested for every
+ * storage type other than NFS and LVM.
+ */
+static int
+_vhd_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       vhd_flag_t vflags = 0;
+       int no_prealloc;
+
+       if (flags & TD_OPEN_QUERY)
+               vflags |= (VHD_FLAG_OPEN_QUERY  |
+                          VHD_FLAG_OPEN_QUIET  |
+                          VHD_FLAG_OPEN_RDONLY |
+                          VHD_FLAG_OPEN_NO_CACHE);
+       if (flags & TD_OPEN_STRICT)
+               vflags |= VHD_FLAG_OPEN_STRICT;
+       if (flags & TD_OPEN_QUIET)
+               vflags |= VHD_FLAG_OPEN_QUIET;
+       if (flags & TD_OPEN_RDONLY)
+               vflags |= VHD_FLAG_OPEN_RDONLY;
+
+       /* pre-allocate for all but NFS and LVM storage */
+       no_prealloc = (driver->storage == TAPDISK_STORAGE_TYPE_NFS ||
+                      driver->storage == TAPDISK_STORAGE_TYPE_LVM);
+       if (!no_prealloc)
+               vflags |= VHD_FLAG_OPEN_PREALLOCATE;
+
+       return __vhd_open(driver, name, vflags);
+}
+
+/*
+ * Log the closing statistics of the image: number of BAT entries (b),
+ * allocated blocks (a), blocks marked in the batmap (f) and the next free
+ * data block (n).  Silent when the disk was opened with
+ * VHD_FLAG_OPEN_QUIET.
+ */
+static void
+vhd_log_close(struct vhd_state *s)
+{
+       uint32_t blk;
+       uint32_t in_use = 0;
+       uint32_t complete = 0;
+
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+               return;
+
+       for (blk = 0; blk < s->bat.bat.entries; blk++) {
+               if (bat_entry(s, blk) != DD_BLK_UNUSED)
+                       in_use++;
+               if (test_batmap(s, blk))
+                       complete++;
+       }
+
+       DPRINTF("%s: b: %u, a: %u, f: %u, n: %"PRIu64"\n",
+               s->vhd.file, s->bat.bat.entries, in_use, complete, s->next_db);
+}
+
+/*
+ * td_close entry point.  For writable disks that were opened strict or
+ * have seen writes, the footer is written back, followed by the batmap if
+ * the image has one; failures of either write are logged but do not stop
+ * the close.  All driver resources are then released and the private state
+ * is zeroed.  Always returns 0.
+ */
+static int
+_vhd_close(td_driver_t *driver)
+{
+       int err;
+       struct vhd_state *s;
+
+       DBG(TLOG_WARN, "vhd_close\n");
+       s = (struct vhd_state *)driver->data;
+
+       /*
+        * never write the footer for a read-only tapdisk; otherwise write
+        * it if:
+        *   - we killed it on open (opened with strict)
+        *   - we've written data since opening
+        */
+       if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY) &&
+           (test_vhd_flag(s->flags, VHD_FLAG_OPEN_STRICT) || s->writes)) {
+               /* lend our BAT to the vhd context for the duration of
+                * the footer write */
+               memcpy(&s->vhd.bat, &s->bat.bat, sizeof(vhd_bat_t));
+               err = vhd_write_footer(&s->vhd, &s->vhd.footer);
+               memset(&s->vhd.bat, 0, sizeof(vhd_bat_t));
+
+               if (err)
+                       EPRINTF("writing %s footer: %d\n", s->vhd.file, err);
+
+               if (vhd_has_batmap(&s->vhd)) {
+                       err = vhd_write_batmap(&s->vhd, &s->bat.batmap);
+                       if (err)
+                               EPRINTF("writing %s batmap: %d\n",
+                                       s->vhd.file, err);
+               }
+       }
+
+       vhd_log_close(s);
+       vhd_free_bat(s);
+       vhd_free_bitmap_cache(s);
+       vhd_close(&s->vhd);
+       vhd_free(s);
+
+       memset(s, 0, sizeof(struct vhd_state));
+
+       return 0;
+}
+
+/*
+ * Check that @parent_driver is an acceptable parent for @child_driver.
+ *
+ * A non-VHD parent is accepted only when the child is a VHD of type
+ * HD_TYPE_DIFF whose parent is reported as raw by vhd_parent_raw().
+ * A VHD parent is accepted when its footer uuid matches the parent uuid
+ * recorded in the child's header.
+ *
+ * Returns 0 when the parent is acceptable, -EINVAL otherwise.
+ */
+int
+vhd_validate_parent(td_driver_t *child_driver,
+                   td_driver_t *parent_driver, td_flag_t flags)
+{
+       uint32_t status;
+       struct stat stats;
+       struct vhd_state *child;
+       struct vhd_state *parent;
+
+       child = (struct vhd_state *)child_driver->data;
+
+       if (parent_driver->type != DISK_TYPE_VHD) {
+               if (child_driver->type == DISK_TYPE_VHD &&
+                   child->vhd.footer.type == HD_TYPE_DIFF &&
+                   vhd_parent_raw(&child->vhd))
+                       return 0;
+
+               return -EINVAL;
+       }
+
+       parent = (struct vhd_state *)parent_driver->data;
+
+       /*
+        * The modification-time check below is disabled because of cases
+        * like:
+        *   - parent VHD marked as 'hidden'
+        *   - parent VHD modified during coalesce
+        */
+       /*
+       if (stat(parent->vhd.file, &stats)) {
+               DPRINTF("ERROR stating parent file %s\n", parent->vhd.file);
+               return -errno;
+       }
+
+       if (child->hdr.prt_ts != vhd_time(stats.st_mtime)) {
+               DPRINTF("ERROR: parent file has been modified since "
+                       "snapshot.  Child image no longer valid.\n");
+               return -EINVAL;
+       }
+       */
+
+       if (!vhd_uuid_compare(&child->vhd.header.prt_uuid,
+                             &parent->vhd.footer.uuid)) {
+               /* TODO: compare sizes */
+               return 0;
+       }
+
+       DPRINTF("ERROR: %s: %s, %s: parent uuid has changed since "
+               "snapshot.  Child image no longer valid.\n",
+               __func__, child->vhd.file, parent->vhd.file);
+       return -EINVAL;
+}
+
+/*
+ * Report the parent image of a differencing VHD.
+ *
+ * @id is zeroed first.  Returns TD_NO_PARENT when the footer type is not
+ * HD_TYPE_DIFF, or the error from vhd_parent_locator_get() if that call
+ * fails.  On success returns 0 with id->name set to the string produced
+ * by vhd_parent_locator_get() and id->drivertype set to DISK_TYPE_AIO for
+ * a raw parent, DISK_TYPE_VHD otherwise.
+ */
+int
+vhd_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       struct vhd_state *s;
+       char *parent_name;
+       int rc;
+
+       DBG(TLOG_DBG, "\n");
+       memset(id, 0, sizeof(*id));
+
+       s = (struct vhd_state *)driver->data;
+       if (s->vhd.footer.type != HD_TYPE_DIFF)
+               return TD_NO_PARENT;
+
+       rc = vhd_parent_locator_get(&s->vhd, &parent_name);
+       if (rc)
+               return rc;
+
+       id->name = parent_name;
+       if (vhd_parent_raw(&s->vhd)) {
+               DPRINTF("VHD: parent is raw\n");
+               id->drivertype = DISK_TYPE_AIO;
+       } else {
+               id->drivertype = DISK_TYPE_VHD;
+       }
+
+       return 0;
+}
+
+/* Reset @list to the empty state (both head and tail NULL). */
+static inline void
+clear_req_list(struct vhd_req_list *list)
+{
+       list->tail = NULL;
+       list->head = NULL;
+}
+
+/*
+ * Append request @e to the end of @list.  e->next is not modified here.
+ */
+static inline void
+add_to_tail(struct vhd_req_list *list, struct vhd_request *e)
+{
+       if (list->head) {
+               /* non-empty: link behind the current tail */
+               list->tail->next = e;
+               list->tail       = e;
+               return;
+       }
+
+       /* empty: @e becomes both ends of the list */
+       list->tail = e;
+       list->head = e;
+}
+
+/*
+ * Unlink request @e from the singly linked list @list.
+ *
+ * Returns 0 if @e was found and unlinked, -EINVAL if @e is not on the
+ * list.  An empty list is handled (the walk stops immediately) rather
+ * than dereferencing a NULL head.  e->next is left untouched.
+ */
+static inline int
+remove_from_req_list(struct vhd_req_list *list, struct vhd_request *e)
+{
+       struct vhd_request *prev;
+
+       /* @e is the first element */
+       if (list->head == e) {
+               if (list->tail == e)
+                       clear_req_list(list);
+               else
+                       list->head = list->head->next;
+               return 0;
+       }
+
+       /* look for the element whose successor is @e */
+       for (prev = list->head; prev && prev->next; prev = prev->next) {
+               if (prev->next != e)
+                       continue;
+
+               if (list->tail == e) {
+                       /* @e was last: @prev becomes the new tail */
+                       prev->next = NULL;
+                       list->tail = prev;
+               } else
+                       prev->next = e->next;
+
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+/* Zero @req and bind it to the driver state @s. */
+static inline void
+init_vhd_request(struct vhd_state *s, struct vhd_request *req)
+{
+       memset(req, 0, sizeof(*req));
+       req->state = s;
+}
+
+/* Reset transaction @tx by zeroing the whole structure. */
+static inline void
+init_tx(struct vhd_transaction *tx)
+{
+       memset(tx, 0, sizeof(*tx));
+}
+
+/*
+ * Attach request @r to transaction @tx: the request is appended to the
+ * transaction's request list, the started counter is bumped and the
+ * transaction is flagged VHD_FLAG_TX_LIVE.  The transaction must not be
+ * closed.
+ */
+static inline void
+add_to_transaction(struct vhd_transaction *tx, struct vhd_request *r)
+{
+       ASSERT(!tx->closed);
+
+       set_vhd_flag(tx->status, VHD_FLAG_TX_LIVE);
+       add_to_tail(&tx->requests, r);
+       tx->started += 1;
+       r->tx = tx;
+
+       DBG(TLOG_DBG, "blk: 0x%04"PRIx64", lsec: 0x%08"PRIx64", tx: %p, "
+           "started: %d, finished: %d, status: %u\n",
+           r->treq.sec / SPB, r->treq.sec, tx,
+           tx->started, tx->finished, tx->status);
+}
+
+/*
+ * Returns 1 when every request started in @tx has finished (the two
+ * counters are equal), 0 otherwise.
+ */
+static inline int
+transaction_completed(struct vhd_transaction *tx)
+{
+       if (tx->finished == tx->started)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * Reset the BAT update bookkeeping: no pending block, no status flags,
+ * and a BAT request that is unlinked and free of errors.
+ */
+static inline void
+init_bat(struct vhd_state *s)
+{
+       /* pending-block-write state */
+       s->bat.status     = 0;
+       s->bat.pbw_offset = 0;
+       s->bat.pbw_blk    = 0;
+
+       /* embedded BAT write request */
+       s->bat.req.error  = 0;
+       s->bat.req.next   = NULL;
+       s->bat.req.tx     = NULL;
+}
+
+/* Mark the BAT as locked by setting VHD_FLAG_BAT_LOCKED in s->bat.status. */
+static inline void
+lock_bat(struct vhd_state *s)
+{
+       set_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
+}
+
+/* Release the BAT lock by clearing VHD_FLAG_BAT_LOCKED in s->bat.status. */
+static inline void
+unlock_bat(struct vhd_state *s)
+{
+       clear_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
+}
+
+/* Returns non-zero when VHD_FLAG_BAT_LOCKED is set in s->bat.status. */
+static inline int
+bat_locked(struct vhd_state *s)
+{
+       return test_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
+}
+
+/*
+ * Return bitmap descriptor @bm to a pristine state: block number, LRU
+ * sequence number and status are cleared, its transaction and both
+ * request lists are reset, the map and shadow buffers are zero-filled
+ * (s->bm_secs sectors each) and the embedded request is re-initialised.
+ */
+static inline void
+init_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       /* scalar bookkeeping */
+       bm->status = 0;
+       bm->seqno  = 0;
+       bm->blk    = 0;
+
+       /* transaction and request lists */
+       init_tx(&bm->tx);
+       clear_req_list(&bm->waiting);
+       clear_req_list(&bm->queue);
+
+       /* bitmap buffers */
+       memset(bm->shadow, 0, vhd_sectors_to_bytes(s->bm_secs));
+       memset(bm->map, 0, vhd_sectors_to_bytes(s->bm_secs));
+
+       init_vhd_request(s, &bm->req);
+}
+
+/*
+ * Look up the cached bitmap for @block.  Returns the first cache slot
+ * whose bitmap has a matching block number, or NULL if none does.
+ */
+static inline struct vhd_bitmap *
+get_bitmap(struct vhd_state *s, uint32_t block)
+{
+       int slot;
+
+       for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+               struct vhd_bitmap *cand = s->bitmap[slot];
+
+               if (!cand)
+                       continue;
+
+               if (cand->blk == block)
+                       return cand;
+       }
+
+       return NULL;
+}
+
+/* Set VHD_FLAG_BM_LOCKED in bm->status. */
+static inline void
+lock_bitmap(struct vhd_bitmap *bm)
+{
+       set_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
+}
+
+/* Clear VHD_FLAG_BM_LOCKED in bm->status. */
+static inline void
+unlock_bitmap(struct vhd_bitmap *bm)
+{
+       clear_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
+}
+
+/* Returns non-zero when VHD_FLAG_BM_LOCKED is set in bm->status. */
+static inline int
+bitmap_locked(struct vhd_bitmap *bm)
+{
+       return test_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
+}
+
+/*
+ * A bitmap counts as valid once no read of it is pending: returns
+ * non-zero when VHD_FLAG_BM_READ_PENDING is not set in bm->status.
+ */
+static inline int
+bitmap_valid(struct vhd_bitmap *bm)
+{
+       return !test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
+}
+
+/*
+ * Returns 1 if @bm still has outstanding work: a pending bitmap read or
+ * write, a transaction flagged VHD_FLAG_TX_UPDATE_BAT, or any request on
+ * its waiting, transaction or queue lists.  Returns 0 otherwise.
+ */
+static inline int
+bitmap_in_use(struct vhd_bitmap *bm)
+{
+       /* I/O on the bitmap itself */
+       if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING))
+               return 1;
+       if (test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING))
+               return 1;
+
+       /* BAT update tied to this bitmap's transaction */
+       if (test_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT))
+               return 1;
+
+       /* requests parked on any of the lists */
+       if (bm->waiting.head)
+               return 1;
+       if (bm->tx.requests.head)
+               return 1;
+       if (bm->queue.head)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * Returns 1 (and logs) when the first s->spb / 8 bytes of bm->map are
+ * all 0xFF, i.e. every bit examined is set; returns 0 as soon as a byte
+ * differs.
+ */
+static inline int
+bitmap_full(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       int idx;
+       int nr_bytes;
+
+       nr_bytes = s->spb >> 3;
+
+       for (idx = 0; idx < nr_bytes; idx++) {
+               if (bm->map[idx] == (char)0xFF)
+                       continue;
+
+               return 0;
+       }
+
+       DBG(TLOG_DBG, "bitmap 0x%04x full\n", bm->blk);
+       return 1;
+}
+
+/*
+ * Evict the least recently used bitmap from the cache.
+ *
+ * Among the cached bitmaps that are not locked and whose sequence number
+ * is below s->bm_lru, the one with the smallest sequence number is
+ * removed from its cache slot and returned.  Returns NULL when no bitmap
+ * qualifies.
+ */
+static struct vhd_bitmap *
+remove_lru_bitmap(struct vhd_state *s)
+{
+       int slot, victim_slot = 0;
+       u64 lowest = s->bm_lru;
+       struct vhd_bitmap *cand, *victim = NULL;
+
+       for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+               cand = s->bitmap[slot];
+
+               if (!cand || cand->seqno >= lowest || bitmap_locked(cand))
+                       continue;
+
+               victim      = cand;
+               victim_slot = slot;
+               lowest      = victim->seqno;
+       }
+
+       if (!victim)
+               return NULL;
+
+       s->bitmap[victim_slot] = NULL;
+       ASSERT(!bitmap_in_use(victim));
+
+       return victim;
+}
+
+/*
+ * Obtain a bitmap descriptor for block @blk.
+ *
+ * A descriptor is taken from the free pool when one is available;
+ * otherwise the least recently used cached bitmap is evicted and reused.
+ * The descriptor is re-initialised and tagged with @blk, but it is not
+ * placed into the cache here.
+ *
+ * Returns 0 with *bitmap set on success; -EBUSY with *bitmap == NULL
+ * when nothing can be evicted.
+ */
+static int
+alloc_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap **bitmap, uint32_t blk)
+{
+       struct vhd_bitmap *fresh;
+
+       *bitmap = NULL;
+
+       if (s->bm_free_count <= 0) {
+               fresh = remove_lru_bitmap(s);
+               if (!fresh)
+                       return -EBUSY;
+       } else {
+               s->bm_free_count--;
+               fresh = s->bitmap_free[s->bm_free_count];
+       }
+
+       init_vhd_bitmap(s, fresh);
+       fresh->blk = blk;
+
+       *bitmap = fresh;
+       return 0;
+}
+
+/*
+ * Hand out the next LRU sequence number.
+ *
+ * When the counter has reached 0xffffffff, the sequence number of every
+ * cached bitmap is halved and the counter restarts from the largest of
+ * the halved values before being incremented.
+ */
+static inline uint64_t
+__bitmap_lru_seqno(struct vhd_state *s)
+{
+       int slot;
+
+       if (s->bm_lru != 0xffffffff)
+               return ++s->bm_lru;
+
+       s->bm_lru = 0;
+
+       for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+               struct vhd_bitmap *cached = s->bitmap[slot];
+
+               if (!cached)
+                       continue;
+
+               cached->seqno >>= 1;
+               if (cached->seqno > s->bm_lru)
+                       s->bm_lru = cached->seqno;
+       }
+
+       return ++s->bm_lru;
+}
+
+/* Stamp @bm with the next LRU sequence number, making it the newest. */
+static inline void
+touch_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       bm->seqno = __bitmap_lru_seqno(s);
+}
+
+/*
+ * Place @bm into the first empty cache slot and refresh its LRU stamp.
+ * Asserts if the cache has no empty slot.
+ */
+static inline void
+install_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       int slot;
+
+       for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+               if (s->bitmap[slot])
+                       continue;
+
+               touch_bitmap(s, bm);
+               s->bitmap[slot] = bm;
+               return;
+       }
+
+       /* no free slot: callers are expected to have made room */
+       ASSERT(0);
+}
+
+/*
+ * Remove @bm from the cache and push it onto the free pool.  The bitmap
+ * must be cached, unlocked and not in use (all three are asserted).
+ */
+static inline void
+free_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       int slot = 0;
+
+       /* locate the cache slot currently holding @bm */
+       while (slot < VHD_CACHE_SIZE && s->bitmap[slot] != bm)
+               slot++;
+
+       ASSERT(!bitmap_locked(bm));
+       ASSERT(!bitmap_in_use(bm));
+       ASSERT(slot < VHD_CACHE_SIZE);
+
+       s->bitmap[slot] = NULL;
+
+       s->bitmap_free[s->bm_free_count] = bm;
+       s->bm_free_count++;
+}
+
+/*
+ * Classify logical sector @sector for operation @op using the BAT, the
+ * batmap and the bitmap cache.
+ *
+ * Returns one of:
+ *   -EINVAL              the block index lies outside the BAT
+ *   VHD_BM_BIT_SET       fixed disk, block flagged in the batmap, or the
+ *                        sector's bit is set in the cached bitmap
+ *   VHD_BM_BIT_CLEAR     the sector's bit is clear in the cached bitmap
+ *   VHD_BM_BAT_CLEAR     the block has no BAT entry yet
+ *   VHD_BM_BAT_LOCKED    a data write hit an unallocated block while the
+ *                        BAT is locked for a different block
+ *   VHD_BM_NOT_CACHED    the block's bitmap is not in the cache
+ *   VHD_BM_READ_PENDING  the block's bitmap is still being read
+ *
+ * A cache hit also bumps the bitmap's LRU stamp.
+ */
+static int
+read_bitmap_cache(struct vhd_state *s, uint64_t sector, uint8_t op)
+{
+       u32 blk, sec;
+       struct vhd_bitmap *bm;
+
+       /* in fixed disks, every block is present */
+       if (s->vhd.footer.type == HD_TYPE_FIXED)
+               return VHD_BM_BIT_SET;
+
+       blk = sector / s->spb;
+       sec = sector % s->spb;
+
+       /*
+        * valid BAT indices are 0 .. max_bat_size - 1, so an index equal
+        * to max_bat_size is already out of range
+        */
+       if (blk >= s->vhd.header.max_bat_size) {
+               DPRINTF("ERROR: sec %"PRIu64" out of range, op = %d\n",
+                       sector, op);
+               return -EINVAL;
+       }
+
+       if (bat_entry(s, blk) == DD_BLK_UNUSED) {
+               /* only one block allocation may be in flight at a time */
+               if (op == VHD_OP_DATA_WRITE &&
+                   s->bat.pbw_blk != blk && bat_locked(s))
+                       return VHD_BM_BAT_LOCKED;
+
+               return VHD_BM_BAT_CLEAR;
+       }
+
+       if (test_batmap(s, blk)) {
+               DBG(TLOG_DBG, "batmap set for 0x%04x\n", blk);
+               return VHD_BM_BIT_SET;
+       }
+
+       bm = get_bitmap(s, blk);
+       if (!bm)
+               return VHD_BM_NOT_CACHED;
+
+       /* bump lru count */
+       touch_bitmap(s, bm);
+
+       if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING))
+               return VHD_BM_READ_PENDING;
+
+       return ((vhd_bitmap_test(&s->vhd, bm->map, sec)) ?
+               VHD_BM_BIT_SET : VHD_BM_BIT_CLEAR);
+}
+
+/*
+ * Starting at @sector, count how many consecutive sectors (at most
+ * @nr_secs, and never past the end of the containing block) have their
+ * bitmap bit equal to @value.
+ *
+ * Fixed disks report the whole @nr_secs.  A block flagged in the batmap
+ * reports everything up to the end of the block.  Otherwise the block's
+ * bitmap must already be cached and valid (asserted).
+ */
+static int
+read_bitmap_cache_span(struct vhd_state *s,
+                      uint64_t sector, int nr_secs, int value)
+{
+       int matched;
+       u32 blk, sec;
+       struct vhd_bitmap *bm;
+
+       /* in fixed disks, every block is present */
+       if (s->vhd.footer.type == HD_TYPE_FIXED)
+               return nr_secs;
+
+       blk = sector / s->spb;
+       sec = sector % s->spb;
+
+       if (test_batmap(s, blk))
+               return MIN(nr_secs, s->spb - sec);
+
+       bm = get_bitmap(s, blk);
+
+       ASSERT(bm && bitmap_valid(bm));
+
+       matched = 0;
+       while (sec < s->spb && matched < nr_secs) {
+               if (vhd_bitmap_test(&s->vhd, bm->map, sec) != value)
+                       break;
+
+               sec++;
+               matched++;
+       }
+
+       return matched;
+}
+
+/*
+ * Take a request descriptor from the free pool and initialise it for
+ * @s.  Returns NULL when the pool is empty.
+ */
+static inline struct vhd_request *
+alloc_vhd_request(struct vhd_state *s)
+{
+       struct vhd_request *req;
+
+       if (s->vreq_free_count <= 0)
+               return NULL;
+
+       s->vreq_free_count--;
+       req = s->vreq_free[s->vreq_free_count];
+
+       /* descriptors in the pool are expected to have been zeroed */
+       ASSERT(req->treq.secs == 0);
+       init_vhd_request(s, req);
+
+       return req;
+}
+
+/* Zero @req and return it to the free pool of @s. */
+static inline void
+free_vhd_request(struct vhd_state *s, struct vhd_request *req)
+{
+       memset(req, 0, sizeof(*req));
+
+       s->vreq_free[s->vreq_free_count] = req;
+       s->vreq_free_count++;
+}
+
+/*
+ * Prepare and queue an asynchronous read of req->treq.secs sectors from
+ * byte @offset of the image into req->treq.buf; vhd_complete is the
+ * completion callback.  Updates the queued/reads/read_size counters.
+ */
+static inline void
+aio_read(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
+{
+       td_prep_read(&req->tiocb, s->vhd.fd, req->treq.buf,
+                    vhd_sectors_to_bytes(req->treq.secs),
+                    offset, vhd_complete, req);
+       td_queue_tiocb(s->driver, &req->tiocb);
+
+       /* statistics */
+       s->read_size += req->treq.secs;
+       s->reads++;
+       s->queued++;
+       TRACE(s);
+}
+
+/*
+ * Prepare and queue an asynchronous write of req->treq.secs sectors
+ * from req->treq.buf to byte @offset of the image; vhd_complete is the
+ * completion callback.  Updates the queued/writes/write_size counters.
+ */
+static inline void
+aio_write(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
+{
+       td_prep_write(&req->tiocb, s->vhd.fd, req->treq.buf,
+                     vhd_sectors_to_bytes(req->treq.secs),
+                     offset, vhd_complete, req);
+       td_queue_tiocb(s->driver, &req->tiocb);
+
+       /* statistics */
+       s->write_size += req->treq.secs;
+       s->writes++;
+       s->queued++;
+       TRACE(s);
+}
+
+/*
+ * Record @blk as the pending block and pick its on-disk position.
+ *
+ * The position is s->next_db plus whatever padding is needed so that
+ * the data region (which follows s->bm_secs bitmap sectors) starts on a
+ * multiple of s->spp sectors.  s->next_db itself is not changed; its
+ * current value is returned.  No BAT write may have been started yet.
+ */
+static inline uint64_t
+reserve_new_block(struct vhd_state *s, uint32_t blk)
+{
+       int pad = 0;
+
+       ASSERT(!test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));
+
+       /* data region of segment should begin on page boundary */
+       if (((s->next_db + s->bm_secs) % s->spp) != 0)
+               pad = (s->spp - ((s->next_db + s->bm_secs) % s->spp));
+
+       s->bat.pbw_offset = s->next_db + pad;
+       s->bat.pbw_blk    = blk;
+
+       return s->next_db;
+}
+
+/*
+ * Queue the asynchronous write of the BAT sector that covers the pending
+ * block s->bat.pbw_blk.
+ *
+ * The 512 bytes of BAT entries around the block (128 32-bit entries,
+ * starting at blk rounded down to a multiple of 128) are copied into
+ * s->bat.bat_buf, the pending block's entry is set to s->bat.pbw_offset,
+ * and every entry is passed through BE32_OUT before the buffer is
+ * written at the matching byte offset inside the on-disk table.  The BAT
+ * must already be locked (asserted).  Always returns 0.
+ */
+static int
+schedule_bat_write(struct vhd_state *s)
+{
+       int i;
+       u32 blk;
+       char *buf;
+       u64 offset;
+       struct vhd_request *req;
+
+       ASSERT(bat_locked(s));
+
+       req = &s->bat.req;
+       buf = s->bat.bat_buf;
+       blk = s->bat.pbw_blk;
+
+       init_vhd_request(s, req);
+       /* snapshot the 128-entry group of the in-memory BAT holding blk */
+       memcpy(buf, &bat_entry(s, blk - (blk % 128)), 512);
+
+       /* patch in the new entry; the in-memory BAT is not modified here */
+       ((u32 *)buf)[blk % 128] = s->bat.pbw_offset;
+
+       for (i = 0; i < 128; i++)
+               BE32_OUT(&((u32 *)buf)[i]);
+
+       /* 4 bytes per BAT entry */
+       offset         = s->vhd.header.table_offset + (blk - (blk % 128)) * 4;
+       req->treq.secs = 1;
+       req->treq.buf  = buf;
+       req->op        = VHD_OP_BAT_WRITE;
+       req->next      = NULL;
+
+       aio_write(s, req, offset);
+       set_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED);
+
+       DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64", "
+           "table_offset: 0x%08"PRIx64"\n", blk, s->bat.pbw_offset, offset);
+
+       return 0;
+}
+
+/*
+ * Queue a write of zeros for the pending block s->bat.pbw_blk.
+ *
+ * The write starts at sector @lb_end and spans the distance from
+ * @lb_end to s->bat.pbw_offset plus s->bm_secs bitmap sectors.  It uses
+ * the dedicated s->bat.zero_req request, locks @bm and is added to
+ * @bm's transaction before being submitted.
+ */
+static void
+schedule_zero_bm_write(struct vhd_state *s,
+                      struct vhd_bitmap *bm, uint64_t lb_end)
+{
+       uint64_t offset;
+       struct vhd_request *req = &s->bat.zero_req;
+
+       init_vhd_request(s, req);
+
+       offset         = vhd_sectors_to_bytes(lb_end);
+       req->op        = VHD_OP_ZERO_BM_WRITE;
+       req->treq.sec  = s->bat.pbw_blk * s->spb;
+       /* (pbw_offset - lb_end) sectors up to the block, then its bitmap */
+       req->treq.secs = (s->bat.pbw_offset - lb_end) + s->bm_secs;
+       req->treq.buf  = vhd_zeros(vhd_sectors_to_bytes(req->treq.secs));
+       req->next      = NULL;
+
+       DBG(TLOG_DBG, "blk: 0x%04x, writing zero bitmap at 0x%08"PRIx64"\n",
+           s->bat.pbw_blk, offset);
+
+       lock_bitmap(bm);
+       add_to_transaction(&bm->tx, req);
+       aio_write(s, req, offset);
+}
+
+/*
+ * Start allocating the currently unallocated block @blk.
+ *
+ * If the BAT is already locked, the pending block must be @blk (asserted)
+ * and nothing more is done.  Otherwise an empty bitmap for @blk is put in
+ * the cache if not already there, the BAT is locked, the block's position
+ * is reserved, a zero-bitmap write is queued, and the bitmap's
+ * transaction is flagged VHD_FLAG_TX_UPDATE_BAT.
+ *
+ * Returns 0 on success or the error from alloc_vhd_bitmap().
+ */
+static int
+update_bat(struct vhd_state *s, uint32_t blk)
+{
+       int err;
+       uint64_t lb_end;
+       struct vhd_bitmap *bm;
+
+       ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
+       
+       if (bat_locked(s)) {
+               ASSERT(s->bat.pbw_blk == blk);
+               return 0;
+       }
+
+       /* empty bitmap could already be in
+        * cache if earlier bat update failed */
+       bm = get_bitmap(s, blk);
+       if (!bm) {
+               /* install empty bitmap in cache */
+               err = alloc_vhd_bitmap(s, &bm, blk);
+               if (err) 
+                       return err;
+
+               install_bitmap(s, bm);
+       }
+
+       lock_bat(s);
+       /* lb_end is the value of s->next_db before any padding */
+       lb_end = reserve_new_block(s, blk);
+       schedule_zero_bm_write(s, bm, lb_end);
+       set_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT);
+
+       return 0;
+}
+
+/*
+ * Allocate the currently unallocated block @blk with its storage zeroed
+ * up front.
+ *
+ * If the BAT is already locked, the pending block must be @blk
+ * (asserted); -EBUSY is returned when the BAT request carries an error,
+ * 0 otherwise.  In the unlocked case the region starting at s->next_db
+ * (alignment gap, bitmap and data sectors) is synchronously filled with
+ * zeros, an empty bitmap for @blk is put in the cache if not already
+ * there, the BAT and bitmap are locked, and the BAT write is queued and
+ * added to the bitmap's transaction.
+ *
+ * Returns 0 on success, a negative errno value when lseek() or write()
+ * fails, -EIO on a short write, or the error from alloc_vhd_bitmap().
+ */
+static int
+allocate_block(struct vhd_state *s, uint32_t blk)
+{
+       int err, gap;
+       ssize_t written;
+       uint64_t offset, size;
+       struct vhd_bitmap *bm;
+
+       ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
+
+       if (bat_locked(s)) {
+               ASSERT(s->bat.pbw_blk == blk);
+               if (s->bat.req.error)
+                       return -EBUSY;
+               return 0;
+       }
+
+       gap            = 0;
+       s->bat.pbw_blk = blk;
+       offset         = vhd_sectors_to_bytes(s->next_db);
+
+       /* data region of segment should begin on page boundary */
+       if ((s->next_db + s->bm_secs) % s->spp) {
+               gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp));
+               s->next_db += gap;
+       }
+
+       s->bat.pbw_offset = s->next_db;
+
+       DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64"\n",
+           blk, s->bat.pbw_offset);
+
+       if (lseek(s->vhd.fd, offset, SEEK_SET) == (off_t)-1) {
+               /* capture errno before logging can overwrite it */
+               err = errno;
+               ERR(err, "lseek failed\n");
+               return -err;
+       }
+
+       size    = vhd_sectors_to_bytes(s->spb + s->bm_secs + gap);
+       written = write(s->vhd.fd, vhd_zeros(size), size);
+       if (written < 0 || (uint64_t)written != size) {
+               /* a short write is reported as -EIO */
+               err = (written == -1 ? -errno : -EIO);
+               ERR(err, "write failed");
+               return err;
+       }
+
+       /* empty bitmap could already be in
+        * cache if earlier bat update failed */
+       bm = get_bitmap(s, blk);
+       if (!bm) {
+               /* install empty bitmap in cache */
+               err = alloc_vhd_bitmap(s, &bm, blk);
+               if (err)
+                       return err;
+
+               install_bitmap(s, bm);
+       }
+
+       lock_bat(s);
+       lock_bitmap(bm);
+       schedule_bat_write(s);
+       add_to_transaction(&bm->tx, &s->bat.req);
+
+       return 0;
+}
+
+/*
+ * Queue an asynchronous read of the data described by @treq.
+ *
+ * On fixed disks the logical sector maps straight to a byte offset.
+ * Otherwise the block must be allocated and either flagged in the batmap
+ * or have a valid cached bitmap (both asserted); the offset is the
+ * block's BAT entry plus the bitmap sectors plus the sector within the
+ * block.
+ *
+ * Returns 0 on success or -EBUSY when no request descriptor is free.
+ */
+static int 
+schedule_data_read(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
+{
+       u64 offset;
+       u32 blk = 0, sec = 0;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *req;
+
+       if (s->vhd.footer.type == HD_TYPE_FIXED) {
+               offset = vhd_sectors_to_bytes(treq.sec);
+               goto make_request;
+       }
+
+       blk    = treq.sec / s->spb;
+       sec    = treq.sec % s->spb;
+       bm     = get_bitmap(s, blk);
+       offset = bat_entry(s, blk);
+
+       ASSERT(offset != DD_BLK_UNUSED);
+       ASSERT(test_batmap(s, blk) || (bm && bitmap_valid(bm)));
+
+       /* skip the block's bitmap sectors, then index into the data */
+       offset += s->bm_secs + sec;
+       offset  = vhd_sectors_to_bytes(offset);
+
+ make_request:
+       req = alloc_vhd_request(s);
+       if (!req) 
+               return -EBUSY;
+
+       req->treq  = treq;
+       req->flags = flags;
+       req->op    = VHD_OP_DATA_READ;
+       req->next  = NULL;
+
+       aio_read(s, req, offset);
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
+           "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x, buf: %p\n",
+           s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags,
+           treq.buf);
+
+       return 0;
+}
+
+/*
+ * Queue an asynchronous write of the data described by @treq.
+ *
+ * On fixed disks the logical sector maps straight to a byte offset.
+ * Otherwise:
+ *   - with VHD_FLAG_REQ_UPDATE_BAT in @flags the block is first
+ *     allocated, via allocate_block() when the image was opened with
+ *     VHD_FLAG_OPEN_PREALLOCATE and via update_bat() otherwise, and the
+ *     pending block offset is used as the base;
+ *   - with VHD_FLAG_REQ_UPDATE_BITMAP in @flags the block's bitmap is
+ *     locked and the request joins the bitmap's transaction, or its
+ *     queue (flagged VHD_FLAG_REQ_QUEUED) when the transaction is closed.
+ *
+ * Returns 0 on success, -EBUSY when no request descriptor is free, or
+ * the error from the block allocation step.
+ */
+static int
+schedule_data_write(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
+{
+       int err;
+       u64 offset;
+       u32 blk = 0, sec = 0;
+       struct vhd_bitmap  *bm = NULL;
+       struct vhd_request *req;
+
+       if (s->vhd.footer.type == HD_TYPE_FIXED) {
+               offset = vhd_sectors_to_bytes(treq.sec);
+               goto make_request;
+       }
+
+       blk    = treq.sec / s->spb;
+       sec    = treq.sec % s->spb;
+       offset = bat_entry(s, blk);
+
+       if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BAT)) {
+               if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
+                       err = allocate_block(s, blk);
+               else
+                       err = update_bat(s, blk);
+
+               if (err)
+                       return err;
+
+               /* the BAT entry is not written yet; use the pending offset */
+               offset = s->bat.pbw_offset;
+       }
+
+       /* skip the block's bitmap sectors, then index into the data */
+       offset += s->bm_secs + sec;
+       offset  = vhd_sectors_to_bytes(offset);
+
+ make_request:
+       req = alloc_vhd_request(s);
+       if (!req)
+               return -EBUSY;
+
+       req->treq  = treq;
+       req->flags = flags;
+       req->op    = VHD_OP_DATA_WRITE;
+       req->next  = NULL;
+
+       if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BITMAP)) {
+               bm = get_bitmap(s, blk);
+               ASSERT(bm && bitmap_valid(bm));
+               lock_bitmap(bm);
+
+               if (bm->tx.closed) {
+                       /* parked until a new transaction is started */
+                       add_to_tail(&bm->queue, req);
+                       set_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED);
+               } else
+                       add_to_transaction(&bm->tx, req);
+       }
+
+       aio_write(s, req, offset);
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
+           "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x\n",
+           s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags);
+
+       return 0;
+}
+
+/*
+ * Queue an asynchronous read of the on-disk bitmap of block @blk.
+ *
+ * The block must be allocated and its bitmap must not be cached yet
+ * (both asserted).  A bitmap descriptor is obtained, its embedded
+ * request is set up to read s->bm_secs sectors into bm->map from the
+ * block's BAT offset, and the bitmap is locked, placed in the cache and
+ * flagged VHD_FLAG_BM_READ_PENDING.
+ *
+ * Returns 0 on success or the error from alloc_vhd_bitmap().
+ */
+static int 
+schedule_bitmap_read(struct vhd_state *s, uint32_t blk)
+{
+       int err;
+       u64 offset;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *req = NULL;
+
+       ASSERT(vhd_type_dynamic(&s->vhd));
+
+       offset = bat_entry(s, blk);
+
+       ASSERT(offset != DD_BLK_UNUSED);
+       ASSERT(!get_bitmap(s, blk));
+
+       offset = vhd_sectors_to_bytes(offset);
+
+       err = alloc_vhd_bitmap(s, &bm, blk);
+       if (err)
+               return err;
+
+       req = &bm->req;
+       init_vhd_request(s, req);
+
+       req->treq.sec  = blk * s->spb;
+       req->treq.secs = s->bm_secs;
+       req->treq.buf  = bm->map;
+       req->treq.cb   = NULL;
+       req->op        = VHD_OP_BITMAP_READ;
+       req->next      = NULL;
+
+       aio_read(s, req, offset);
+       lock_bitmap(bm);
+       install_bitmap(s, bm);
+       set_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, nr_secs: 0x%04x, "
+           "offset: 0x%08"PRIx64"\n", s->vhd.file, req->treq.sec, blk,
+           req->treq.secs, offset);
+
+       return 0;
+}
+
+/*
+ * Queue an asynchronous write of the shadow bitmap of block @blk.
+ *
+ * The bitmap must be cached, valid and have no write pending (asserted).
+ * The target is the block's BAT offset, or the pending block offset when
+ * the BAT entry is still unused (in which case the BAT must be locked
+ * for this very block).  The bitmap is locked, its LRU stamp refreshed
+ * and VHD_FLAG_BM_WRITE_PENDING set.
+ */
+static void
+schedule_bitmap_write(struct vhd_state *s, uint32_t blk)
+{
+       u64 offset;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *req;
+
+       bm     = get_bitmap(s, blk);
+       offset = bat_entry(s, blk);
+
+       ASSERT(vhd_type_dynamic(&s->vhd));
+       ASSERT(bm && bitmap_valid(bm) &&
+              !test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));
+
+       if (offset == DD_BLK_UNUSED) {
+               ASSERT(bat_locked(s) && s->bat.pbw_blk == blk);
+               offset = s->bat.pbw_offset;
+       }
+       
+       offset = vhd_sectors_to_bytes(offset);
+
+       req = &bm->req;
+       init_vhd_request(s, req);
+
+       req->treq.sec  = blk * s->spb;
+       req->treq.secs = s->bm_secs;
+       /* the shadow copy, not bm->map, is what goes to disk */
+       req->treq.buf  = bm->shadow;
+       req->treq.cb   = NULL;
+       req->op        = VHD_OP_BITMAP_WRITE;
+       req->next      = NULL;
+
+       aio_write(s, req, offset);
+       lock_bitmap(bm);
+       touch_bitmap(s, bm);     /* bump lru count */
+       set_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);
+
+       DBG(TLOG_DBG, "%s: blk: 0x%04x, sec: 0x%08"PRIx64", nr_secs: 0x%04x, "
+           "offset: 0x%"PRIx64"\n", s->vhd.file, blk, req->treq.sec,
+           req->treq.secs, offset);
+}
+
+/* 
+ * Park a request on the waiting list of its block's bitmap.
+ *
+ * queued requests will be submitted once the bitmap
+ * describing them is read and the requests are validated. 
+ *
+ * The bitmap for the request's block must be cached with a read pending
+ * (asserted).  The bitmap is locked.  Returns 0 on success or -EBUSY
+ * when no request descriptor is free.
+ */
+static int
+__vhd_queue_request(struct vhd_state *s, uint8_t op, td_request_t treq)
+{
+       u32 blk;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *req;
+
+       ASSERT(vhd_type_dynamic(&s->vhd));
+
+       blk = treq.sec / s->spb;
+       bm  = get_bitmap(s, blk);
+
+       ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));
+
+       req = alloc_vhd_request(s);
+       if (!req)
+               return -EBUSY;
+
+       req->treq = treq;
+       req->op   = op;
+       req->next = NULL;
+
+       add_to_tail(&bm->waiting, req);
+       lock_bitmap(bm);
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x nr_secs: 0x%04x, "
+           "op: %u\n", s->vhd.file, treq.sec, blk, treq.secs, op);
+
+       TRACE(s);
+       return 0;
+}
+
+/*
+ * Driver read hook.
+ *
+ * The request is consumed piece by piece.  For each piece the state of
+ * its first sector decides what happens:
+ *   - block unallocated or bit clear: the piece is forwarded with
+ *     td_forward_request();
+ *   - bit set: a data read is scheduled for the run of set sectors;
+ *   - bitmap not cached: a bitmap read is scheduled and the piece is
+ *     parked until it completes;
+ *   - bitmap read pending: the piece is parked.
+ * Pieces never cross a block boundary.  On any error the remainder of
+ * the request is completed with that error and processing stops.
+ */
+static void
+vhd_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       struct vhd_state *s = (struct vhd_state *)driver->data;
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x (seg: %d)\n",
+           s->vhd.file, treq.sec, treq.secs, treq.sidx);
+
+       while (treq.secs) {
+               int err;
+               td_request_t clone;
+
+               err   = 0;
+               clone = treq;
+
+               switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_READ)) {
+               case -EINVAL:
+                       err = -EINVAL;
+                       goto fail;
+
+               case VHD_BM_BAT_CLEAR:
+                       /* clamp to the end of the current block */
+                       clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+                       td_forward_request(clone);
+                       break;
+
+               case VHD_BM_BIT_CLEAR:
+                       clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
+                       td_forward_request(clone);
+                       break;
+
+               case VHD_BM_BIT_SET:
+                       clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
+                       err = schedule_data_read(s, clone, 0);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_NOT_CACHED:
+                       err = schedule_bitmap_read(s, clone.sec / s->spb);
+                       if (err)
+                               goto fail;
+
+                       clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+                       err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_READ_PENDING:
+                       clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+                       err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_BAT_LOCKED:
+               default:
+                       ASSERT(0);
+                       break;
+               }
+
+               /* advance past the piece that was just handled */
+               treq.sec  += clone.secs;
+               treq.secs -= clone.secs;
+               treq.buf  += vhd_sectors_to_bytes(clone.secs);
+               continue;
+
+       fail:
+               /* complete everything still outstanding with the error */
+               clone.secs = treq.secs;
+               td_complete_request(clone, err);
+               break;
+       }
+}
+
+/*
+ * Driver write hook.
+ *
+ * The request is consumed piece by piece.  For each piece the state of
+ * its first sector decides what happens:
+ *   - BAT locked for another block: the remainder fails with -EBUSY and
+ *     the completed request is marked blocked;
+ *   - block unallocated: a data write is scheduled that also updates the
+ *     BAT and the bitmap;
+ *   - bit clear: a data write is scheduled that also updates the bitmap;
+ *   - bit set: a plain data write is scheduled;
+ *   - bitmap not cached: a bitmap read is scheduled and the piece is
+ *     parked until it completes;
+ *   - bitmap read pending: the piece is parked.
+ * Pieces never cross a block boundary.  On any error the remainder of
+ * the request is completed with that error and processing stops.
+ */
+static void
+vhd_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       struct vhd_state *s = (struct vhd_state *)driver->data;
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x, (seg: %d)\n",
+           s->vhd.file, treq.sec, treq.secs, treq.sidx);
+
+       while (treq.secs) {
+               int err;
+               uint8_t flags;
+               td_request_t clone;
+
+               err   = 0;
+               flags = 0;
+               clone = treq;
+
+               switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_WRITE)) {
+               case -EINVAL:
+                       err = -EINVAL;
+                       goto fail;
+
+               case VHD_BM_BAT_LOCKED:
+                       err = -EBUSY;
+                       clone.blocked = 1;
+                       goto fail;
+
+               case VHD_BM_BAT_CLEAR:
+                       flags      = (VHD_FLAG_REQ_UPDATE_BAT |
+                                     VHD_FLAG_REQ_UPDATE_BITMAP);
+                       /* clamp to the end of the current block */
+                       clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+                       err        = schedule_data_write(s, clone, flags);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_BIT_CLEAR:
+                       flags      = VHD_FLAG_REQ_UPDATE_BITMAP;
+                       clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
+                       err        = schedule_data_write(s, clone, flags);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_BIT_SET:
+                       clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
+                       err = schedule_data_write(s, clone, 0);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_NOT_CACHED:
+                       clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+                       err = schedule_bitmap_read(s, clone.sec / s->spb);
+                       if (err)
+                               goto fail;
+
+                       err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_READ_PENDING:
+                       clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+                       err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
+                       if (err)
+                               goto fail;
+                       break;
+
+               default:
+                       ASSERT(0);
+                       break;
+               }
+
+               /* advance past the piece that was just handled */
+               treq.sec  += clone.secs;
+               treq.secs -= clone.secs;
+               treq.buf  += vhd_sectors_to_bytes(clone.secs);
+               continue;
+
+       fail:
+               /* complete everything still outstanding with the error */
+               clone.secs = treq.secs;
+               td_complete_request(clone, err);
+               break;
+       }
+}
+
+/*
+ * Complete and free every request on the chain starting at @list.
+ *
+ * Each request is completed with @error when it is non-zero, otherwise
+ * with the request's own error.  The request is then returned to the
+ * free pool and s->returned is incremented.  A NULL @list is a no-op.
+ */
+static inline void
+signal_completion(struct vhd_request *list, int error)
+{
+       struct vhd_state *s;
+       struct vhd_request *r, *next;
+
+       if (!list)
+               return;
+
+       r = list;
+       s = list->state;
+
+       while (r) {
+               int err;
+
+               err  = (error ? error : r->error);
+               /* grab the successor first: free_vhd_request() zeroes r */
+               next = r->next;
+               td_complete_request(r->treq, err);
+               DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", "
+                   "err: %d\n", r->treq.sec, r->treq.sec / s->spb, err);
+               free_vhd_request(s, r);
+               r    = next;
+
+               s->returned++;
+               TRACE(s);
+       }
+}
+
+/*
+ * Start a new transaction on @bm from the requests parked on bm->queue.
+ *
+ * Every queued request is unlinked, loses VHD_FLAG_REQ_QUEUED and is
+ * added to bm->tx.  Requests already flagged VHD_FLAG_REQ_FINISHED are
+ * counted as finished immediately and, if they carry no error, their
+ * sectors are set in the shadow bitmap.  The transaction is marked
+ * failed with -EIO when the block has no BAT entry (DD_BLK_UNUSED).
+ * Does nothing if the queue is empty.
+ */
+static void
+start_new_bitmap_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       int i;
+       struct vhd_transaction *tx;
+       struct vhd_request *r, *next;
+
+       if (!bm->queue.head)
+               return;
+
+       DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
+
+       r  = bm->queue.head;
+       tx = &bm->tx;
+       clear_req_list(&bm->queue);
+
+       /* r cannot be NULL here: an empty queue returned above */
+       if (bat_entry(s, bm->blk) == DD_BLK_UNUSED)
+               tx->error = -EIO;
+
+       while (r) {
+               /* detach r from the old list before handing it to tx */
+               next    = r->next;
+               r->next = NULL;
+               clear_vhd_flag(r->flags, VHD_FLAG_REQ_QUEUED);
+
+               add_to_transaction(tx, r);
+               if (test_vhd_flag(r->flags, VHD_FLAG_REQ_FINISHED)) {
+                       tx->finished++;
+                       if (!r->error) {
+                               /* sector offset within the block */
+                               u32 sec = r->treq.sec % s->spb;
+                               for (i = 0; i < r->treq.secs; i++)
+                                       vhd_bitmap_set(&s->vhd,
+                                                      bm->shadow, sec + i);
+                       }
+               }
+               r = next;
+       }
+
+       /* perhaps all the queued writes already completed? */
+       if (tx->started && transaction_completed(tx))
+               finish_data_transaction(s, bm);
+}
+
+/*
+ * Decide what happens to the BAT lock after progress on @bm's
+ * transaction.
+ *
+ * Nothing is done unless the BAT is locked and its pending block
+ * (s->bat.pbw_blk) is bm->blk.  If the BAT request recorded an error
+ * while the transaction is still flagged VHD_FLAG_TX_LIVE, the
+ * transaction is only marked closed; in every other case the BAT is
+ * unlocked and re-initialised.
+ */
+static void
+finish_bat_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       struct vhd_transaction *tx = &bm->tx;
+
+       if (!bat_locked(s) || s->bat.pbw_blk != bm->blk)
+               return;
+
+       if (s->bat.req.error && test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE)) {
+               tx->closed = 1;
+               return;
+       }
+
+       DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
+       unlock_bat(s);
+       init_bat(s);
+}
+
+/*
+ * Conclude the bitmap transaction of @bm with result @error.
+ *
+ * If the image was not opened with VHD_FLAG_OPEN_PREALLOCATE and the
+ * transaction is still flagged VHD_FLAG_TX_UPDATE_BAT, the transaction
+ * is parked in s->bat.req.tx and nothing else happens yet.  Otherwise
+ * the shadow bitmap is committed to the map (success) or rolled back
+ * from it (failure), all requests of the transaction are completed,
+ * the transaction is reset and any requests queued in the meantime are
+ * started as a new transaction.
+ */
+static void
+finish_bitmap_transaction(struct vhd_state *s,
+                         struct vhd_bitmap *bm, int error)
+{
+       struct vhd_transaction *tx = &bm->tx;
+       int map_size;
+
+       DBG(TLOG_DBG, "blk: 0x%04x, err: %d\n", bm->blk, error);
+
+       /* an error already recorded on the transaction takes precedence */
+       if (!tx->error)
+               tx->error = error;
+
+       map_size = vhd_sectors_to_bytes(s->bm_secs);
+
+       if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE) &&
+           test_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT)) {
+               /* still waiting for bat write */
+               ASSERT(bm->blk == s->bat.pbw_blk);
+               ASSERT(test_vhd_flag(s->bat.status,
+                                    VHD_FLAG_BAT_WRITE_STARTED));
+               s->bat.req.tx = tx;
+               return;
+       }
+
+       if (!tx->error) {
+               /* complete atomic write */
+               memcpy(bm->map, bm->shadow, map_size);
+               if (!test_batmap(s, bm->blk) && bitmap_full(s, bm))
+                       set_batmap(s, bm->blk);
+       } else {
+               /* undo changes to shadow */
+               memcpy(bm->shadow, bm->map, map_size);
+       }
+
+       /* transaction done; signal completions */
+       signal_completion(tx->requests.head, tx->error);
+       init_tx(tx);
+       start_new_bitmap_transaction(s, bm);
+
+       if (!bitmap_in_use(bm))
+               unlock_bitmap(bm);
+
+       finish_bat_transaction(s, bm);
+}
+
+/*
+ * Called once all data writes of @bm's transaction have finished.
+ *
+ * Marks the transaction closed.  If no error was recorded, the bitmap
+ * write is scheduled; otherwise the bitmap transaction is finished
+ * right away (tx->error is already set, so 0 is passed as the new
+ * error).
+ */
+static void
+finish_data_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       struct vhd_transaction *tx = &bm->tx;
+
+       DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
+
+       tx->closed = 1;
+
+       if (!tx->error) {
+               schedule_bitmap_write(s, bm->blk);
+               return;
+       }
+
+       finish_bitmap_transaction(s, bm, 0);
+}
+
+/*
+ * Completion handler for a BAT write (dispatched from vhd_complete()
+ * for VHD_OP_BAT_WRITE).
+ *
+ * Looks up the bitmap of the block whose BAT entry was being written
+ * (s->bat.pbw_blk).  On success the in-memory BAT entry is set to
+ * s->bat.pbw_offset and s->next_db is moved past it; on failure the
+ * error is recorded in the bitmap's transaction.  The follow-up depends
+ * on whether the image was opened with VHD_FLAG_OPEN_PREALLOCATE.
+ */
+static void
+finish_bat_write(struct vhd_request *req)
+{
+       struct vhd_bitmap *bm;
+       struct vhd_transaction *tx;
+       struct vhd_state *s = req->state;
+
+       s->returned++;
+       TRACE(s);
+
+       bm = get_bitmap(s, s->bat.pbw_blk);
+
+       DBG(TLOG_DBG, "blk 0x%04x, pbwo: 0x%08"PRIx64", err %d\n",
+           s->bat.pbw_blk, s->bat.pbw_offset, req->error);
+       ASSERT(bm && bitmap_valid(bm));
+       ASSERT(bat_locked(s) &&
+              test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));
+
+       tx = &bm->tx;
+       ASSERT(test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE));
+
+       if (!req->error) {
+               /* publish the new entry and advance next_db by spb +
+                * bm_secs sectors beyond the offset just written */
+               bat_entry(s, s->bat.pbw_blk) = s->bat.pbw_offset;
+               s->next_db = s->bat.pbw_offset + s->spb + s->bm_secs;
+       } else
+               tx->error = req->error;
+
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
+               /* here the BAT write is itself on the transaction's
+                * request list: count it and take it off the list */
+               tx->finished++;
+               remove_from_req_list(&tx->requests, req);
+               if (transaction_completed(tx))
+                       finish_data_transaction(s, bm);
+       } else {
+               /* the BAT update is no longer outstanding; resume the
+                * bitmap transaction if finish_bitmap_transaction()
+                * parked it in s->bat.req.tx waiting for this write */
+               clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
+               if (s->bat.req.tx)
+                       finish_bitmap_transaction(s, bm, req->error);
+       }
+
+       finish_bat_transaction(s, bm);
+}
+
+/*
+ * Completion handler dispatched from vhd_complete() for
+ * VHD_OP_ZERO_BM_WRITE (presumably the write of an all-zero bitmap for
+ * a newly allocated block -- confirm against the scheduling side).
+ *
+ * The request is counted as finished and removed from its
+ * transaction.  On error the BAT is unlocked and reset, the error is
+ * recorded on the transaction and VHD_FLAG_TX_UPDATE_BAT is cleared;
+ * on success the BAT write is scheduled.  If this was the last
+ * outstanding request the data transaction is finished.
+ */
+static void
+finish_zero_bm_write(struct vhd_request *req)
+{
+       u32 blk;
+       struct vhd_bitmap *bm;
+       struct vhd_transaction *tx = req->tx;
+       struct vhd_state *s = req->state;
+
+       s->returned++;
+       TRACE(s);
+
+       blk = req->treq.sec / s->spb;
+       bm  = get_bitmap(s, blk);
+
+       DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
+       ASSERT(bat_locked(s));
+       ASSERT(s->bat.pbw_blk == blk);
+       ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));
+
+       tx->finished++;
+       remove_from_req_list(&tx->requests, req);
+
+       if (req->error) {
+               /* give up on the BAT update for this block */
+               unlock_bat(s);
+               init_bat(s);
+               tx->error = req->error;
+               clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
+       } else
+               schedule_bat_write(s);
+
+       if (transaction_completed(tx))
+               finish_data_transaction(s, bm);
+}
+
+/*
+ * Completion handler for a bitmap read (VHD_OP_BITMAP_READ).
+ *
+ * Takes over the list of requests waiting on the bitmap and clears
+ * VHD_FLAG_BM_READ_PENDING.  On success the freshly read map is copied
+ * to the shadow bitmap and every waiting request is re-submitted
+ * through vhd_queue_read()/vhd_queue_write().  On failure the bitmap
+ * is unlocked and freed and all waiting requests are completed with
+ * the read error.
+ */
+static void
+finish_bitmap_read(struct vhd_request *req)
+{
+       u32 blk;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *r, *next;
+       struct vhd_state   *s = req->state;
+
+       s->returned++;
+       TRACE(s);
+
+       blk = req->treq.sec / s->spb;
+       bm  = get_bitmap(s, blk);
+
+       DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
+       ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));
+
+       r = bm->waiting.head;
+       clear_req_list(&bm->waiting);
+       clear_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
+
+       if (!req->error) {
+               memcpy(bm->shadow, bm->map, vhd_sectors_to_bytes(s->bm_secs));
+
+               while (r) {
+                       struct vhd_request tmp;
+
+                       /* keep a copy: r is released before the
+                        * request it described is re-queued */
+                       tmp  = *r;
+                       next =  r->next;
+                       free_vhd_request(s, r);
+
+                       ASSERT(tmp.op == VHD_OP_DATA_READ ||
+                              tmp.op == VHD_OP_DATA_WRITE);
+
+                       if (tmp.op == VHD_OP_DATA_READ)
+                               vhd_queue_read(s->driver, tmp.treq);
+                       else if (tmp.op == VHD_OP_DATA_WRITE)
+                               vhd_queue_write(s->driver, tmp.treq);
+
+                       r = next;
+               }
+       } else {
+               /* NOTE(review): err is copied before the bitmap is
+                * freed, presumably because req may live inside bm --
+                * confirm against the bitmap layout */
+               int err = req->error;
+               unlock_bitmap(bm);
+               free_vhd_bitmap(s, bm);
+               signal_completion(r, err);
+               return;
+       }
+
+       if (!bitmap_in_use(bm))
+               unlock_bitmap(bm);
+}
+
+/*
+ * Completion handler for a bitmap write (VHD_OP_BITMAP_WRITE).
+ *
+ * Clears VHD_FLAG_BM_WRITE_PENDING on the bitmap of the affected block
+ * and finishes that bitmap's transaction with the result of the write.
+ */
+static void
+finish_bitmap_write(struct vhd_request *req)
+{
+       u32 blk;
+       struct vhd_bitmap  *bm;
+       struct vhd_transaction *tx;
+       struct vhd_state *s = req->state;
+
+       s->returned++;
+       TRACE(s);
+
+       blk = req->treq.sec / s->spb;
+       bm  = get_bitmap(s, blk);
+
+       /* validate bm before anything reaches through it */
+       ASSERT(bm && bitmap_valid(bm));
+
+       tx  = &bm->tx;
+
+       DBG(TLOG_DBG, "blk: 0x%04x, started: %d, finished: %d\n",
+           blk, tx->started, tx->finished);
+       ASSERT(tx->closed);
+       ASSERT(test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));
+
+       clear_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);
+
+       finish_bitmap_transaction(s, bm, req->error);
+}
+
+/*
+ * Completion handler for a data read (VHD_OP_DATA_READ): log the
+ * request and complete it.  Passing 0 to signal_completion() makes the
+ * request report its own req->error.
+ */
+static void
+finish_data_read(struct vhd_request *req)
+{
+       struct vhd_state *s;
+
+       s = req->state;
+
+       DBG(TLOG_DBG, "lsec 0x%08"PRIx64", blk: 0x%04"PRIx64"\n",
+           req->treq.sec, req->treq.sec / s->spb);
+
+       signal_completion(req, 0);
+}
+
+/*
+ * Completion handler for a data write (VHD_OP_DATA_WRITE).
+ *
+ * The request is flagged VHD_FLAG_REQ_FINISHED.  If it belongs to a
+ * transaction, the transaction's finished count is bumped, the written
+ * sectors are set in the shadow bitmap on success, and the data
+ * transaction is finished once all of its requests are done.  A request
+ * without a transaction is completed immediately unless it is still
+ * flagged VHD_FLAG_REQ_QUEUED; queued requests are picked up later by
+ * start_new_bitmap_transaction(), which checks the FINISHED flag.
+ */
+static void
+finish_data_write(struct vhd_request *req)
+{
+       int i;
+       struct vhd_transaction *tx = req->tx;
+       struct vhd_state *s = (struct vhd_state *)req->state;
+
+       set_vhd_flag(req->flags, VHD_FLAG_REQ_FINISHED);
+
+       if (tx) {
+               u32 blk, sec;
+               struct vhd_bitmap *bm;
+
+               blk = req->treq.sec / s->spb;
+               sec = req->treq.sec % s->spb;
+               bm  = get_bitmap(s, blk);
+
+               ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));
+
+               tx->finished++;
+
+               DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", "
+                   "tx->started: %d, tx->finished: %d\n", req->treq.sec,
+                   req->treq.sec / s->spb, tx->started, tx->finished);
+
+               if (!req->error)
+                       for (i = 0; i < req->treq.secs; i++)
+                               vhd_bitmap_set(&s->vhd, bm->shadow,  sec + i);
+
+               if (transaction_completed(tx))
+                       finish_data_transaction(s, bm);
+
+       } else if (!test_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED)) {
+               ASSERT(!req->next);
+               DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64"\n",
+                   req->treq.sec, req->treq.sec / s->spb);
+               signal_completion(req, 0);
+       }
+}
+
+/*
+ * I/O completion callback.  @arg is the vhd_request the I/O was issued
+ * for and @err its result.  The result is stored in the request,
+ * failures are logged, and the request is handed to the finish_*()
+ * handler matching its operation type.
+ */
+void
+vhd_complete(void *arg, struct tiocb *tiocb, int err)
+{
+       struct vhd_request *req = arg;
+       struct vhd_state *s = req->state;
+
+       s->completed++;
+       TRACE(s);
+
+       req->error = err;
+
+       if (req->error) {
+               /* the iocb is only consulted for the error report */
+               struct iocb *io = &tiocb->iocb;
+
+               ERR(req->error, "%s: op: %u, lsec: %"PRIu64", secs: %u, "
+                   "nbytes: %lu, blk: %"PRIu64", blk_offset: %u",
+                   s->vhd.file, req->op, req->treq.sec, req->treq.secs,
+                   io->u.c.nbytes, req->treq.sec / s->spb,
+                   bat_entry(s, req->treq.sec / s->spb));
+       }
+
+       switch (req->op) {
+       case VHD_OP_DATA_READ:
+               finish_data_read(req);
+               break;
+
+       case VHD_OP_DATA_WRITE:
+               finish_data_write(req);
+               break;
+
+       case VHD_OP_BITMAP_READ:
+               finish_bitmap_read(req);
+               break;
+
+       case VHD_OP_BITMAP_WRITE:
+               finish_bitmap_write(req);
+               break;
+
+       case VHD_OP_ZERO_BM_WRITE:
+               finish_zero_bm_write(req);
+               break;
+
+       case VHD_OP_BAT_WRITE:
+               finish_bat_write(req);
+               break;
+
+       default:
+               /* unknown operation type */
+               ASSERT(0);
+               break;
+       }
+}
+
+/*
+ * Dump driver state to the log at TLOG_WARN: the request counters and
+ * average I/O sizes, every request slot with a non-zero sector count,
+ * every cached bitmap with its list lengths and transaction state, and
+ * the BAT status.
+ */
+void
+vhd_debug(td_driver_t *driver)
+{
+       int idx;
+       struct vhd_state *s = (struct vhd_state *)driver->data;
+
+       DBG(TLOG_WARN, "%s: QUEUED: 0x%08"PRIx64", COMPLETED: 0x%08"PRIx64", "
+           "RETURNED: 0x%08"PRIx64"\n", s->vhd.file, s->queued, s->completed,
+           s->returned);
+       DBG(TLOG_WARN, "WRITES: 0x%08"PRIx64", AVG_WRITE_SIZE: %f\n",
+           s->writes, (s->writes ? ((float)s->write_size / s->writes) : 0.0));
+       DBG(TLOG_WARN, "READS: 0x%08"PRIx64", AVG_READ_SIZE: %f\n",
+           s->reads, (s->reads ? ((float)s->read_size / s->reads) : 0.0));
+
+       DBG(TLOG_WARN, "ALLOCATED REQUESTS: (%lu total)\n", VHD_REQS_DATA);
+       for (idx = 0; idx < VHD_REQS_DATA; idx++) {
+               struct vhd_request *vreq = &s->vreq_list[idx];
+               td_request_t *treq       = &vreq->treq;
+
+               /* only slots with a non-zero sector count are reported */
+               if (!treq->secs)
+                       continue;
+
+               DBG(TLOG_WARN, "%d: id: 0x%04"PRIx64", err: %d, op: %d,"
+                   " lsec: 0x%08"PRIx64", flags: %d, this: %p, "
+                   "next: %p, tx: %p\n", idx, treq->id, vreq->error,
+                   vreq->op, treq->sec, vreq->flags, vreq, vreq->next,
+                   vreq->tx);
+       }
+
+       DBG(TLOG_WARN, "BITMAP CACHE:\n");
+       for (idx = 0; idx < VHD_CACHE_SIZE; idx++) {
+               int qnum = 0, wnum = 0, rnum = 0;
+               struct vhd_bitmap *bm = s->bitmap[idx];
+               struct vhd_transaction *tx;
+               struct vhd_request *r;
+
+               if (!bm)
+                       continue;
+
+               tx = &bm->tx;
+
+               /* lengths of the queue, waiting and transaction lists */
+               for (r = bm->queue.head; r; r = r->next)
+                       qnum++;
+
+               for (r = bm->waiting.head; r; r = r->next)
+                       wnum++;
+
+               for (r = tx->requests.head; r; r = r->next)
+                       rnum++;
+
+               DBG(TLOG_WARN, "%d: blk: 0x%04x, status: 0x%08x, q: %p, qnum: %d, w: %p, "
+                   "wnum: %d, locked: %d, in use: %d, tx: %p, tx_error: %d, "
+                   "started: %d, finished: %d, status: %u, reqs: %p, nreqs: %d\n",
+                   idx, bm->blk, bm->status, bm->queue.head, qnum,
+                   bm->waiting.head, wnum, bitmap_locked(bm),
+                   bitmap_in_use(bm), tx, tx->error, tx->started,
+                   tx->finished, tx->status, tx->requests.head, rnum);
+       }
+
+       DBG(TLOG_WARN, "BAT: status: 0x%08x, pbw_blk: 0x%04x, "
+           "pbw_off: 0x%08"PRIx64", tx: %p\n", s->bat.status, s->bat.pbw_blk,
+           s->bat.pbw_offset, s->bat.req.tx);
+
+/*
+       for (i = 0; i < s->hdr.max_bat_size; i++)
+               DPRINTF("%d: %u\n", i, s->bat.bat[i]);
+*/
+}
+
+/*
+ * Driver descriptor for the "tapdisk_vhd" disk type: binds the VHD
+ * entry points (open/close, queueing of reads and writes, parent
+ * lookup and validation, debug dump) and tells the caller how much
+ * private state (struct vhd_state) the driver needs.
+ */
+struct tap_disk tapdisk_vhd = {
+       .disk_type          = "tapdisk_vhd",
+       .flags              = 0,
+       .private_data_size  = sizeof(struct vhd_state),
+       .td_open            = _vhd_open,
+       .td_close           = _vhd_close,
+       .td_queue_read      = vhd_queue_read,
+       .td_queue_write     = vhd_queue_write,
+       .td_get_parent_id   = vhd_get_parent_id,
+       .td_validate_parent = vhd_validate_parent,
+       .td_debug           = vhd_debug,
+};
diff --git a/tools/blktap2/drivers/bswap.h b/tools/blktap2/drivers/bswap.h
new file mode 100644 (file)
index 0000000..7a2691b
--- /dev/null
@@ -0,0 +1,179 @@
+#ifndef BSWAP_H
+#define BSWAP_H
+
+//#include "config-host.h"
+
+#include <inttypes.h>
+
+#if defined(__NetBSD__)
+#include <sys/endian.h>
+#include <sys/types.h>
+#elif defined(__OpenBSD__)
+#include <machine/endian.h>
+#define bswap_16(x) swap16(x)
+#define bswap_32(x) swap32(x)
+#define bswap_64(x) swap64(x)
+#elif defined(__linux__)
+
+#include <endian.h>
+#include <byteswap.h>
+
+/* Return x with its two bytes exchanged (wraps bswap_16 from <byteswap.h>). */
+static inline uint16_t bswap16(uint16_t x)
+{
+    uint16_t swapped = bswap_16(x);
+
+    return swapped;
+}
+
+/* Return x with its four bytes reversed (wraps bswap_32 from <byteswap.h>). */
+static inline uint32_t bswap32(uint32_t x)
+{
+    uint32_t swapped = bswap_32(x);
+
+    return swapped;
+}
+
+/* Return x with its eight bytes reversed (wraps bswap_64 from <byteswap.h>). */
+static inline uint64_t bswap64(uint64_t x)
+{
+    uint64_t swapped = bswap_64(x);
+
+    return swapped;
+}
+
+/* Byte-swap the 16-bit value stored at s, in place. */
+static inline void bswap16s(uint16_t *s)
+{
+    uint16_t value = *s;
+
+    *s = bswap16(value);
+}
+
+/* Byte-swap the 32-bit value stored at s, in place. */
+static inline void bswap32s(uint32_t *s)
+{
+    uint32_t value = *s;
+
+    *s = bswap32(value);
+}
+
+/* Byte-swap the 64-bit value stored at s, in place. */
+static inline void bswap64s(uint64_t *s)
+{
+    uint64_t value = *s;
+
+    *s = bswap64(value);
+}
+
+#endif
+
+/*
+ * Internal helpers used by CPU_CONVERT; they are #undef'd again at the
+ * end of this header.
+ *
+ *   <endian>_bswap(v, size)   expression: v converted between CPU and
+ *                             <endian> byte order
+ *   <endian>_bswaps(p, size)  statement: convert *p in place
+ *
+ * The variant matching the host byte order (selected by
+ * WORDS_BIGENDIAN) is a no-op; the other one expands to bswap<size>().
+ * Note that the non-empty *_bswaps expansions carry their own trailing
+ * semicolon and use p unparenthesized, while the no-op ones expand to
+ * nothing at all.
+ */
+#if defined(WORDS_BIGENDIAN)
+#define be_bswap(v, size) (v)
+#define le_bswap(v, size) bswap ## size(v)
+#define be_bswaps(v, size)
+#define le_bswaps(p, size) *p = bswap ## size(*p);
+#else
+#define le_bswap(v, size) (v)
+#define be_bswap(v, size) bswap ## size(v)
+#define le_bswaps(v, size)
+#define be_bswaps(p, size) *p = bswap ## size(*p);
+#endif
+
+/*
+ * CPU_CONVERT(endian, size, type) defines six static inline helpers for
+ * one byte order / width combination.  With endian=be, size=32 these
+ * are:
+ *
+ *   be32_to_cpu(v)      value conversion, <endian> -> CPU
+ *   cpu_to_be32(v)      value conversion, CPU -> <endian>
+ *   be32_to_cpus(p)     in-place conversion of *p, <endian> -> CPU
+ *   cpu_to_be32s(p)     in-place conversion of *p, CPU -> <endian>
+ *   be32_to_cpup(p)     load *p and convert it to CPU order
+ *   cpu_to_be32w(p, v)  convert v and store it to *p
+ *
+ * Both directions expand to the same <endian>_bswap helper.  The
+ * in-place bodies deliberately have no semicolon after the helper
+ * invocation: the *_bswaps macros supply it (or expand to nothing).
+ */
+#define CPU_CONVERT(endian, size, type)\
+static inline type endian ## size ## _to_cpu(type v)\
+{\
+    return endian ## _bswap(v, size);\
+}\
+\
+static inline type cpu_to_ ## endian ## size(type v)\
+{\
+    return endian ## _bswap(v, size);\
+}\
+\
+static inline void endian ## size ## _to_cpus(type *p)\
+{\
+    endian ## _bswaps(p, size)\
+}\
+\
+static inline void cpu_to_ ## endian ## size ## s(type *p)\
+{\
+    endian ## _bswaps(p, size)\
+}\
+\
+static inline type endian ## size ## _to_cpup(const type *p)\
+{\
+    return endian ## size ## _to_cpu(*p);\
+}\
+\
+static inline void cpu_to_ ## endian ## size ## w(type *p, type v)\
+{\
+     *p = cpu_to_ ## endian ## size(v);\
+}
+
+/* Instantiate the helpers for big-endian data, 16/32/64 bits wide. */
+CPU_CONVERT(be, 16, uint16_t)
+CPU_CONVERT(be, 32, uint32_t)
+CPU_CONVERT(be, 64, uint64_t)
+
+/* Instantiate the helpers for little-endian data, 16/32/64 bits wide. */
+CPU_CONVERT(le, 16, uint16_t)
+CPU_CONVERT(le, 32, uint32_t)
+CPU_CONVERT(le, 64, uint64_t)
+
+/* unaligned versions (optimized for frequent unaligned accesses)*/
+
+#if defined(__i386__) || defined(__powerpc__)
+
+/*
+ * On i386 and powerpc the "unaligned" accessors are plain aliases of
+ * the aligned ones (presumably because these CPUs handle unaligned
+ * loads and stores themselves -- confirm for the targets in use).
+ * Only the accessors listed here exist; there are no unaligned
+ * big-endian readers in either branch of this #if.
+ */
+#define cpu_to_le16wu(p, v) cpu_to_le16w(p, v)
+#define cpu_to_le32wu(p, v) cpu_to_le32w(p, v)
+#define le16_to_cpupu(p) le16_to_cpup(p)
+#define le32_to_cpupu(p) le32_to_cpup(p)
+
+#define cpu_to_be16wu(p, v) cpu_to_be16w(p, v)
+#define cpu_to_be32wu(p, v) cpu_to_be32w(p, v)
+
+#else
+
+/* Store v at p in little-endian byte order, one byte at a time. */
+static inline void cpu_to_le16wu(uint16_t *p, uint16_t v)
+{
+    uint8_t *bytes = (uint8_t *)p;
+
+    bytes[0] = (uint8_t)(v & 0xff);
+    bytes[1] = (uint8_t)(v >> 8);
+}
+
+/* Store v at p in little-endian byte order, one byte at a time. */
+static inline void cpu_to_le32wu(uint32_t *p, uint32_t v)
+{
+    uint8_t *bytes = (uint8_t *)p;
+    int i;
+
+    /* least significant byte goes to the lowest address */
+    for (i = 0; i < 4; i++)
+        bytes[i] = (uint8_t)(v >> (8 * i));
+}
+
+/* Load a little-endian 16-bit value from p, one byte at a time. */
+static inline uint16_t le16_to_cpupu(const uint16_t *p)
+{
+    const uint8_t *bytes = (const uint8_t *)p;
+    uint16_t lo = bytes[0];
+    uint16_t hi = bytes[1];
+
+    return (uint16_t)((hi << 8) | lo);
+}
+
+/*
+ * Load a little-endian 32-bit value from p, one byte at a time.
+ *
+ * Each byte is widened to uint32_t before it is shifted: a plain
+ * uint8_t would be promoted to int, and shifting a byte >= 0x80 left
+ * by 24 would move a bit into the sign bit, which is undefined.
+ */
+static inline uint32_t le32_to_cpupu(const uint32_t *p)
+{
+    const uint8_t *p1 = (const uint8_t *)p;
+
+    return (uint32_t)p1[0] |
+           ((uint32_t)p1[1] << 8) |
+           ((uint32_t)p1[2] << 16) |
+           ((uint32_t)p1[3] << 24);
+}
+
+/* Store v at p in big-endian byte order, one byte at a time. */
+static inline void cpu_to_be16wu(uint16_t *p, uint16_t v)
+{
+    uint8_t *bytes = (uint8_t *)p;
+
+    bytes[1] = (uint8_t)(v & 0xff);
+    bytes[0] = (uint8_t)(v >> 8);
+}
+
+/* Store v at p in big-endian byte order, one byte at a time. */
+static inline void cpu_to_be32wu(uint32_t *p, uint32_t v)
+{
+    uint8_t *bytes = (uint8_t *)p;
+    int i;
+
+    /* fill from the last byte backwards, peeling off the low byte of v */
+    for (i = 3; i >= 0; i--) {
+        bytes[i] = (uint8_t)v;
+        v >>= 8;
+    }
+}
+
+#endif
+
+/*
+ * cpu_to_32wu: unaligned 32-bit store in the host's own byte order,
+ * i.e. the big-endian writer when WORDS_BIGENDIAN is defined and the
+ * little-endian writer otherwise.
+ */
+#ifdef WORDS_BIGENDIAN
+#define cpu_to_32wu cpu_to_be32wu
+#else
+#define cpu_to_32wu cpu_to_le32wu
+#endif
+
+/* The *_bswap/*_bswaps helpers were only needed to build CPU_CONVERT. */
+#undef le_bswap
+#undef be_bswap
+#undef le_bswaps
+#undef be_bswaps
+
+#endif /* BSWAP_H */
diff --git a/tools/blktap2/drivers/check_gcrypt b/tools/blktap2/drivers/check_gcrypt
new file mode 100644 (file)
index 0000000..09d524b
--- /dev/null
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+# Probe whether a program using libgcrypt can be compiled and linked.
+# $1 is the compiler command; it is expanded unquoted below, so it may
+# consist of several words.  Prints "yes" or "no" on stdout.
+# Scratch files named .gcrypt* are created in the current directory.
+
+# Minimal program referencing a libgcrypt function.
+cat > .gcrypt.c << EOF
+#include <gcrypt.h>
+int main(void) 
+{
+    gcry_md_hash_buffer(GCRY_MD_MD5, NULL, NULL, 0);
+    return 0; 
+}
+EOF
+
+# Compiler diagnostics are discarded; only the exit status matters.
+if $1 -o .gcrypt .gcrypt.c -lgcrypt 2>/dev/null ; then
+  echo "yes"
+else
+  echo "no"
+fi
+
+# Remove the probe source and binary (anything matching .gcrypt*).
+rm -f .gcrypt*
diff --git a/tools/blktap2/drivers/hashtable.c b/tools/blktap2/drivers/hashtable.c
new file mode 100644 (file)
index 0000000..90a6b85
--- /dev/null
@@ -0,0 +1,279 @@
+/* Copyright (C) 2004 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+/*
+ * There are duplicates of this code in:
+ *  - tools/xenstore/hashtable.c
+ */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/*
+Credit for primes table: Aaron Krowne
+ http://br.endernet.org/~akrowne/
+ http://planetmath.org/encyclopedia/GoodHashTablePrimes.html
+*/
+/*
+ * Permitted table sizes, in ascending order; each entry is roughly
+ * twice the previous one.  h->primeindex indexes into this array.
+ */
+static const unsigned int primes[] = {
+       53, 97, 193, 389,
+       769, 1543, 3079, 6151,
+       12289, 24593, 49157, 98317,
+       196613, 393241, 786433, 1572869,
+       3145739, 6291469, 12582917, 25165843,
+       50331653, 100663319, 201326611, 402653189,
+       805306457, 1610612741
+};
+/* Number of entries in primes[]. */
+const unsigned int prime_table_length = sizeof(primes)/sizeof(primes[0]);
+/* Entry-count / table-length ratio used to derive h->loadlimit. */
+const float max_load_factor = 0.65;
+
+/*****************************************************************************/
+/*
+ * Allocate an empty hashtable.
+ *
+ * minsize  lower bound for the number of buckets; the table length is
+ *          the first entry of primes[] greater than minsize
+ * hashf    hash function applied to keys
+ * eqf      key comparison function, non-zero meaning "equal"
+ *
+ * Returns the new table, or NULL if minsize exceeds 2^30 or memory
+ * could not be allocated.
+ */
+struct hashtable *
+create_hashtable(unsigned int minsize,
+                 unsigned int (*hashf) (void*),
+                 int (*eqf) (void*,void*))
+{
+       struct hashtable *h;
+       unsigned int pindex, size = primes[0];
+
+       /* Check requested hashtable isn't too large */
+       if (minsize > (1u << 30)) return NULL;
+       /* Enforce size as prime */
+       for (pindex=0; pindex < prime_table_length; pindex++) {
+               if (primes[pindex] > minsize) { size = primes[pindex]; break; }
+       }
+       h = malloc(sizeof(*h));
+       if (NULL == h) return NULL; /*oom*/
+       /* calloc zero-fills the buckets and fails cleanly if
+        * size * sizeof(pointer) does not fit in size_t */
+       h->table = calloc(size, sizeof(*h->table));
+       if (NULL == h->table) { free(h); return NULL; } /*oom*/
+       h->tablelength  = size;
+       h->primeindex   = pindex;
+       h->entrycount   = 0;
+       h->hashfn       = hashf;
+       h->eqfn         = eqf;
+       h->loadlimit    = (unsigned int) ceil(size * max_load_factor);
+       return h;
+}
+
+/*****************************************************************************/
+/*
+ * Compute the table hash of key k: the user-supplied h->hashfn result
+ * passed through a fixed sequence of shift/add/xor mixing steps.
+ */
+unsigned int
+hash(struct hashtable *h, void *k)
+{
+       /* Guard against weak hash functions by scrambling their output
+        * - mixing steps taken from the java 1.4 hashtable source */
+       unsigned int mixed = h->hashfn(k);
+
+       mixed = mixed + ~(mixed << 9);
+       mixed = mixed ^ ((mixed >> 14) | (mixed << 18)); /* >>> */
+       mixed = mixed + (mixed << 4);
+       mixed = mixed ^ ((mixed >> 10) | (mixed << 22)); /* >>> */
+
+       return mixed;
+}
+
+/*****************************************************************************/
+/*
+ * Grow the table to the next size in primes[] and redistribute all
+ * entries.  A fresh table is allocated when possible; if that fails
+ * the existing table is grown with realloc() and the chains are
+ * re-linked inside it.
+ *
+ * Returns non-zero (-1) on success.  Returns 0, leaving the table
+ * unchanged, when it is already at its largest size or no memory could
+ * be obtained.
+ */
+static int
+hashtable_expand(struct hashtable *h)
+{
+       /* Double the size of the table to accommodate more entries */
+       struct entry **newtable;
+       struct entry *e;
+       struct entry **pE;
+       unsigned int newsize, i, index;
+       /* Check we're not hitting max capacity */
+       if (h->primeindex == (prime_table_length - 1)) return 0;
+       newsize = primes[++(h->primeindex)];
+
+       newtable = (struct entry **)malloc(sizeof(struct entry*) * newsize);
+       if (NULL != newtable)
+       {
+               memset(newtable, 0, newsize * sizeof(struct entry *));
+               /* This algorithm is not 'stable'. ie. it reverses the list
+                * when it transfers entries between the tables */
+               for (i = 0; i < h->tablelength; i++) {
+                       while (NULL != (e = h->table[i])) {
+                               h->table[i] = e->next;
+                               index = indexFor(newsize,e->h);
+                               e->next = newtable[index];
+                               newtable[index] = e;
+                       }
+               }
+               free(h->table);
+               h->table = newtable;
+       }
+       /* Plan B: realloc instead */
+       else
+       {
+               newtable = (struct entry **)
+                       realloc(h->table, newsize * sizeof(struct entry *));
+               if (NULL == newtable) { (h->primeindex)--; return 0; }
+               h->table = newtable;
+               /* Zero only the newly added tail: pass the address of
+                * the first new slot and a length in bytes.  The first
+                * tablelength slots still hold the live chains. */
+               memset(&newtable[h->tablelength], 0,
+                      (newsize - h->tablelength) * sizeof(struct entry *));
+               for (i = 0; i < h->tablelength; i++) {
+                       for (pE = &(newtable[i]), e = *pE; e != NULL; e = *pE) {
+                               index = indexFor(newsize,e->h);
+                               if (index == i)
+                               {
+                                       /* stays in this bucket */
+                                       pE = &(e->next);
+                               }
+                               else
+                               {
+                                       /* unlink and push onto its new bucket */
+                                       *pE = e->next;
+                                       e->next = newtable[index];
+                                       newtable[index] = e;
+                               }
+                       }
+               }
+       }
+       h->tablelength = newsize;
+       h->loadlimit   = (unsigned int) ceil(newsize * max_load_factor);
+       return -1;
+}
+
+/*****************************************************************************/
+/* Return the number of entries currently stored in h. */
+unsigned int
+hashtable_count(struct hashtable *h)
+{
+       unsigned int stored = h->entrycount;
+
+       return stored;
+}
+
+/*****************************************************************************/
+/*
+ * Insert value v under key k.  The pointers are stored as given, not
+ * copied.  Returns non-zero (-1) on success and 0 if no memory was
+ * available for the new entry.
+ */
+int
+hashtable_insert(struct hashtable *h, void *k, void *v)
+{
+       /* Duplicate keys are not rejected - but they shouldn't be used */
+       struct entry *node;
+       unsigned int slot;
+
+       h->entrycount++;
+       if (h->entrycount > h->loadlimit) {
+               /* The result is deliberately ignored: if growing fails we
+                * still try to fit this one entry into the existing table
+                * -- there may be no memory for a bigger table, yet enough
+                * for one more element.  The next insert retries. */
+               hashtable_expand(h);
+       }
+
+       node = (struct entry *)malloc(sizeof(struct entry));
+       if (node == NULL) {
+               /* oom: undo the optimistic count bump */
+               h->entrycount--;
+               return 0;
+       }
+
+       node->h = hash(h, k);
+       node->k = k;
+       node->v = v;
+
+       /* push onto the front of the bucket's chain */
+       slot = indexFor(h->tablelength, node->h);
+       node->next = h->table[slot];
+       h->table[slot] = node;
+
+       return -1;
+}
+
+/*****************************************************************************/
+/*
+ * Look up key k.  Returns the value stored with the first matching
+ * entry of its bucket, or NULL when there is none.
+ */
+void * /* returns value associated with key */
+hashtable_search(struct hashtable *h, void *k)
+{
+       unsigned int hashvalue = hash(h,k);
+       struct entry *cur;
+
+       for (cur = h->table[indexFor(h->tablelength,hashvalue)];
+            cur != NULL;
+            cur = cur->next)
+       {
+               /* Cheap hash comparison first, full key comparison second */
+               if (hashvalue != cur->h)
+                       continue;
+               if (h->eqfn(k, cur->k))
+                       return cur->v;
+       }
+       return NULL;
+}
+
+/*****************************************************************************/
+/*
+ * Remove the first entry matching key k.  The entry and its stored key
+ * are released (the key via freekey()); the value is not freed but
+ * returned.  Returns NULL when no entry matches.
+ */
+void * /* returns value associated with key */
+hashtable_remove(struct hashtable *h, void *k)
+{
+       /* TODO: consider compacting the table when the load factor drops enough,
+        *       or provide a 'compact' method. */
+
+       struct entry *e;
+       struct entry **pE;
+       void *v;
+       unsigned int hashvalue, index;
+
+       /* hash once and reuse it for both bucket lookup and comparison */
+       hashvalue = hash(h,k);
+       index = indexFor(h->tablelength,hashvalue);
+       pE = &(h->table[index]);
+       e = *pE;
+       while (NULL != e)
+       {
+               /* Check hash value to short circuit heavier comparison */
+               if ((hashvalue == e->h) && (h->eqfn(k, e->k)))
+               {
+                       /* unlink via the pointer that referenced e */
+                       *pE = e->next;
+                       h->entrycount--;
+                       v = e->v;
+                       freekey(e->k);
+                       free(e);
+                       return v;
+               }
+               pE = &(e->next);
+               e = e->next;
+       }
+       return NULL;
+}
+
+/*****************************************************************************/
+/* destroy */
+/*
+ * Free the table, all of its entries and their keys (via freekey()).
+ * The stored values are free()d as well when free_values is non-zero.
+ */
+void
+hashtable_destroy(struct hashtable *h, int free_values)
+{
+       struct entry **table = h->table;
+       unsigned int slot;
+
+       for (slot = 0; slot < h->tablelength; slot++)
+       {
+               struct entry *cur = table[slot];
+
+               while (NULL != cur)
+               {
+                       struct entry *doomed = cur;
+
+                       /* step on before the node is released */
+                       cur = cur->next;
+                       freekey(doomed->k);
+                       if (free_values)
+                               free(doomed->v);
+                       free(doomed);
+               }
+       }
+
+       free(h->table);
+       free(h);
+}
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/tools/blktap2/drivers/hashtable.h b/tools/blktap2/drivers/hashtable.h
new file mode 100644 (file)
index 0000000..56ca053
--- /dev/null
@@ -0,0 +1,204 @@
+/* Copyright (C) 2002 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+/*
+ * There are duplicates of this code in:
+ *  - tools/xenstore/hashtable.h
+ */
+
+#ifndef __HASHTABLE_CWC22_H__
+#define __HASHTABLE_CWC22_H__
+
+struct hashtable;
+
+/* Example of use:
+ *
+ *      struct hashtable  *h;
+ *      struct some_key   *k;
+ *      struct some_value *v, *found;
+ *
+ *      static unsigned int         hash_from_key_fn( void *k );
+ *      static int                  keys_equal_fn ( void *key1, void *key2 );
+ *
+ *      h = create_hashtable(16, hash_from_key_fn, keys_equal_fn);
+ *      k = (struct some_key *)     malloc(sizeof(struct some_key));
+ *      v = (struct some_value *)   malloc(sizeof(struct some_value));
+ *
+ *      (initialise k and v to suitable values)
+ *
+ *      if (! hashtable_insert(h,k,v) )
+ *      {     exit(-1);               }
+ *
+ *      if (NULL == (found = hashtable_search(h,k) ))
+ *      {    printf("not found!");                  }
+ *
+ *      if (NULL == (found = hashtable_remove(h,k) ))
+ *      {    printf("Not found\n");                 }
+ *
+ */
+
+/* Macros may be used to define type-safe(r) hashtable access functions, with
+ * methods specialized to take known key and value types as parameters.
+ *
+ * Example:
+ *
+ * Insert this at the start of your file:
+ *
+ * DEFINE_HASHTABLE_INSERT(insert_some, struct some_key, struct some_value);
+ * DEFINE_HASHTABLE_SEARCH(search_some, struct some_key, struct some_value);
+ * DEFINE_HASHTABLE_REMOVE(remove_some, struct some_key, struct some_value);
+ *
+ * This defines the functions 'insert_some', 'search_some' and 'remove_some'.
+ * These operate just like hashtable_insert etc., with the same parameters,
+ * but their function signatures have 'struct some_key *' rather than
+ * 'void *', and hence can generate compile time errors if your program is
+ * supplying incorrect data as a key (and similarly for value).
+ *
+ * Note that the hash and key equality functions passed to create_hashtable
+ * still take 'void *' parameters instead of 'struct some_key *'. This shouldn't be
+ * a difficult issue as they're only defined and passed once, and the other
+ * functions will ensure that only valid keys are supplied to them.
+ *
+ * The cost for this checking is increased code size and runtime overhead
+ * - if performance is important, it may be worth switching back to the
+ * unsafe methods once your program has been debugged with the safe methods.
+ * This just requires switching to some simple alternative defines - eg:
+ * #define insert_some hashtable_insert
+ *
+ */
+
+/*****************************************************************************
+ * create_hashtable
+ *
+ * @name                    create_hashtable
+ * @param   minsize         minimum initial size of hashtable
+ * @param   hashfunction    function for hashing keys (key passed as void *)
+ * @param   key_eq_fn       function for determining key equality (keys as void *)
+ * @return                  newly created hashtable or NULL on failure
+ */
+
+struct hashtable *
+create_hashtable(unsigned int minsize,
+                 unsigned int (*hashfunction) (void*),
+                 int (*key_eq_fn) (void*,void*));
+
+/*****************************************************************************
+ * hashtable_insert
+ *
+ * @name        hashtable_insert
+ * @param   h   the hashtable to insert into
+ * @param   k   the key - hashtable claims ownership and will free on removal
+ * @param   v   the value - does not claim ownership
+ * @return      non-zero for successful insertion
+ *
+ * This function will cause the table to expand if the insertion would take
+ * the ratio of entries to table size over the maximum load factor.
+ *
+ * This function does not check for repeated insertions with a duplicate key.
+ * The value returned when using a duplicate key is undefined -- when
+ * the hashtable changes size, the order of retrieval of duplicate key
+ * entries is reversed.
+ * If in doubt, remove before insert.
+ */
+
+int
+hashtable_insert(struct hashtable *h, void *k, void *v);
+
+#define DEFINE_HASHTABLE_INSERT(fnname, keytype, valuetype) \
+int fnname (struct hashtable *h, keytype *k, valuetype *v) \
+{ /* typed wrapper: forwards h, k and v unchanged to hashtable_insert */ \
+    return hashtable_insert(h,k,v); \
+}
+
+/*****************************************************************************
+ * hashtable_search
+ *
+ * @name        hashtable_search
+ * @param   h   the hashtable to search
+ * @param   k   the key to search for  - does not claim ownership
+ * @return      the value associated with the key, or NULL if none found
+ */
+
+void *
+hashtable_search(struct hashtable *h, void *k);
+
+#define DEFINE_HASHTABLE_SEARCH(fnname, keytype, valuetype) \
+valuetype * fnname (struct hashtable *h, keytype *k) \
+{ /* typed wrapper: casts the void * result of hashtable_search to valuetype * */ \
+    return (valuetype *) (hashtable_search(h,k)); \
+}
+
+/*****************************************************************************
+ * hashtable_remove
+ *
+ * @name        hashtable_remove
+ * @param   h   the hashtable to remove the item from
+ * @param   k   the key to search for  - does not claim ownership
+ * @return      the value associated with the key, or NULL if none found
+ */
+
+void * /* returns value */
+hashtable_remove(struct hashtable *h, void *k);
+
+#define DEFINE_HASHTABLE_REMOVE(fnname, keytype, valuetype) \
+valuetype * fnname (struct hashtable *h, keytype *k) \
+{ /* typed wrapper: casts the void * result of hashtable_remove to valuetype * */ \
+    return (valuetype *) (hashtable_remove(h,k)); \
+}
+
+
+/*****************************************************************************
+ * hashtable_count
+ *
+ * @name        hashtable_count
+ * @param   h   the hashtable
+ * @return      the number of items stored in the hashtable
+ */
+unsigned int
+hashtable_count(struct hashtable *h);
+
+
+/*****************************************************************************
+ * hashtable_destroy
+ *
+ * @name        hashtable_destroy
+ * @param   h   the hashtable
+ * @param   free_values   whether to call 'free' on the remaining values
+ */
+
+void
+hashtable_destroy(struct hashtable *h, int free_values);
+
+#endif /* __HASHTABLE_CWC22_H__ */
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap2/drivers/hashtable_itr.c b/tools/blktap2/drivers/hashtable_itr.c
new file mode 100644 (file)
index 0000000..731917c
--- /dev/null
@@ -0,0 +1,195 @@
+/* Copyright (C) 2002, 2004 Christopher Clark  <firstname.lastname@cl.cam.ac.uk> */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include "hashtable_itr.h"
+#include <stdlib.h> /* defines NULL */
+
+struct hashtable_itr {
+       struct hashtable *h;      /* table being iterated */
+       struct entry *e;          /* current entry; NULL when there is none (empty table / end) */
+       struct entry *parent;     /* predecessor of e in its chain; NULL when e heads the chain */
+       unsigned int index;       /* bucket index of e; tablelength once iteration is exhausted */
+};
+
+/*****************************************************************************/
+/* hashtable_iterator - allocate an iterator for h, positioned on the head of
+ *                      the first non-empty bucket; returns NULL if malloc fails */
+struct hashtable_itr *
+hashtable_iterator(struct hashtable *h)
+{
+       struct hashtable_itr *it = malloc(sizeof(*it));
+       unsigned int nbuckets, b;
+
+       if (NULL == it) return NULL;
+       nbuckets = h->tablelength;
+       /* Start in the "exhausted" state: no entry, index one past the last bucket. */
+       it->h = h;
+       it->e = NULL;
+       it->parent = NULL;
+       it->index = nbuckets;
+       /* An empty table has nothing to position on. */
+       if (0 == h->entrycount) return it;
+
+       b = 0;
+       while (b < nbuckets && NULL == h->table[b]) b++;
+       if (b < nbuckets)
+       {
+               it->e = h->table[b];
+               it->index = b;
+       }
+       return it;
+}
+
+/*****************************************************************************/
+/* key      - return the key of the (key,value) pair at the current position,
+ *            or NULL if the iterator has no current entry */
+/* value    - return the value of the (key,value) pair at the current position */
+void *
+hashtable_iterator_key(struct hashtable_itr *i)
+{ return (NULL != i->e) ? i->e->k : NULL; }
+
+void * /* value at the current position; NULL if the iterator has no current entry */
+hashtable_iterator_value(struct hashtable_itr *i)
+{ return (NULL != i->e) ? i->e->v : NULL; }
+
+/*****************************************************************************/
+/* advance - move the iterator to the next entry: first along the current
+ *           bucket's chain, then to the head of the next non-empty bucket.
+ *           returns zero if advanced to end of table, non-zero (-1) otherwise */
+
+int
+hashtable_iterator_advance(struct hashtable_itr *itr)
+{
+       struct entry *cur = itr->e;
+       struct entry **buckets;
+       unsigned int b, nbuckets;
+
+       /* Already at the end (or never positioned): nothing to advance over. */
+       if (NULL == cur) return 0;
+       /* Easy case: the current chain has a successor. */
+       if (NULL != cur->next)
+       {
+               itr->parent = cur;
+               itr->e = cur->next;
+               return -1;
+       }
+
+       /* Chain exhausted: the next entry, if any, is the head of a later bucket. */
+       itr->parent = NULL;
+       nbuckets = itr->h->tablelength;
+       b = ++(itr->index);
+       if (b < nbuckets)
+       {
+               buckets = itr->h->table;
+               do {
+                       if (NULL == buckets[b]) continue;
+                       itr->index = b;
+                       itr->e = buckets[b];
+                       return -1;
+               } while (++b < nbuckets);
+               itr->index = nbuckets; /* ran off the end of the table */
+       }
+       itr->e = NULL;
+       return 0;
+}
+
+/*****************************************************************************/
+/* remove - remove the entry at the current iterator position
+ *          and advance the iterator, if there is a successive
+ *          element. The entry's key is freed; the value is not.
+ *          If you want the value, read it before you remove:
+ *          beware memory leaks if you don't.
+ *          Returns zero at end of iteration or if there is no current entry. */
+int
+hashtable_iterator_remove(struct hashtable_itr *itr)
+{
+       struct entry *remember_e, *remember_parent;
+       int ret;
+
+       /* Nothing to remove once the iterator is exhausted (same guard as advance) */
+       if (NULL == itr->e) return 0;
+
+       /* Unlink: a NULL parent means the element is the head of its chain */
+       if (NULL == (itr->parent)) {
+               itr->h->table[itr->index] = itr->e->next;
+       } else {
+               itr->parent->next = itr->e->next;
+       }
+       /* itr->e is now outside the hashtable */
+       remember_e = itr->e;
+       itr->h->entrycount--;
+       freekey(remember_e->k);
+
+       /* Advance the iterator, correcting the parent: advance may have stepped
+        * along remember_e->next and recorded the unlinked entry as parent */
+       remember_parent = itr->parent;
+       ret = hashtable_iterator_advance(itr);
+       if (itr->parent == remember_e) { itr->parent = remember_parent; }
+       free(remember_e);
+       return ret;
+}
+
+/*****************************************************************************/
+/* search - point itr at the entry of h whose key equals k.
+ *          itr is left untouched when no such entry exists. */
+int /* returns zero if not found, non-zero (-1) if found */
+hashtable_iterator_search(struct hashtable_itr *itr,
+                          struct hashtable *h, void *k)
+{
+       unsigned int hv = hash(h,k);
+       unsigned int bucket = indexFor(h->tablelength,hv);
+       struct entry *prev = NULL;
+       struct entry *cur;
+
+       /* Walk the bucket's chain, tracking the predecessor so that a later
+        * hashtable_iterator_remove can unlink the entry found here. */
+       for (cur = h->table[bucket]; NULL != cur; prev = cur, cur = cur->next)
+       {
+               /* Compare cached hashes first; only then pay for eqfn. */
+               if (hv != cur->h) continue;
+               if (!h->eqfn(k, cur->k)) continue;
+
+               itr->index = bucket;
+               itr->e = cur;
+               itr->parent = prev;
+               itr->h = h;
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+ * Copyright (c) 2002, 2004, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/tools/blktap2/drivers/hashtable_itr.h b/tools/blktap2/drivers/hashtable_itr.h
new file mode 100644 (file)
index 0000000..81da838
--- /dev/null
@@ -0,0 +1,96 @@
+/* Copyright (C) 2002, 2004 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#ifndef __HASHTABLE_ITR_CWC22__
+#define __HASHTABLE_ITR_CWC22__
+#include "hashtable.h"
+#include "hashtable_private.h" /* needed to enable inlining */
+
+struct hashtable_itr;
+
+/*****************************************************************************/
+/* hashtable_iterator - allocate an iterator over h, positioned on its first
+ *                      entry if there is one; returns NULL if malloc fails */
+
+struct hashtable_itr *
+hashtable_iterator(struct hashtable *h);
+
+/*****************************************************************************/
+/* hashtable_iterator_key
+ * - return the key of the (key,value) pair at the current position */
+
+void *
+hashtable_iterator_key(struct hashtable_itr *i);
+
+/*****************************************************************************/
+/* hashtable_iterator_value - return the value at the current iterator position */
+
+void *
+hashtable_iterator_value(struct hashtable_itr *i);
+
+/*****************************************************************************/
+/* advance - advance the iterator to the next element
+ *           returns zero if advanced to end of table, non-zero (-1) otherwise */
+
+int
+hashtable_iterator_advance(struct hashtable_itr *itr);
+
+/*****************************************************************************/
+/* remove - remove current element and advance the iterator to the next element
+ *          NB: the key is freed but the value is not - if you need the value
+ *          to free it, read it before removing. ie: beware memory leaks!
+ *          returns zero if advanced to end of table */
+
+int
+hashtable_iterator_remove(struct hashtable_itr *itr);
+
+/*****************************************************************************/
+/* search - overwrite the supplied iterator, to point to the entry
+ *          matching the supplied key.
+ *          h points to the hashtable to be searched.
+ *          returns zero if not found, non-zero (-1) if found. */
+int
+hashtable_iterator_search(struct hashtable_itr *itr,
+                          struct hashtable *h, void *k);
+
+#define DEFINE_HASHTABLE_ITERATOR_SEARCH(fnname, keytype) \
+int fnname (struct hashtable_itr *i, struct hashtable *h, keytype *k) \
+{ /* typed wrapper: forwards unchanged to hashtable_iterator_search */ \
+    return (hashtable_iterator_search(i,h,k)); \
+}
+
+
+
+#endif /* __HASHTABLE_ITR_CWC22__*/
+
+/*
+ * Copyright (c) 2002, 2004, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/tools/blktap2/drivers/hashtable_private.h b/tools/blktap2/drivers/hashtable_private.h
new file mode 100644 (file)
index 0000000..954ecc3
--- /dev/null
@@ -0,0 +1,90 @@
+/* Copyright (C) 2002, 2004 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+/*
+ * There are duplicates of this code in:
+ *  - tools/xenstore/hashtable_private.h
+ */
+
+#ifndef __HASHTABLE_PRIVATE_CWC22_H__
+#define __HASHTABLE_PRIVATE_CWC22_H__
+
+#include "hashtable.h"
+
+/*****************************************************************************/
+struct entry
+{
+    void *k, *v;            /* key and value pointers */
+    unsigned int h;         /* hash value compared before calling eqfn on the key */
+    struct entry *next;     /* next entry in the same bucket's chain, or NULL */
+};
+
+struct hashtable {
+    unsigned int tablelength;          /* number of buckets in table */
+    struct entry **table;              /* bucket array; each slot is a chain head or NULL */
+    unsigned int entrycount;           /* number of entries currently stored */
+    unsigned int loadlimit;            /* NOTE(review): presumably the entry count that triggers expansion - confirm */
+    unsigned int primeindex;           /* NOTE(review): presumably an index into a table of sizes - confirm */
+    unsigned int (*hashfn) (void *k);  /* key hashing callback */
+    int (*eqfn) (void *k1, void *k2);  /* key equality callback; non-zero is treated as "equal" */
+};
+
+/*****************************************************************************/
+unsigned int
+hash(struct hashtable *h, void *k); /* hash value of key k for table h; defined outside this header */
+
+/*****************************************************************************/
+/* indexFor - map hashvalue to a bucket index in [0, tablelength); tablelength != 0 */
+static inline unsigned int
+indexFor(unsigned int tablelength, unsigned int hashvalue) {
+    return (hashvalue % tablelength);
+}
+
+/* Only works if tablelength == 2^N */
+/*static inline unsigned int
+indexFor(unsigned int tablelength, unsigned int hashvalue)
+{
+    return (hashvalue & (tablelength - 1u));
+}
+*/
+
+/*****************************************************************************/
+#define freekey(X) free(X) /* keys are released with free(); users need <stdlib.h> in scope */
+/* alternative for keys that must not be freed: define freekey(X) ; */
+
+
+/*****************************************************************************/
+
+#endif /* __HASHTABLE_PRIVATE_CWC22_H__*/
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap2/drivers/hashtable_utility.c b/tools/blktap2/drivers/hashtable_utility.c
new file mode 100644 (file)
index 0000000..c21f6e4
--- /dev/null
@@ -0,0 +1,71 @@
+/* Copyright (C) 2002 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include "hashtable_utility.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/*****************************************************************************/
+/* hashtable_change
+ *
+ * function to change the value associated with a key, where there already
+ * exists a value bound to the key in the hashtable.
+ * The old value is released with free(), unless v is that same pointer.
+ * Returns non-zero (-1) if the key was found, zero otherwise (v not stored).
+ * Source due to Holger Schemel.
+ */
+int
+hashtable_change(struct hashtable *h, void *k, void *v)
+{
+       struct entry *e;
+       unsigned int hashvalue, index;
+
+       hashvalue = hash(h,k);
+       index = indexFor(h->tablelength,hashvalue);
+       for (e = h->table[index]; NULL != e; e = e->next)
+       {
+               /* Check hash value to short circuit heavier comparison */
+               if ((hashvalue != e->h) || !(h->eqfn(k, e->k))) continue;
+               /* Re-storing the current value must not free it, or the
+                * table would be left holding a dangling pointer */
+               if (e->v != v) free(e->v);
+               e->v = v;
+               return -1;
+       }
+       return 0;
+}
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/tools/blktap2/drivers/hashtable_utility.h b/tools/blktap2/drivers/hashtable_utility.h
new file mode 100644 (file)
index 0000000..f45b46f
--- /dev/null
@@ -0,0 +1,55 @@
+/* Copyright (C) 2002 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#ifndef __HASHTABLE_CWC22_UTILITY_H__
+#define __HASHTABLE_CWC22_UTILITY_H__
+
+/*****************************************************************************
+ * hashtable_change
+ *
+ * function to change the value associated with a key, where there already
+ * exists a value bound to the key in the hashtable.
+ * Source due to Holger Schemel.
+ *
+ * @name        hashtable_change
+ * @param   h   the hashtable
+ * @param   k   the key whose value is to be replaced
+ * @param   v   the new value; the previous value is passed to free()
+ * @return      non-zero (-1) if the key was found, zero if it was not
+ */
+int
+hashtable_change(struct hashtable *h, void *k, void *v);
+
+#endif /* __HASHTABLE_CWC22_UTILITY_H__ */
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap2/drivers/img2qcow.c b/tools/blktap2/drivers/img2qcow.c
new file mode 100644 (file)
index 0000000..7376382
--- /dev/null
@@ -0,0 +1,316 @@
+/* img2qcow.c
+ *
+ * Generates a qcow format disk and fills it from an existing image.
+ *
+ * (c) 2006 Julian Chesterfield and Andrew Warfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <zlib.h>
+#include <inttypes.h>
+#include <libaio.h>
+
+#include "bswap.h"
+#include "aes.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-disktype.h"
+#include "qcow.h"
+#include "blk.h"
+
+
+/*
+ * DFPRINTF() prints to stderr.  Changing the "#if 1" below to 0 compiles
+ * every DFPRINTF() call out (this includes the progress bar).
+ */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* *BSD has no O_LARGEFILE */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE    0
+#endif
+
+/* Size and alignment, in bytes, of the buffer used for each read of the source image */
+#define BLOCK_PROCESSSZ 4096
+/* Id used to initialise and look up the qcow VBD */
+#define QCOW_VBD 0
+/* Percentage of the image represented by one step of the progress bar */
+#define PROGRESS_QUANT 2
+
+/* running: main loop continues while set; complete: no more source data will be read */
+static int running = 1, complete = 0;
+/* completed writes (counted in send_responses) vs. writes queued by main() */
+static int returned_events = 0, submit_events = 0;
+/* NOTE(review): read_idx is not referenced anywhere in the visible part of this file */
+static uint32_t read_idx = 0;
+/* driver of the first image of qcow_vbd; set in main() */
+td_driver_t *ddqcow;
+/* the VBD obtained for QCOW_VBD; set in main() */
+td_vbd_t* qcow_vbd;
+/* prev: progress-bar steps drawn so far; NOTE(review): written is not referenced in the visible code */
+static uint64_t prev = 0, written = 0;
+/* progress-bar text: '[', the bar cells, ']' and a terminating NUL (laid out in main()) */
+static char output[(100/PROGRESS_QUANT) + 5];
+
+/* defined elsewhere; main() uses its aio_queue and scheduler members */
+extern tapdisk_server_t server;
+
+
+/*
+ * Debug helper: dump 'length' bytes starting at 'ptr' to stderr as hex,
+ * two digits per byte, with a space after every second byte and a line
+ * break after every 16th byte.
+ */
+static void print_bytes(void *ptr, int length)
+{
+       const unsigned char *p = ptr;
+       int k;
+
+       DFPRINTF("Buf dump, length %d:\n", length);
+       for (k = 0; k < length; k++) {
+               /* fixed two-digit width keeps adjacent bytes unambiguous */
+               DFPRINTF("%02x", p[k]);
+
+               /* separators follow a completed line / a completed byte pair */
+               if ((k + 1) % 16 == 0)
+                       DFPRINTF("\n");
+               else if ((k + 1) % 2 == 0)
+                       DFPRINTF(" ");
+       }
+       DFPRINTF("\n");
+}
+
+/*
+ * Advance the progress bar on stderr by one step when 'progress' has
+ * moved past the next PROGRESS_QUANT percent of 'size'.
+ *
+ * progress: amount handled so far
+ * size:     total amount, in the same unit (main() passes sector counts)
+ */
+static void debug_output(uint64_t progress, uint64_t size)
+{
+       const uint64_t steps = 100 / PROGRESS_QUANT;
+       /* amount of progress represented by one step of the bar */
+       uint64_t blocks = size / steps;
+
+       /* an image smaller than 'steps' units would divide by zero below */
+       if (blocks == 0)
+               return;
+
+       /*
+        * Bar is full: another "=>" at output+prev+1 would overwrite the
+        * closing ']' and then the terminating NUL of the output string.
+        */
+       if (prev >= steps)
+               return;
+
+       if (progress / blocks > prev) {
+               memcpy(output + prev + 1, "=>", 2);
+               prev++;
+               DFPRINTF("\r%s     %"PRIi64"%%",
+                        output, (int64_t)((prev - 1) * PROGRESS_QUANT));
+       }
+}
+
+/*
+ * Fill in driver->size and driver->sector_size for the image open on 'fd'.
+ *
+ * Block devices are queried through blk_getimagesize() and
+ * blk_getsectorsize(); for anything else the size is st_size shifted
+ * right by SECTOR_SHIFT and the sector size is DEFAULT_SECTOR_SIZE.
+ *
+ * Returns 0 on success, -EINVAL when the image cannot be examined.
+ */
+static int get_image_info(td_disk_info_t *driver, int fd)
+{
+       struct stat st;
+       uint64_t sector_size = DEFAULT_SECTOR_SIZE;
+
+       if (fstat(fd, &st) != 0) {
+               DFPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (S_ISBLK(st.st_mode)) {
+               /*Accessing block device directly*/
+               if (blk_getimagesize(fd, &driver->size) != 0)
+                       return -EINVAL;
+
+               DFPRINTF("Image size: \n\tpre sector_shift  [%"PRIu64"]\n\tpost "
+                       "sector_shift [%"PRIu64"]\n",
+                       (uint64_t)(driver->size << SECTOR_SHIFT),
+                       (uint64_t)driver->size);
+
+               /*
+                * Get the sector size.  If the query fails, fall back to the
+                * default so driver->sector_size is never left unset.
+                */
+               if (blk_getsectorsize(fd, &sector_size) != 0)
+                       sector_size = DEFAULT_SECTOR_SIZE;
+               driver->sector_size = sector_size;
+       } else {
+               /*Local file? try fstat instead*/
+               driver->size = (st.st_size >> SECTOR_SHIFT);
+               driver->sector_size = DEFAULT_SECTOR_SIZE;
+               DFPRINTF("Image size: [%"PRIu64"]\n",
+                       (uint64_t)driver->size);
+       }
+
+       return 0;
+}
+
+/*
+ * Completion callback installed on every write request queued by main().
+ *
+ * On success the completion is counted in returned_events and the request
+ * buffer is released.  A failed write is fatal: it would never be counted,
+ * so main() would wait for it forever, and the destination image would be
+ * missing data.
+ */
+void send_responses(td_request_t treq, int err)
+{
+       if (err < 0) {
+               DFPRINTF("AIO FAILURE: res [%d]!\n", err);
+               /* same fail-fast policy main() applies to its own I/O errors */
+               exit(-1);
+       }
+
+       returned_events++;
+
+       free(treq.buf);
+}
+
+/*
+ * usage: img2qcow <QCOW FILENAME> <SRC IMAGE>
+ *
+ * Creates a qcow file sized like the source image, opens it through the
+ * tapdisk VBD layer and copies the source into it in reads of up to
+ * BLOCK_PROCESSSZ bytes.  Each queued write is waited for before the next
+ * read is issued.
+ *
+ * Returns 0 when the transfer finished.  Setup failures either exit(-1)
+ * or return the error code reported by the tapdisk call that failed.
+ */
+int main(int argc, const char *argv[])
+{
+       int ret = -1, fd, len, err;
+       uint64_t i;
+       char *buf = NULL;
+       td_request_t treq;
+       td_disk_info_t info;
+       td_vbd_request_t *vreq;
+
+       if (argc != 3) {
+               fprintf(stderr, "Qcow-utils: v1.0.0\n");
+               fprintf(stderr, "usage: %s <QCOW FILENAME> <SRC IMAGE>\n",
+                       argv[0]);
+               exit(-1);
+       }
+
+       /*Open image*/
+       fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+       if (fd == -1) {
+               DFPRINTF("Unable to open [%s], (err %d)!\n",argv[2],0 - errno);
+               exit(-1);
+       }
+
+       /* info is only filled in when get_image_info() succeeds */
+       if (get_image_info(&info, fd) != 0) {
+               DFPRINTF("Unable to determine size of [%s]\n", argv[2]);
+               exit(-1);
+       }
+
+       /*Create qcow file*/
+       ret = qcow_create(argv[1],info.size<<SECTOR_SHIFT,NULL,0);
+       if (ret < 0) {
+               DFPRINTF("Unable to create QCOW file\n");
+               exit(-1);
+       }
+       DFPRINTF("Qcow file created: size %"PRIu64" sectors\n",
+                (uint64_t)info.size);
+
+       /* Open Qcow image*/
+       err = tapdisk_server_initialize();
+       if (err) {
+               DPRINTF("img2qcow Couldn't initialize server instance.\n");
+               return err;
+       }
+
+       err = tapdisk_vbd_initialize(QCOW_VBD);
+       if (err) {
+               DPRINTF("img2qcow Couldn't initialize qcow vbd.\n");
+               return err;
+       }
+
+       qcow_vbd = tapdisk_server_get_vbd(QCOW_VBD);
+       if (!qcow_vbd) {
+               err = -ENODEV;
+               DPRINTF("img2qcow Couldn't create qcow vbd.\n");
+               return err;
+       }
+
+       err = tapdisk_vbd_open_vdi(qcow_vbd, argv[1], DISK_TYPE_QCOW,
+                                  TAPDISK_STORAGE_TYPE_DEFAULT,
+                                  0);
+       if (err) {
+               DPRINTF("img2qcow Couldn't open qcow file.\n");
+               return err;
+       }
+
+       ddqcow = (tapdisk_vbd_first_image(qcow_vbd))->driver;
+
+       /*Initialise the output string*/
+       memset(output,0x20,(100/PROGRESS_QUANT)+5);
+       output[0] = '[';
+       output[(100/PROGRESS_QUANT)+2] = ']';
+       output[(100/PROGRESS_QUANT)+3] = '\0';
+       DFPRINTF("%s",output);
+
+       /* request fields that are not assigned below start out as zero */
+       memset(&treq, 0, sizeof(treq));
+
+       i = 0;
+       while (running) {
+
+               if (!complete) {
+                       /*Read sector from image*/
+                       if (lseek(fd, i*512, SEEK_SET) == (off_t)-1) {
+                               DFPRINTF("Unable to access file offset %"PRIu64"\n",
+                                      (uint64_t)i*512);
+                               exit(-1);
+                       }
+
+                       ret = posix_memalign((void **)&buf,
+                                            BLOCK_PROCESSSZ,
+                                            BLOCK_PROCESSSZ);
+                       if (ret != 0) {
+                               DFPRINTF("Unable to read memalign buf (%d)\n",ret);
+                               exit(-1);
+                       }
+
+                       /*We attempt to read 4k sized blocks*/
+                       len = read(fd, buf, BLOCK_PROCESSSZ);
+                       if (len < 512) {
+                               DFPRINTF("Unable to read sector %"PRIu64"\n",
+                                        (uint64_t) (i));
+                               /* this buffer is never queued, so release it here */
+                               free(buf);
+                               buf = NULL;
+                               complete = 1;
+                               continue;
+                       }
+
+                       /* whole sectors actually read */
+                       len = (len >> 9);
+
+                       vreq = calloc(1, sizeof(td_vbd_request_t));
+                       if (!vreq) {
+                               DFPRINTF("Unable to allocate vbd request\n");
+                               exit(-1);
+                       }
+
+                       treq.op = TD_OP_WRITE;
+                       treq.buf = buf;
+                       treq.sec = i;
+                       treq.secs = len;
+                       treq.image = 0;
+                       treq.cb = send_responses;
+                       treq.cb_data = buf;
+                       treq.id = 0;
+                       treq.sidx = 0;
+                       treq.private = vreq;
+
+                       vreq->submitting = 1;
+                       INIT_LIST_HEAD(&vreq->next);
+                       tapdisk_vbd_move_request(treq.private,
+                                                &qcow_vbd->pending_requests);
+
+                       ddqcow->ops->td_queue_write(ddqcow,treq);
+                       --vreq->submitting;
+
+                       submit_events++;
+
+                       i += len;
+
+                       if (i == info.size)
+                               complete = 1;
+
+                       tapdisk_submit_all_tiocbs(&server.aio_queue);
+                       debug_output(i,info.size);
+               }
+
+               /* wait until every queued write has been reported back */
+               while (returned_events != submit_events) {
+                       ret = scheduler_wait_for_events(&server.scheduler);
+                       if (ret < 0) {
+                               DFPRINTF("server wait returned %d\n", ret);
+                               sleep(2);
+                       }
+               }
+
+               if (complete && (returned_events == submit_events))
+                       running = 0;
+       }
+
+       /* turn the trailing '>' into '=' unless that cell lies outside the bar */
+       if (prev <= 100/PROGRESS_QUANT)
+               memcpy(output+prev+1,"=",1);
+       DFPRINTF("\r%s     100%%\nTRANSFER COMPLETE\n\n", output);
+
+       ddqcow->ops->td_close(ddqcow);
+       free(ddqcow->data);
+       close(fd);
+
+       return 0;
+}
diff --git a/tools/blktap2/drivers/io-optimize.c b/tools/blktap2/drivers/io-optimize.c
new file mode 100644 (file)
index 0000000..86fd7e6
--- /dev/null
@@ -0,0 +1,671 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include "io-optimize.h"
+#include "tapdisk-log.h"
+
+/*
+ * DBG(): debug trace.  With DEBUG defined (and TEST not defined) it goes
+ * to tlog_write() at TLOG_DBG level; with TEST defined it goes to stdout
+ * via printf(); otherwise it expands to nothing.  The ctx argument is
+ * accepted but not used by any of the three variants.
+ */
+#if (!defined(TEST) && defined(DEBUG))
+#define DBG(ctx, f, a...) tlog_write(TLOG_DBG, f, ##a)
+#elif defined(TEST)
+#define DBG(ctx, f, a...) printf(f, ##a)
+#else
+#define DBG(ctx, f, a...) ((void)0)
+#endif
+
+/* forward declaration: io_merge() calls this before its definition below */
+static void print_merged_iocbs(struct opioctx *ctx, 
+                              struct iocb **iocbs, int num_iocbs);
+
+/*
+ * Release the four arrays owned by ctx.  The pointers are reset to NULL
+ * afterwards, so calling this again on the same ctx is harmless.
+ */
+void
+opio_free(struct opioctx *ctx)
+{
+       free(ctx->event_queue);
+       free(ctx->iocb_queue);
+       free(ctx->free_opios);
+       free(ctx->opios);
+
+       ctx->opios       = NULL;
+       ctx->free_opios  = NULL;
+       ctx->iocb_queue  = NULL;
+       ctx->event_queue = NULL;
+}
+
+/*
+ * Prepare ctx for tracking up to num_iocbs iocbs: allocates the opio pool,
+ * the free list and the two scratch queues, and puts every opio on the
+ * free list.
+ *
+ * Returns 0 on success, -EINVAL for a negative count and -ENOMEM when an
+ * allocation fails.  On failure ctx holds no memory and all its counters
+ * are zero.
+ */
+int
+opio_init(struct opioctx *ctx, int num_iocbs)
+{
+       int i;
+
+       memset(ctx, 0, sizeof(*ctx));
+
+       if (num_iocbs < 0)
+               return -EINVAL;
+
+       /* calloc(n, size) rejects an n * size product that would overflow */
+       ctx->opios       = calloc(num_iocbs, sizeof(*ctx->opios));
+       ctx->free_opios  = calloc(num_iocbs, sizeof(*ctx->free_opios));
+       ctx->iocb_queue  = calloc(num_iocbs, sizeof(*ctx->iocb_queue));
+       ctx->event_queue = calloc(num_iocbs, sizeof(*ctx->event_queue));
+
+       if (!ctx->opios || !ctx->free_opios ||
+           !ctx->iocb_queue || !ctx->event_queue)
+               goto fail;
+
+       /* publish the counts only once the arrays behind them exist */
+       ctx->num_opios     = num_iocbs;
+       ctx->free_opio_cnt = num_iocbs;
+
+       for (i = 0; i < num_iocbs; i++)
+               ctx->free_opios[i] = &ctx->opios[i];
+
+       return 0;
+
+ fail:
+       opio_free(ctx);
+       return -ENOMEM;
+}
+
+/* Pop one opio off the free list; NULL when the list is empty. */
+static inline struct opio *
+alloc_opio(struct opioctx *ctx)
+{
+       struct opio *op = NULL;
+
+       if (ctx->free_opio_cnt > 0) {
+               ctx->free_opio_cnt--;
+               op = ctx->free_opios[ctx->free_opio_cnt];
+       }
+
+       return op;
+}
+
+/* Wipe op and push it back onto the free list of ctx. */
+static inline void
+free_opio(struct opioctx *ctx, struct opio *op)
+{
+       memset(op, 0, sizeof(*op));
+
+       ctx->free_opios[ctx->free_opio_cnt] = op;
+       ctx->free_opio_cnt++;
+}
+
+/*
+ * Put back the data, buf and nbytes values that opio_iocb_init() saved
+ * from the iocb tracked by op.
+ */
+static inline void
+restore_iocb(struct opio *op)
+{
+       struct iocb *orig = op->iocb;
+
+       orig->u.c.nbytes = op->nbytes;
+       orig->u.c.buf    = op->buf;
+       orig->data       = op->data;
+}
+
+/*
+ * Non-zero when io->data points into the opio pool of ctx, i.e. the iocb
+ * is currently tracked by an opio.
+ */
+static inline int
+iocb_optimized(struct opioctx *ctx, struct iocb *io)
+{
+       unsigned long lo = (unsigned long)ctx->opios;
+       unsigned long hi = lo + (ctx->num_opios * sizeof(struct opio));
+       unsigned long p  = (unsigned long)io->data;
+
+       if (p < lo)
+               return 0;
+
+       return (p < hi);
+}
+
+/* Non-zero when r starts at the file offset where l ends. */
+static inline int
+contiguous_sectors(struct iocb *l, struct iocb *r)
+{
+       return (r->u.c.offset == l->u.c.offset + l->u.c.nbytes);
+}
+
+/* Non-zero when the buffer of r begins right after the buffer of l. */
+static inline int
+contiguous_buffers(struct iocb *l, struct iocb *r)
+{
+       return (r->u.c.buf == l->u.c.buf + l->u.c.nbytes);
+}
+
+/*
+ * Non-zero when l and r target the same file descriptor and r continues
+ * l both in file offset and in memory.
+ */
+static inline int
+contiguous_iocbs(struct iocb *l, struct iocb *r)
+{
+       if (l->aio_fildes != r->aio_fildes)
+               return 0;
+
+       if (!contiguous_sectors(l, r))
+               return 0;
+
+       return (contiguous_buffers(l, r) != 0);
+}
+
+/* Make op a one-element chain: it is both head and tail of its list. */
+static inline void
+init_opio_list(struct opio *op)
+{
+       op->list.tail = op;
+       op->list.head = op;
+}
+
+/*
+ * Start tracking io: take an opio from the pool, save the iocb's buf,
+ * nbytes, offset and data in it, and point io->data at the opio.
+ * Returns the opio, or NULL when the pool is exhausted.
+ */
+static struct opio *
+opio_iocb_init(struct opioctx *ctx, struct iocb *io)
+{
+       struct opio *op = alloc_opio(ctx);
+
+       if (op == NULL)
+               return NULL;
+
+       /* save the original values before io->data is redirected */
+       op->iocb   = io;
+       op->data   = io->data;
+       op->offset = io->u.c.offset;
+       op->nbytes = io->u.c.nbytes;
+       op->buf    = io->u.c.buf;
+
+       io->data = op;
+       init_opio_list(op);
+
+       return op;
+}
+
+/*
+ * Return the opio tracking io, creating one when io is not tracked yet.
+ * NULL when a new opio is needed but none is free.
+ */
+static inline struct opio *
+opio_get(struct opioctx *ctx, struct iocb *io)
+{
+       if (!iocb_optimized(ctx, io))
+               return opio_iocb_init(ctx, io);
+
+       return (struct opio *)io->data;
+}
+
+/*
+ * Append io to the chain headed by head: head's byte count grows by io's
+ * and io's opio becomes the new tail of the chain.
+ * Returns 0, or -ENOMEM when no opio is available for either iocb.
+ */
+static int
+merge_tail(struct opioctx *ctx, struct iocb *head, struct iocb *io)
+{
+       struct opio *first, *added;
+
+       first = opio_get(ctx, head);
+       if (first == NULL)
+               return -ENOMEM;
+
+       added = opio_get(ctx, io);
+       if (added == NULL)
+               return -ENOMEM;
+
+       added->head       = first;
+       head->u.c.nbytes += io->u.c.nbytes;
+
+       /* link behind the current tail, then advance the tail */
+       first->list.tail->next = added;
+       first->list.tail       = added;
+
+       return 0;
+}
+
+/*
+ * Merge io into head when both carry the same opcode and io directly
+ * continues head.  Returns 0 when merged, -EINVAL when the two cannot be
+ * merged, or the error reported by merge_tail().
+ */
+static int
+merge(struct opioctx *ctx, struct iocb *head, struct iocb *io)
+{
+       if (head->aio_lio_opcode == io->aio_lio_opcode &&
+           contiguous_iocbs(head, io))
+               return merge_tail(ctx, head, io);
+
+       return -EINVAL;
+}
+
+/*
+ * Coalesce adjacent entries of queue[0..num-1] in place.
+ *
+ * Each entry is merged into the last entry kept so far when merge()
+ * accepts it; otherwise it is kept as the next entry.  Merged-away iocbs
+ * stay reachable through the opio chain of the entry that absorbed them.
+ *
+ * Returns the number of entries left at the front of queue, or 0 when
+ * num is not positive.  The scratch copy uses ctx->iocb_queue, which
+ * opio_init() sizes for the num_iocbs it was given.
+ */
+int
+io_merge(struct opioctx *ctx, struct iocb **queue, int num)
+{
+       int i, on_queue;
+       struct iocb **q;
+
+       /* a negative count must not reach memcpy() as a huge size */
+       if (num <= 0)
+               return 0;
+
+       on_queue = 0;
+       q = ctx->iocb_queue;
+       memcpy(q, queue, num * sizeof(struct iocb *));
+
+       for (i = 1; i < num; i++) {
+               struct iocb *io = q[i];
+
+               if (merge(ctx, queue[on_queue], io) != 0)
+                       queue[++on_queue] = io;
+       }
+
+#if (defined(TEST) || defined(DEBUG))
+       print_merged_iocbs(ctx, queue, on_queue + 1);
+#endif
+
+       return on_queue + 1;
+}
+
+/*
+ * Undo a merge: walk the opio chain hanging off io->data, restore every
+ * iocb on it, store the iocbs into queue[0..] in chain order and return
+ * the opios to the pool.  Returns the number of iocbs written to queue.
+ */
+static int
+expand_iocb(struct opioctx *ctx, struct iocb **queue, struct iocb *io)
+{
+       int n = 0;
+       struct opio *cur, *following;
+
+       for (cur = (struct opio *)io->data; cur; cur = following) {
+               /* read the link first: free_opio() zeroes the opio */
+               following = cur->next;
+
+               restore_iocb(cur);
+               queue[n++] = cur->iocb;
+               free_opio(ctx, cur);
+       }
+
+       return n;
+}
+
+/*
+ * Rebuild queue from its entries idx..num-1, replacing every merged iocb
+ * by the individual iocbs it stands for.  The result is written to the
+ * front of queue (entries before idx are not carried over).
+ *
+ * Returns the number of entries now in queue, or 0 when num is not
+ * positive.  queue must have room for the expanded entries.
+ */
+int
+io_expand_iocbs(struct opioctx *ctx, struct iocb **queue, int idx, int num)
+{
+       int i, on_queue;
+       struct iocb **q;
+
+       /* a negative count must not reach memcpy() as a huge size */
+       if (num <= 0)
+               return 0;
+
+       on_queue = 0;
+       q = ctx->iocb_queue;
+       memcpy(q, queue, num * sizeof(struct iocb *));
+
+       for (i = idx; i < num; i++) {
+               struct iocb *io = q[i];
+
+               if (!iocb_optimized(ctx, io))
+                       queue[on_queue++] = io;
+               else
+                       on_queue += expand_iocb(ctx, queue + on_queue, io);
+       }
+
+       return on_queue;
+}
+
+/*
+ * Turn the completion of a merged iocb into one event per original iocb,
+ * stored in queue starting at index idx.
+ *
+ * When the merged request completed in full, each event reports its own
+ * byte count.  Otherwise every event carries the same error: the negative
+ * result itself, or -EIO for a short transfer.
+ *
+ * Returns the index following the last event written.
+ */
+static int
+expand_event(struct opioctx *ctx,
+            struct io_event *event, struct io_event *queue, int idx)
+{
+       struct iocb *io = event->obj;
+       struct opio *op = (struct opio *)io->data;
+       struct opio *following;
+       struct io_event *ep;
+       int err = -EIO;
+
+       if (event->res == io->u.c.nbytes)
+               err = 0;
+       else if ((int)event->res < 0)
+               err = (int)event->res;
+
+       for (; op; op = following) {
+               /* read the link first: free_opio() zeroes the opio */
+               following = op->next;
+
+               ep      = &queue[idx];
+               idx++;
+               ep->obj = op->iocb;
+               if (err)
+                       ep->res = err;
+               else
+                       ep->res = op->nbytes;
+
+               restore_iocb(op);
+               free_opio(ctx, op);
+       }
+
+       return idx;
+}
+
+/*
+ * Rewrite events[0..num-1] in place so that every event belonging to a
+ * merged iocb is replaced by one event per original iocb.  Events of
+ * iocbs that were never merged are copied through unchanged.
+ *
+ * Returns the number of events now in the array, or 0 when num is not
+ * positive.  events must have room for the expanded result.
+ */
+int
+io_split(struct opioctx *ctx, struct io_event *events, int num)
+{
+       int i, on_queue;
+       struct io_event *q;
+
+       /* a negative count must not reach memcpy() as a huge size */
+       if (num <= 0)
+               return 0;
+
+       on_queue = 0;
+       q = ctx->event_queue;
+       memcpy(q, events, num * sizeof(struct io_event));
+
+       for (i = 0; i < num; i++) {
+               struct io_event *ep = &q[i];
+               struct iocb *io = ep->obj;
+
+               if (!iocb_optimized(ctx, io))
+                       events[on_queue++] = *ep;
+               else
+                       on_queue = expand_event(ctx, ep, events, on_queue);
+       }
+
+       return on_queue;
+}
+
+/******************************************************************************
+debug print functions
+******************************************************************************/
+/*
+ * Trace one iocb through DBG(): offset, byte count, buffer, read/write
+ * type, data pointer and whether it is currently tracked by an opio.
+ * prefix is printed at the start of the line.
+ */
+static inline void
+__print_iocb(struct opioctx *ctx, struct iocb *io, char *prefix)
+{
+       char *type = "write";
+
+       if (io->aio_lio_opcode == IO_CMD_PREAD)
+               type = "read";
+
+       DBG(ctx, "%soff: %08llx, nbytes: %04lx, buf: %p, type: %s, data: %08lx,"
+           " optimized: %d\n", prefix, io->u.c.offset, io->u.c.nbytes,
+           io->u.c.buf, type, (unsigned long)io->data,
+           iocb_optimized(ctx, io));
+}
+
+/* print_iocb(): __print_iocb() with an empty line prefix */
+static char *null_prefix = "";
+#define print_iocb(ctx, io) __print_iocb(ctx, io, null_prefix)
+
+/* Trace the first num_iocbs entries of iocbs, each prefixed by its index. */
+static void
+print_iocbs(struct opioctx *ctx, struct iocb **iocbs, int num_iocbs)
+{
+       int idx = 0;
+       char label[10];
+
+       DBG(ctx, "iocbs:\n");
+       while (idx < num_iocbs) {
+               snprintf(label, sizeof(label), "%d: ", idx);
+               __print_iocb(ctx, iocbs[idx], label);
+               idx++;
+       }
+}
+
+/*
+ * Trace every iocb on the opio chain starting at op, indented, numbering
+ * them from *cnt and advancing *cnt once per iocb.
+ */
+static void
+print_optimized_iocbs(struct opioctx *ctx, struct opio *op, int *cnt)
+{
+       char label[10];
+
+       for (; op; op = op->next) {
+               snprintf(label, sizeof(label), "  %d: ", *cnt);
+               (*cnt)++;
+               __print_iocb(ctx, op->iocb, label);
+       }
+}
+
+/*
+ * Trace a merged queue: each entry is printed, followed (indented) by the
+ * iocbs that were merged into it.  All lines share one running counter.
+ */
+static void
+print_merged_iocbs(struct opioctx *ctx, struct iocb **iocbs, int num_iocbs)
+{
+       int i, cnt = 0;
+       char label[10];
+
+       DBG(ctx, "merged iocbs:\n");
+       for (i = 0; i < num_iocbs; i++) {
+               struct iocb *io = iocbs[i];
+
+               snprintf(label, sizeof(label), "%d: ", cnt);
+               cnt++;
+               __print_iocb(ctx, io, label);
+
+               if (!iocb_optimized(ctx, io))
+                       continue;
+
+               /* the head itself was printed above; continue with the rest */
+               print_optimized_iocbs(ctx, ((struct opio *)io->data)->next,
+                                     &cnt);
+       }
+}
+
+/* Trace the iocb referenced by each of the first num_events events. */
+static void
+print_events(struct opioctx *ctx, struct io_event *events, int num_events)
+{
+       int i = 0;
+
+       while (i < num_events) {
+               struct iocb *io = events[i].obj;
+
+               print_iocb(ctx, io);
+               i++;
+       }
+}
+/******************************************************************************
+end debug print functions
+******************************************************************************/
+
+#if defined(TEST)
+
+/*
+ * Test-only tagging of iocb->data: the low 28 bits hold the iocb's index,
+ * hmask marks the first iocb of a generated group and smask marks iocbs
+ * whose buffers were allocated one by one ("sparse").
+ */
+#define hmask 0x80000000UL
+#define smask 0x40000000UL
+#define make_data(idx, is_head, sparse) \
+         (void *)((idx) | ((is_head) ? hmask : 0) | ((sparse) ? smask : 0))
+#define data_idx(data)          (int)((unsigned long)(data) & (0x0fffffff))
+#define data_is_head(data)      (((unsigned long)(data) & hmask) ? 1 : 0)
+#define data_is_sparse(data)    (((unsigned long)(data) & smask) ? 1 : 0)
+
+/* Print the command-line synopsis to stderr and exit with status -1. */
+static void
+usage(void)
+{
+       fputs("usage: io_optimize [-n num_runs] "
+             "[-i num_iocbs] [-s num_secs] [-r random_seed]\n", stderr);
+       exit(-1);
+}
+
+static int xalloc_cnt, xfree_cnt;
+/*
+ * malloc() wrapper for the test: exits with ENOMEM on failure and counts
+ * every successful allocation in xalloc_cnt.
+ */
+static inline char *
+xalloc(int size)
+{
+       char *mem = malloc(size);
+
+       if (mem == NULL) {
+               fprintf(stderr, "xalloc failed\n");
+               exit(ENOMEM);
+       }
+
+       ++xalloc_cnt;
+       return mem;
+}
+
+/* free() wrapper for the test: counts every call in xfree_cnt. */
+static inline void
+xfree(void *buf)
+{
+       xfree_cnt++;
+       free(buf);
+}
+
+/*
+ * Fill iocbs[0..num_iocbs-1] with a random workload made of groups of
+ * iocbs that are consecutive in file offset.
+ *
+ * Per group: the opcode is read or write, the start offset is a random
+ * sector below num_secs, and the group is either a single iocb of 1..7
+ * sectors or 1..10 iocbs of 4096 bytes (clipped to the iocbs remaining).
+ * Buffers are either one allocation shared by the whole group (so the
+ * buffers are consecutive too) or, for "sparse" groups, one allocation
+ * per iocb.  io->data records index, head and sparse flags via make_data().
+ *
+ * The sequence of random() calls determines the workload, so statement
+ * order matters here.
+ */
+static void
+randomize_iocbs(struct iocb **iocbs, int num_iocbs, int num_secs)
+{
+       int i, j;
+
+       i = 0;
+       while (i < num_iocbs) {
+               char *buf;
+               short type;
+               int segs, sparse_mem;
+               uint64_t offset, nbytes;
+               
+               type   = (random() % 10 < 5 ? IO_CMD_PREAD : IO_CMD_PWRITE);
+               /* sector number converted to a byte offset */
+               offset = ((random() % num_secs) << 9);
+
+               if (random() % 10 < 4) {
+                       /* single iocb of 1..7 sectors */
+                       segs   = 1;
+                       nbytes = (((random() % 7) + 1) << 9);
+               } else {
+                       /* 1..10 iocbs of 4096 bytes each */
+                       segs   = (random() % 10) + 1;
+                       nbytes = 4096;
+               }
+
+               /* do not run past the end of the iocb array */
+               if (i + segs > num_iocbs)
+                       segs = (num_iocbs - i);
+
+               sparse_mem = (random() % 10 < 2 ? 1 : 0);
+
+               /* sparse: one buffer per iocb; otherwise one buffer for the group */
+               if (sparse_mem)
+                       buf = xalloc(nbytes);
+               else
+                       buf = xalloc(segs * nbytes);
+
+               for (j = 0; j < segs; j++) {
+                       struct iocb *io    = iocbs[i + j];
+                       io->aio_lio_opcode = type;
+                       io->u.c.nbytes     = nbytes;
+                       io->u.c.offset     = offset;
+                       io->u.c.buf        = buf;
+                       offset            += nbytes;
+
+                       io->data = make_data(i + j, (j == 0), sparse_mem);
+
+                       /* next iocb: fresh buffer when sparse, else the following slice */
+                       if (j + 1 < segs && sparse_mem)
+                               buf  = xalloc(nbytes);
+                       else
+                               buf += nbytes;
+               }
+
+               i += segs;
+       }
+}
+
+/*
+ * Pretend that a random, non-empty prefix of iocbs[0..num_iocbs-1] has
+ * completed and fill one event per completed iocb.  Each event reports
+ * either the full byte count or 0.  Returns the number of events filled.
+ */
+static int
+simulate_io(struct iocb **iocbs, struct io_event *events, int num_iocbs)
+{
+       int i;
+       int done = num_iocbs;
+
+       if (num_iocbs > 1)
+               done = (random() % (num_iocbs - 1)) + 1;
+
+       for (i = 0; i < done; i++) {
+               struct iocb *io = iocbs[i];
+               struct io_event *ep = events + i;
+
+               ep->obj = io;
+               if (random() % 10 < 8)
+                       ep->res = io->u.c.nbytes;
+               else
+                       ep->res = 0;
+       }
+
+       return done;
+}
+
+/*
+ * Consume 'num' split events: check that each iocb's data field still
+ * encodes the iocb's own position in iocb_list (the process exits with -1
+ * otherwise), release the buffer owned by the iocb and clear the iocb.
+ */
+static inline void
+process_events(struct opioctx *ctx,
+              struct iocb *iocb_list, struct io_event *events, int num)
+{
+       int i;
+       struct iocb *io;
+
+       for (i = 0; i < num; i++) {
+               io = events[i].obj;
+               print_iocb(ctx, io);
+               if (data_idx(io->data) != (io - iocb_list)) {
+                       /* pointer difference is a ptrdiff_t, hence %td */
+                       printf("corrupt data! data_idx = %d, io = %td\n",
+                              data_idx(io->data), (io - iocb_list));
+                       exit(-1);
+               }
+               /* only group heads and sparse iocbs own an allocation */
+               if (data_is_head(io->data) || data_is_sparse(io->data))
+                       xfree(io->u.c.buf);
+               memset(io, 0, sizeof(struct iocb));
+       }
+}
+
+/*
+ * Reset the test arrays for a new run: zero iocb_list and events, and
+ * point iocbs[i] at iocb_list[i] for every i below num.
+ */
+static inline void
+init_optest(struct iocb *iocb_list,
+           struct iocb **iocbs, struct io_event *events, int num)
+{
+       int i = 0;
+
+       memset(events, 0, num * sizeof(struct io_event));
+       memset(iocb_list, 0, num * sizeof(struct iocb));
+
+       while (i < num) {
+               iocbs[i] = iocb_list + i;
+               i++;
+       }
+}
+
+/*
+ * Self-test driver, built only when TEST is defined.
+ *
+ * Options: -n number of runs, -i iocbs per run, -s disk size in sectors,
+ * -r random seed, -h usage.
+ *
+ * Each run generates a random workload, merges it with io_merge(), then
+ * repeatedly simulates completions, splits them with io_split() and
+ * checks the resulting events, until every iocb has been processed.  The
+ * process exits with -1 when the number of buffer allocations and frees
+ * of a run differ.
+ */
+int
+main(int argc, char **argv)
+{
+       uint64_t num_secs;
+       struct opioctx ctx;
+       struct io_event *events;
+       int i, c, num_runs, num_iocbs, seed;
+       struct iocb *iocb_list, **iocbs, **ioqueue;
+
+       num_runs  = 1;
+       num_iocbs = 300;
+       seed      = time(NULL);
+       num_secs  = ((4ULL << 20) >> 9); /* 4 MiB disk, in 512-byte sectors */
+
+       while ((c = getopt(argc, argv, "n:i:s:r:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       num_runs  = atoi(optarg);
+                       break;
+               case 'i':
+                       num_iocbs = atoi(optarg);
+                       break;
+               case 's':
+                       num_secs  = strtoull(optarg, NULL, 10);
+                       break;
+               case 'r':
+                       seed      = atoi(optarg);
+                       break;
+               case 'h':
+                       usage(); /* does not return */
+                       break;
+               case '?':
+                       fprintf(stderr, "Unrecognized option: -%c\n", optopt);
+                       usage(); /* does not return */
+                       break;
+               }
+       }
+
+       printf("Running %d tests with %d iocbs on %"PRIu64" sectors, seed = %d\n",
+              num_runs, num_iocbs, num_secs, seed);
+
+       /*
+        * The helpers draw from random(), which is seeded by srandom();
+        * srand() only affects rand() and would leave -r without effect.
+        */
+       srandom(seed);
+
+       iocb_list = malloc(num_iocbs * sizeof(struct iocb));
+       iocbs     = malloc(num_iocbs * sizeof(struct iocb *));
+       events    = malloc(num_iocbs * sizeof(struct io_event));
+
+       if (!iocb_list || !iocbs || !events || opio_init(&ctx, num_iocbs)) {
+               fprintf(stderr, "initialization failed\n");
+               exit(ENOMEM);
+       }
+
+       for (i = 0; i < num_runs; i++) {
+               int op_rem, op_done, num_split, num_events, num_done;
+
+               ioqueue = iocbs;
+               init_optest(iocb_list, ioqueue, events, num_iocbs);
+               randomize_iocbs(ioqueue, num_iocbs, num_secs);
+               print_iocbs(&ctx, ioqueue, num_iocbs);
+
+               op_done  = 0;
+               num_done = 0;
+               op_rem   = io_merge(&ctx, ioqueue, num_iocbs);
+               print_iocbs(&ctx, ioqueue, op_rem);
+               print_merged_iocbs(&ctx, ioqueue, op_rem);
+
+               /* op_* count merged iocbs, num_done counts original iocbs */
+               while (num_done < num_iocbs) {
+                       DBG(&ctx, "optimized remaining: %d\n", op_rem);
+
+                       DBG(&ctx, "simulating\n");
+                       num_events = simulate_io(ioqueue + op_done, events, op_rem);
+                       print_events(&ctx, events, num_events);
+
+                       DBG(&ctx, "splitting %d\n", num_events);
+                       num_split = io_split(&ctx, events, num_events);
+                       print_events(&ctx, events, num_split);
+
+                       DBG(&ctx, "processing %d\n", num_split);
+                       process_events(&ctx, iocb_list, events, num_split);
+
+                       op_rem   -= num_events;
+                       op_done  += num_events;
+                       num_done += num_split;
+               }
+
+               DBG(&ctx, "run %d: processed: %d, xallocs: %d, xfrees: %d\n",
+                   i, num_done, xalloc_cnt, xfree_cnt);
+               if (xalloc_cnt != xfree_cnt)
+                       exit(-1);
+               xalloc_cnt = xfree_cnt = 0;
+       }
+
+       free(iocbs);
+       free(events);
+       free(iocb_list);
+       opio_free(&ctx);
+
+       return 0;
+}
+#endif
diff --git a/tools/blktap2/drivers/io-optimize.h b/tools/blktap2/drivers/io-optimize.h
new file mode 100644 (file)
index 0000000..9a0d86b
--- /dev/null
@@ -0,0 +1,68 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __IO_OPTIMIZE_H__
+#define __IO_OPTIMIZE_H__
+
+#include <libaio.h>
+
+struct opio;
+
+/*
+ * Ends of a chain of merged opios, kept in the opio that heads the chain.
+ * io-optimize.c appends new members through 'tail'.
+ */
+struct opio_list {
+       struct opio        *head;
+       struct opio        *tail;
+};
+
+/*
+ * Bookkeeping for one iocb while it is tracked by the optimizer: a copy
+ * of the iocb fields that get overwritten, plus the links that tie merged
+ * iocbs together.
+ */
+struct opio {
+       char               *buf;      /* saved iocb->u.c.buf */
+       unsigned long       nbytes;   /* saved iocb->u.c.nbytes */
+       long long           offset;   /* saved iocb->u.c.offset */
+       void               *data;     /* saved iocb->data */
+       struct iocb        *iocb;     /* the iocb this opio tracks */
+       struct io_event     event;    /* NOTE(review): not referenced by the visible io-optimize.c code */
+       struct opio        *head;     /* opio heading the chain this opio was merged into */
+       struct opio        *next;     /* next opio in the chain, NULL at the end */
+       struct opio_list    list;     /* chain ends; maintained on the heading opio */
+};
+
+/* Optimizer state: the opio pool and the scratch arrays, set up by opio_init(). */
+struct opioctx {
+       int                 num_opios;      /* number of entries in opios */
+       int                 free_opio_cnt;  /* number of valid entries in free_opios */
+       struct opio        *opios;          /* the opio pool */
+       struct opio       **free_opios;     /* stack of currently unused opios */
+       struct iocb       **iocb_queue;     /* scratch copy used by io_merge() and io_expand_iocbs() */
+       struct io_event    *event_queue;    /* scratch copy used by io_split() */
+};
+
+/* Set up ctx for up to num_iocbs iocbs; 0 on success, negative errno value on failure. */
+int opio_init(struct opioctx *ctx, int num_iocbs);
+/* Release everything opio_init() allocated in ctx. */
+void opio_free(struct opioctx *ctx);
+/* Merge adjacent iocbs of queue in place; returns the number of entries left. */
+int io_merge(struct opioctx *ctx, struct iocb **queue, int num);
+/* Expand events of merged iocbs into per-iocb events in place; returns the new count. */
+int io_split(struct opioctx *ctx, struct io_event *events, int num);
+/* Rebuild queue from entries idx..num-1 with merged iocbs expanded; returns the new count. */
+int io_expand_iocbs(struct opioctx *ctx, struct iocb **queue, int idx, int num);
+
+#endif
diff --git a/tools/blktap2/drivers/libaio-compat.h b/tools/blktap2/drivers/libaio-compat.h
new file mode 100644 (file)
index 0000000..ca9ff45
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2010, XenSource Inc.
+ * All rights reserved.
+ *
+ * This  library is  free  software; you  can  redistribute it  and/or
+ * modify it under the terms  of the GNU Lesser General Public License
+ * as published by  the Free Software Foundation; either  version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT  ANY  WARRANTY;  without   even  the  implied  warranty  of
+ * MERCHANTABILITY or  FITNESS FOR A PARTICULAR PURPOSE.   See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should  have received a copy  of the GNU  Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * kernel 2.6.21 added eventfd(2) support, kernel 2.6.22 eventfds for
+ * aio. libaio 0.3.107 updated the header file, but few systems have
+ * it. define a custom iocb_common struct instead, and work around a
+ * potentially missing sys/eventfd.h. this header should vanish over
+ * time.
+ */
+
+#ifndef __LIBAIO_COMPAT
+#define __LIBAIO_COMPAT
+
+#include "../../config.h"
+#include <libaio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+/*
+ * Stand-in for libaio's iocb "common" layout, used by __io_set_eventfd() to
+ * reach the flags and resfd members on systems whose libaio header predates
+ * them (see the comment at the top of this file).
+ * NOTE(review): the two 8-byte pads presumably cover libaio's padded buf and
+ * nbytes members; the layout must match the installed libaio exactly -
+ * confirm against <libaio.h>.
+ */
+struct __compat_io_iocb_common {
+       char             __pad_buf[8];
+       char             __pad_nbytes[8];
+       long long       offset;
+       long long       __pad3;
+       unsigned        flags;
+       unsigned        resfd;
+};
+
+/*
+ * Attach @eventfd to @iocb: view iocb->u.c through the compat layout, store
+ * the descriptor in resfd and set bit 0 of flags.
+ * NOTE(review): bit 0 is presumably the kernel's IOCB_FLAG_RESFD - confirm.
+ */
+static inline void __io_set_eventfd(struct iocb *iocb, int eventfd)
+{
+       struct __compat_io_iocb_common *common =
+               (struct __compat_io_iocb_common *)&iocb->u.c;
+
+       common->resfd = eventfd;
+       common->flags = common->flags | (1 << 0);
+}
+
+#ifdef HAVE_SYS_EVENTFD_H
+
+#include <sys/eventfd.h>
+
+static inline int tapdisk_sys_eventfd(int initval)
+{
+       return eventfd(initval, 0);
+}
+
+#else /* Fallback */
+#ifndef SYS_eventfd
+#ifndef __NR_eventfd
+# if defined(__alpha__)
+#  define __NR_eventfd         478
+# elif defined(__arm__)
+#  define __NR_eventfd         (__NR_SYSCALL_BASE+351)
+# elif defined(__ia64__)
+#  define __NR_eventfd         1309
+# elif defined(__i386__)
+#  define __NR_eventfd         323
+# elif defined(__m68k__)
+#  define __NR_eventfd         319
+# elif 0 && defined(__mips__)
+#  error __NR_eventfd?
+#  define __NR_eventfd         (__NR_Linux + 319)
+#  define __NR_eventfd         (__NR_Linux + 278)
+#  define __NR_eventfd         (__NR_Linux + 282)
+# elif defined(__hppa__)
+#  define __NR_eventfd         (__NR_Linux + 304)
+# elif defined(__PPC__) || defined(__powerpc64__)
+#  define __NR_eventfd         307
+# elif defined(__s390__) || defined(__s390x__)
+#  define __NR_eventfd         318
+# elif defined(__sparc__)
+#  define __NR_eventfd         313
+# elif defined(__x86_64__)
+#  define __NR_eventfd         284
+# endif
+#else
+# error __NR_eventfd?
+#endif
+#define SYS_eventfd __NR_eventfd
+#endif
+
+static inline int tapdisk_sys_eventfd(int initval)
+{
+       return syscall(SYS_eventfd, initval, 0);
+}
+#endif
+
+#endif /* __LIBAIO_COMPAT */
diff --git a/tools/blktap2/drivers/lock.c b/tools/blktap2/drivers/lock.c
new file mode 100644 (file)
index 0000000..6da5f61
--- /dev/null
@@ -0,0 +1,1000 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This module implements a "dot locking" style advisory file locking algorithm.
+ */
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <dirent.h>
+#include <limits.h>
+#include "lock.h"
+
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+/* format: xenlk.hostname.uuid.<xf><rw>*/
+#define LF_POSTFIX ".xenlk"
+#define LFXL_FORMAT LF_POSTFIX ".%s.%s.x%s"
+#define LFFL_FORMAT LF_POSTFIX ".%s.%s.f%s"
+#define RETRY_MAX 16
+
+#if defined(LOGS)
+#define LOG(format, args...) printf("%d: ", __LINE__); printf(format, ## args)
+#else
+#define LOG(format, args...)
+#endif
+
+/* random wait - up to .5 seconds */
+#define XSLEEP usleep(random() & 0x7ffff)
+
+typedef int (*eval_func)(char *name, int readonly);
+
+/*
+ * Build the name of the exclusive lock file: fn_to_lock followed by
+ * LF_POSTFIX.  Returns a malloc'd string owned by the caller, or 0 when the
+ * allocation fails.
+ */
+static char *create_lockfn(char *fn_to_lock)
+{
+        size_t base_len = strlen(fn_to_lock);
+        size_t postfix_len = strlen(LF_POSTFIX);
+        char *name = malloc(base_len + postfix_len + 1);
+
+        if (unlikely(name == 0))
+                return 0;
+
+        /* second copy includes the postfix's terminating NUL */
+        memcpy(name, fn_to_lock, base_len);
+        memcpy(name + base_len, LF_POSTFIX, postfix_len + 1);
+
+        return name;
+}
+
+/*
+ * Build a per-owner lock file name: fn_to_lock followed by @format expanded
+ * with the local hostname, @uuid and "r" (readonly) or "w".  @format is one
+ * of LFXL_FORMAT / LFFL_FORMAT.
+ *
+ * Returns a malloc'd string owned by the caller, or 0 if the hostname cannot
+ * be read, the allocation fails, or the expansion does not fit the buffer.
+ */
+static char *create_lockfn_link(char *fn_to_lock, char *format, 
+                                char *uuid, int readonly)
+{
+        char hostname[128];
+        char *lockfn_link;
+        size_t base_len;
+        size_t size;
+        int written;
+
+        /* get hostname */
+        if (unlikely(gethostname(hostname, sizeof(hostname)) == -1)) {
+                return 0;
+        }
+        /* a truncated name is not guaranteed to be NUL-terminated */
+        hostname[sizeof(hostname) - 1] = '\0';
+
+        /* allocate string to hold constructed lock file link */
+        base_len = strlen(fn_to_lock);
+        size = base_len + strlen(LF_POSTFIX) +
+               strlen(hostname) + strlen(uuid) + 8;
+        lockfn_link = malloc(size);
+        if (unlikely(!lockfn_link)) {
+                return 0;
+        }
+
+        /* construct lock file link with specific format */
+        memcpy(lockfn_link, fn_to_lock, base_len);
+        written = snprintf(lockfn_link + base_len, size - base_len, format,
+                           hostname, uuid, readonly ? "r" : "w");
+        if (unlikely(written < 0 || (size_t)written >= size - base_len)) {
+                /* never hand back a truncated (and therefore wrong) name */
+                free(lockfn_link);
+                return 0;
+        }
+
+        return lockfn_link;
+}
+
+/*
+ * Obtain a "now" timestamp from the filesystem that holds @fn: create a
+ * scratch file named "<fn>.xenNNNNNNNN.tmp", lstat() it into @statnow and
+ * remove it again.
+ * NOTE(review): going by the name, the point is to compare lock file mtimes
+ * against the file server's clock rather than the local one - confirm.
+ *
+ * Also reseeds random() from the time and pid.
+ *
+ * Returns LOCK_OK on success, otherwise LOCK_ENOMEM, LOCK_EOPEN or
+ * LOCK_ESTAT; for the latter two *reterrno holds the errno of the failing
+ * call, otherwise it is 0.
+ */
+static int NFSnormalizedStatTime(char *fn, struct stat *statnow, int *reterrno)
+{
+        int result = LOCK_OK;
+        int uniq;
+        char *buf = 0;
+        int fd;
+        int pid = (int)getpid();
+        int clstat;
+
+        *reterrno = 0;
+
+        /* create file to normalize time */
+        srandom((int)time(0) ^ pid);
+        uniq = random() % 0xffffff;
+        buf = malloc(strlen(fn) + 24);
+        if (unlikely(!buf)) { result = LOCK_ENOMEM; goto finish; }
+
+        strcpy(buf, fn);
+        sprintf(buf + strlen(buf), ".xen%08d.tmp", uniq);
+
+        fd = open(buf, O_WRONLY | O_CREAT, 0644);
+        if (fd == -1) { *reterrno = errno; result = LOCK_EOPEN; goto finish; }
+        clstat = close(fd);
+        if (unlikely(clstat == -1)) {
+                LOG("fail on close\n");
+        }
+        if (lstat(buf, statnow) == -1) {
+                /* record errno before unlink() gets a chance to change it */
+                *reterrno = errno;
+                unlink(buf);
+                result = LOCK_ESTAT;
+                goto finish;
+        }
+        unlink(buf);
+
+finish:
+        free(buf);
+        return result;
+}
+
+/*
+ * eval_func: report whether lock file name @name denotes a writer lock,
+ * i.e. whether its last character is 'w'.  @readonly is ignored.  An empty
+ * name is never a match.
+ */
+static int writer_eval(char *name, int readonly) 
+{
+        size_t len = strlen(name);
+
+        (void)readonly;
+        return len > 0 && name[len - 1] == 'w';
+}
+
+/*
+ * eval_func: report whether lock file name @name is a reader lock that
+ * conflicts with the request, i.e. its last character is 'r' and the
+ * requester is not itself read-only.  An empty name is never a match.
+ */
+static int reader_eval(char *name, int readonly) 
+{
+        size_t len = strlen(name);
+
+        return len > 0 && name[len - 1] == 'r' && !readonly;
+}
+
+/*
+ * Scan the directory containing @lockfn for lock files that belong to other
+ * owners of @fn.
+ *
+ * An entry is a candidate when its name starts with the base name of @fn
+ * and is not @fn, @lockfn or @lockfn_link itself.
+ * NOTE(review): this prefix test also matches unrelated files that merely
+ * share the prefix; with @force set they are unlinked too - confirm this is
+ * acceptable for the directories in use.
+ *
+ * fn, lockfn, lockfn_link - target file, exclusive lock name, own final lock
+ * force    - non-zero: unlink every candidate instead of evaluating it
+ * readonly - forwarded to @eval
+ * stole    - out: set to 1 when at least one candidate was unlinked
+ * eval     - predicate deciding whether a candidate conflicts
+ * elt      - out: lease time parsed from the first candidate read (the
+ *            digits after the last '.' in its contents); reset to 0 when
+ *            forcing
+ * ioerror  - out: errno-style code when the scan itself failed, else 0
+ *
+ * Returns 1 when a conflicting lock exists or *ioerror is set, else 0.
+ */
+static int lock_holder(char *fn, char *lockfn, char *lockfn_link, 
+                       int force, int readonly, int *stole, eval_func eval,
+                       int *elt, int *ioerror)
+{
+        int status = 0;
+        int ustat;
+        DIR *pd = 0;
+        struct dirent *dptr;
+        char *ptr;
+        /* +2: room for "." or "/" plus the terminator, whatever lockfn is */
+        char *dirname = malloc(strlen(lockfn) + 2);
+        char *uname = 0;
+        char *fn_base;
+        char *lockfn_base;
+        char *link_base;
+        size_t fn_base_len;
+        int elt_established = 0;
+        int fd;
+        char tmpbuf[4096];
+
+        *stole = 0;
+        *ioerror = 0;
+        *elt = 0;
+
+        if (!dirname) {
+                /* an unscannable directory must not read as "lock is free" */
+                *ioerror = ENOMEM;
+                goto finish;
+        }
+
+        /* get directory */
+        ptr = strrchr(lockfn, '/');
+        if (!ptr) {
+                strcpy(dirname, ".");
+        } else {
+                size_t numbytes = (size_t)(ptr - lockfn);
+                if (numbytes == 0)
+                        numbytes = 1;   /* file directly under "/" */
+                memcpy(dirname, lockfn, numbytes);
+                dirname[numbytes] = '\0';
+        }
+        pd = opendir(dirname); 
+        if (!pd) {
+                *ioerror = errno ? errno : EIO;
+                goto finish;
+        }
+
+        /* base names are loop invariant */
+        ptr = strrchr(fn, '/');
+        fn_base = ptr ? ptr + 1 : fn;
+        ptr = strrchr(lockfn, '/');
+        lockfn_base = ptr ? ptr + 1 : lockfn;
+        ptr = strrchr(lockfn_link, '/');
+        link_base = ptr ? ptr + 1 : lockfn_link;
+        fn_base_len = strlen(fn_base);
+
+        /* 
+         * scan through directory entries and use eval function 
+         * if we have a match (i.e. reader or writer lock) but
+         * note that if we are forcing, we will remove any and
+         * all locks that appear for target of our lock, regardless
+         * if it a reader/writer owns the lock.
+         */
+        errno = 0;
+        dptr = readdir(pd);
+        if (!dptr) {
+            *ioerror = EIO;
+        }
+        while (dptr) {
+                if (strcmp(dptr->d_name, fn_base) &&
+                    strcmp(dptr->d_name, lockfn_base) &&
+                    strcmp(dptr->d_name, link_base) &&
+                    !strncmp(dptr->d_name, fn_base, fn_base_len)) {
+                        /* size the path from this entry: other owners'
+                           names may be longer than our own link name */
+                        free(uname);
+                        uname = malloc(strlen(dirname) +
+                                       strlen(dptr->d_name) + 2);
+                        if (!uname) {
+                                *ioerror = ENOMEM;
+                                status = 1;
+                                goto finish;
+                        }
+                        strcpy(uname, dirname);
+                        strcat(uname, "/");
+                        strcat(uname, dptr->d_name);
+                        if (!elt_established) {
+                            /* read final lock file and extract lease time */
+                            fd = open(uname, O_RDONLY, 0644); 
+                            if (fd == -1) {
+                                    *ioerror = errno ? errno : EIO;
+                                    status = 1;
+                                    goto finish;
+                            }
+                            memset(tmpbuf, 0, sizeof(tmpbuf));
+                            /* leave the last byte 0 so strrchr() is bounded */
+                            if (read(fd, tmpbuf, sizeof(tmpbuf) - 1) < 0) {
+                                    *ioerror = errno ? errno : EIO;
+                                    status = 1;
+                                    close(fd);
+                                    goto finish;
+                            }
+                            close(fd);
+                            ptr = strrchr(tmpbuf, '.');
+                            if (ptr) {
+                                *elt = atoi(ptr+1);
+                                elt_established = 1;
+                            }
+                        }
+                        if (force) {
+                                ustat = unlink(uname);
+                                if (ustat == -1) {
+                                        LOG("failed to unlink %s\n", uname);
+                                }
+                                *stole = 1;
+                                *elt = 0;
+                        } else {
+                                if ((*eval)(dptr->d_name, readonly)) {
+                                        status = 1;
+                                        goto finish;
+                                }
+                        }
+                }
+                /* readdir() reports errors only through errno, so clear any
+                   value left behind by open()/unlink() above */
+                errno = 0;
+                dptr = readdir(pd);
+                if (!dptr && errno) {
+                    *ioerror = EIO;
+                }
+        }
+
+finish:
+        if (pd) {
+                closedir(pd);
+        }
+        free(dirname);
+        free(uname);
+
+        /* if IO error, force a taken status */
+        return (*ioerror) ? 1 : status;
+}
+
+/*
+ * Take, or reassert, a reader/writer lock on @fn_to_lock for owner @uuid.
+ *
+ * Phase 1 (try_again .. finish) acquires a short-lived exclusive lock file
+ * (fn_to_lock + LF_POSTFIX): it is created with O_EXCL, our link name is
+ * written into it, and a hard link to it is compared by inode.
+ * Phase 2 (finish) runs only while *retstatus is still 0: either an already
+ * existing final lock of ours is detected (reassert), or lock_holder() is
+ * used to look for conflicting writers, then readers.
+ * Phase 3 (skip_scan) runs while *retstatus >= 0: the final per-owner lock
+ * file is (re)written with its own name followed by ".<lease>".  After
+ * that the exclusive lock file is removed unconditionally.
+ *
+ * force      - non-zero: remove other holders' lock files instead of failing,
+ *              then sleep for the lease time before returning
+ * readonly   - non-zero asks for a reader lock, zero for a writer lock
+ * lease_time - in: lease written into the final lock file; out: lease read
+ *              from an existing lock, or -1 when *retstatus is negative
+ * retstatus  - out: LOCK_* code, or 1 when an existing lock was reasserted
+ *
+ * Returns the local status value: 0 on the success paths, otherwise usually
+ * an errno value (EINVAL, ENOMEM, EIO, ...).
+ */
+int lock(char *fn_to_lock, char *uuid, int force, int readonly, int *lease_time, int *retstatus)
+{
+        char *lockfn = 0;
+        char *lockfn_xlink = 0;
+        char *lockfn_flink = 0;
+        char *buf = 0;
+        int fd;
+        int status = 0;
+        struct stat stat1, stat2;
+        int retry_attempts = 0;
+        int clstat;
+        int tmpstat;
+        int stealx = 0;
+        int stealw = 0;
+        int stealr = 0;
+        int established_lease_time = 0;
+        char tmpbuf[4096];
+        int ioerr;
+    
+        if (!fn_to_lock || !uuid) {
+                *retstatus = LOCK_EBADPARM;
+                return EINVAL;
+        }
+
+        *retstatus = 0;
+
+        /* seed random with time/pid combo */
+        srandom((int)time(0) ^ getpid());
+
+        /* build lock file strings */
+        lockfn = create_lockfn(fn_to_lock);
+        if (unlikely(!lockfn)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; }
+
+        lockfn_xlink = create_lockfn_link(fn_to_lock, LFXL_FORMAT, 
+                                          uuid, readonly);
+        if (unlikely(!lockfn_xlink)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; }
+
+        lockfn_flink = create_lockfn_link(fn_to_lock, LFFL_FORMAT, uuid, 
+                                          readonly);
+        if (unlikely(!lockfn_flink)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; }
+
+try_again:
+        if (retry_attempts++ > RETRY_MAX) {
+                /* out of retries: if the exclusive lock file looks abandoned
+                   (older than the default lease), remove it and start over */
+                if (*retstatus == LOCK_EXLOCK_OPEN) {
+                        struct stat statnow, stat_exlock;
+                        int diff;
+
+                        if (lstat(lockfn, &stat_exlock) == -1) {
+                                goto finish;
+                        }
+                
+                        if (NFSnormalizedStatTime(fn_to_lock, &statnow, &ioerr)) {
+                                goto finish;
+                        }
+
+                        diff = (int)statnow.st_mtime - (int)stat_exlock.st_mtime;
+                        if (diff > DEFAULT_LEASE_TIME_SECS) {
+                                unlink(lockfn);
+                                retry_attempts = 0;
+                                goto try_again;
+                        }
+                }
+                goto finish;
+        }
+
+        /* try to open exclusive lockfile */
+        fd = open(lockfn, O_WRONLY | O_CREAT | O_EXCL, 0644); 
+        if (fd == -1) {
+                LOG("Initial lockfile creation failed %s force=%d, errno=%d\n",
+                     lockfn, force, errno);
+                if (errno == EIO) {
+                       *retstatus = LOCK_EXLOCK_OPEN;
+                       status = EIO;
+                       goto finish;
+                }
+                /* already owned? (hostname & uuid match, skip time bits) */
+                errno = 0;
+                fd = open(lockfn, O_RDWR, 0644);
+                if (fd != -1) {
+                        buf = malloc(strlen(lockfn_xlink)+1);
+                        if (!buf) {
+                                clstat = close(fd);
+                                if (unlikely(clstat == -1)) {
+                                        LOG("fail on close\n");
+                                }
+                                *retstatus = LOCK_ENOMEM;
+                                status = ENOMEM;
+                                goto finish;
+                        }
+                        if (read(fd, buf, strlen(lockfn_xlink)) !=
+                           (strlen(lockfn_xlink))) {
+                                clstat = close(fd);
+                                if (unlikely(clstat == -1)) {
+                                        LOG("fail on close\n");
+                                }
+                                free(buf);
+                                goto force_lock;
+                        }
+                        /* compare all but the trailing r/w character */
+                        if (!strncmp(buf, lockfn_xlink, strlen(lockfn_xlink)-1)) {
+                                LOG("lock owned by us, reasserting\n");
+                                /* our lock, reassert by rewriting below */
+                                if (lseek(fd, 0, SEEK_SET) == -1) {
+                                        clstat = close(fd);
+                                        if (unlikely(clstat == -1)) {
+                                                LOG("fail on close\n");
+                                        }
+                                        /* buf is re-allocated on the next
+                                           attempt, so release it here */
+                                        free(buf);
+                                        goto force_lock;
+                                }
+                                free(buf);
+                                goto skip;
+                        }
+                        free(buf);
+                        clstat = close(fd);
+                        if (unlikely(clstat == -1)) {
+                                LOG("fail on close\n");
+                        }
+                }
+force_lock:
+                if (errno == EIO) {
+                       *retstatus = LOCK_EXLOCK_OPEN;
+                       status = EIO;
+                       goto finish;
+                }
+                if (force) {
+                        /* remove lock file, we are forcing lock, try again */
+                        status = unlink(lockfn);
+                        if (unlikely(status == -1)) {
+                                if (errno == EIO) {
+                                       *retstatus = LOCK_EXLOCK_OPEN;
+                                       status = EIO;
+                                       goto finish;
+                                }
+                                LOG("force removal of %s lockfile failed, "
+                                    "errno=%d, trying again\n", lockfn, errno);
+                        }
+                        stealx = 1;
+                }
+                XSLEEP;
+                *retstatus = LOCK_EXLOCK_OPEN;
+                goto try_again;
+        }
+
+        LOG("lockfile created %s\n", lockfn);
+
+skip:
+        /* 
+         * write into the temporary xlock
+         */
+        if (write(fd, lockfn_xlink, strlen(lockfn_xlink)) != 
+                strlen(lockfn_xlink)) {
+                if (errno == EIO) {
+                       *retstatus = LOCK_EXLOCK_WRITE;
+                       status = EIO;
+                       goto finish;
+                }
+                status = errno;
+                clstat = close(fd);
+                if (unlikely(clstat == -1)) {
+                        LOG("fail on close\n");
+                }
+                XSLEEP;
+                *retstatus = LOCK_EXLOCK_WRITE;
+                if (unlink(lockfn) == -1)  {
+                        LOG("removal of %s lockfile failed, "
+                            "errno=%d, trying again\n", lockfn, errno);
+                }
+                goto try_again;
+        }
+        clstat = close(fd);
+        if (unlikely(clstat == -1)) {
+                LOG("fail on close\n");
+        }
+
+        while (retry_attempts++ < RETRY_MAX) {
+                /* a link() result is not trusted on its own; ownership is
+                   decided by comparing inodes below */
+                tmpstat = link(lockfn, lockfn_xlink);
+                LOG("linking %s and %s\n", lockfn, lockfn_xlink);
+                if ((tmpstat == -1) && (errno != EEXIST)) { 
+                        LOG("link status is %d, errno=%d\n", tmpstat, errno); 
+                }
+
+                if ((lstat(lockfn, &stat1) == -1) || 
+                    (lstat(lockfn_xlink, &stat2) == -1)) {
+                        /* try again, cleanup first */
+                        tmpstat = unlink(lockfn);
+                        if (unlikely(tmpstat == -1)) {
+                                LOG("error removing lock file %s", lockfn);
+                        }
+                        tmpstat = unlink(lockfn_xlink);
+                        if (unlikely(tmpstat == -1)) {
+                                LOG("error removing linked lock file %s", 
+                                    lockfn_xlink);
+                        }
+                        XSLEEP;
+                        status = LOCK_ESTAT;
+                        goto finish;
+                }
+
+                /* compare inodes */
+                if (stat1.st_ino == stat2.st_ino) {
+                        /* success, inodes are the same */
+                        /* should we check that st_nlink's are also 2?? */
+                        *retstatus = LOCK_OK;
+                        status = 0;
+                        tmpstat = unlink(lockfn_xlink);
+                        if (unlikely(tmpstat == -1)) {
+                                LOG("error removing linked lock file %s", 
+                                    lockfn_xlink);
+                        }
+                        goto finish;
+                } else {
+                       status = errno;
+                        /* try again, cleanup first */
+                        tmpstat = unlink(lockfn);
+                        if (unlikely(tmpstat == -1)) {
+                                LOG("error removing lock file %s", lockfn);
+                        }
+                        tmpstat = unlink(lockfn_xlink);
+                        if (unlikely(tmpstat == -1)) {
+                                LOG("error removing linked lock file %s", 
+                                    lockfn_xlink);
+                        }
+                        XSLEEP;
+                        *retstatus = LOCK_EINODE;
+                        goto try_again;
+                }
+        }
+
+finish:
+        if (!*retstatus) {
+
+                /* we have exclusive lock */
+
+                status = 0;
+
+                /* fast check, see if we own a final lock and are reasserting */
+                if (!lstat(lockfn_flink, &stat1)) {
+                        char *ptr;
+
+                        /* set the return value to notice this is a reassert */
+                        *retstatus = 1; 
+
+                        /* read existing lock file and extract 
+                           established lease time */
+                        fd = open(lockfn_flink, O_RDONLY, 0644); 
+                        memset(tmpbuf, 0, sizeof(tmpbuf));
+                        /* keep the final byte 0 so strrchr() stays in bounds */
+                        if (read(fd, tmpbuf, sizeof(tmpbuf) - 1) < 0) {
+                                if (errno == EIO) {
+                                        close(fd);
+                                        *retstatus = LOCK_EINODE;
+                                        status = EIO;
+                                        goto skip_scan;
+                                }
+                        }
+                        close(fd);
+                        ptr = strrchr(tmpbuf, '.');
+                        if (ptr) {
+                            *lease_time = atoi(ptr+1);
+                        } else {
+                            *lease_time = 10; /* wkchack */
+                        }
+                        goto skip_scan;
+                } else {
+                       if (errno == EIO) {
+                               *retstatus = LOCK_EINODE;
+                               status = EIO;
+                               goto skip_scan;
+                       }
+                }
+
+                /* we allow exclusive writer, or multiple readers */
+                if (lock_holder(fn_to_lock, lockfn, lockfn_flink, force,
+                                     readonly, &stealw, writer_eval, 
+                                     &established_lease_time, &ioerr)) {
+                        if (ioerr) {
+                            *retstatus = LOCK_EREAD;
+                            status = ioerr;
+                            goto skip_scan;
+                        }
+                        *retstatus = LOCK_EHELD_WR;
+                } else if (lock_holder(fn_to_lock, lockfn, lockfn_flink, force,
+                                     readonly, &stealr, reader_eval, 
+                                     &established_lease_time, &ioerr)) {
+                        if (ioerr) {
+                            *retstatus = LOCK_EREAD;
+                            status = ioerr;
+                            goto skip_scan;
+                        }
+                        *retstatus = LOCK_EHELD_RD;
+                }
+                if (established_lease_time) *lease_time = 
+                                                 established_lease_time;
+        }
+
+skip_scan:
+        if (*retstatus >= 0) {
+                /* update file, changes last modify time */
+                fd = open(lockfn_flink, O_WRONLY | O_CREAT, 0644); 
+                if (fd == -1) {
+                        *retstatus = LOCK_EOPEN;
+                        status = errno;
+                } else {
+                        char tmpbuf[32];
+                        int failed_write;
+                        memset(tmpbuf, 0, sizeof(tmpbuf));
+                        sprintf(tmpbuf, ".%d", *lease_time);
+                        failed_write = write(fd, lockfn_flink, 
+                                             strlen(lockfn_flink)) != 
+                                       strlen(lockfn_flink);
+                        if (failed_write) status = errno;
+                        failed_write |= write(fd, tmpbuf, strlen(tmpbuf)) != 
+                                       strlen(tmpbuf);
+                        if (failed_write) status = errno;
+                        if (failed_write) {
+                                clstat = close(fd);
+                                if (unlikely(clstat == -1)) {
+                                        LOG("fail on close\n");
+                                }
+                                XSLEEP;
+                                *retstatus = LOCK_EUPDATE;
+                                goto try_again;
+                        }
+                }
+                clstat = close(fd);
+                if (unlikely(clstat == -1)) {
+                        LOG("fail on close\n");
+                }
+        }
+
+        if (!*retstatus && force && (stealx || stealw || stealr)) {
+                struct timeval timeout;
+
+                /* enforce quiet time on steal */
+                timeout.tv_sec = *lease_time;
+                timeout.tv_usec = 0;
+                select(0, 0, 0, 0, &timeout);
+        }
+
+        /* remove exclusive lock, final read/write locks will hold */
+        /* NOTE(review): this also runs when the exclusive lock was never
+           obtained (e.g. retries exhausted) - confirm that is intended */
+        if (lockfn) {
+                tmpstat = unlink(lockfn);
+                if (unlikely(tmpstat == -1)) {
+                        LOG("error removing exclusive lock file %s", 
+                            lockfn);
+                }
+        }
+
+        free(lockfn);
+        free(lockfn_xlink);
+        free(lockfn_flink);
+
+        /* set lease time to -1 if error, so no one is apt to use it */
+        if (*retstatus < 0) *lease_time = -1;
+
+        LOG("returning status %d, errno=%d\n", status, errno);
+        return status;
+}
+
+
+/*
+ * Release the final lock held by owner @uuid on @fn_to_unlock by removing
+ * its per-owner lock file.  @readonly must match the mode the lock was taken
+ * with, since it is part of the lock file name.
+ *
+ * *status receives LOCK_OK, LOCK_EBADPARM (NULL argument), LOCK_ENOMEM
+ * (name could not be built) or LOCK_ENOLOCK (unlink failed).
+ *
+ * Returns the errno of the failed unlink(), and 0 in every other case.
+ */
+int unlock(char *fn_to_unlock, char *uuid, int readonly, int *status)
+{
+        char *lockfn_link = 0;
+        int reterrno = 0;
+
+        if (!fn_to_unlock || !uuid) {
+                *status = LOCK_EBADPARM;
+                return 0;
+        }
+
+        lockfn_link = create_lockfn_link(fn_to_unlock, LFFL_FORMAT, uuid, 
+                                         readonly);
+        if (unlikely(!lockfn_link)) { *status = LOCK_ENOMEM; goto finish; }
+
+        if (unlink(lockfn_link) == -1) {
+                /* capture errno first: LOG() may call printf() */
+                reterrno = errno;
+                LOG("error removing linked lock file %s", lockfn_link);
+                *status = LOCK_ENOLOCK;
+                goto finish;
+        }
+
+        *status = LOCK_OK;
+
+finish:
+        free(lockfn_link);
+        return reterrno;
+}
+
+/*
+ * Report how recently any lock file for @fn was updated.
+ *
+ * Every directory entry whose name starts with the base name of @fn (and is
+ * not @fn itself) is lstat()ed; its age is the difference between the
+ * timestamp from NFSnormalizedStatTime() and the entry's mtime, clamped at 0.
+ *
+ * ret_lease - out: smallest age in seconds, or a negative LOCK_* code
+ *             (LOCK_ENOLOCK when nothing matched)
+ * max_lease - out: lease time parsed from the first matching file that
+ *             contains one (digits after the last '.'); -1 on error
+ *
+ * Returns 0, or the errno of a failed opendir()/lstat()/scratch-file call.
+ */
+int lock_delta(char *fn, int *ret_lease, int *max_lease)
+{
+        int reterrno = 0;
+        DIR *pd = 0;
+        struct dirent *dptr;
+        char *ptr;
+        size_t ptr_len;
+        int result = INT_MAX;
+        struct stat statbuf, statnow;
+        char *dirname = 0;
+        char *fpath = 0;
+        int elt_established = 0;
+        char *dotptr;
+        char tmpbuf[4096];
+        int fd;
+
+        /* validate fn before measuring it; +2 covers "." or "/" plus NUL */
+        if (fn) {
+                dirname = malloc(strlen(fn) + 2);
+        }
+        if (!fn || !dirname) {
+                *ret_lease = LOCK_EBADPARM;
+                *max_lease = -1;
+                return 0;
+        }
+        
+        if (NFSnormalizedStatTime(fn, &statnow, &reterrno)) {
+                result = LOCK_ESTAT;
+                goto finish;
+        }
+
+        /* get directory */
+        ptr = strrchr(fn, '/');
+        if (!ptr) {
+                strcpy(dirname, ".");
+                ptr = fn;
+        } else {
+                size_t numbytes = (size_t)(ptr - fn);
+                if (numbytes == 0)
+                        numbytes = 1;   /* file directly under "/" */
+                memcpy(dirname, fn, numbytes);
+                dirname[numbytes] = '\0';
+                ptr += 1;
+        }
+        ptr_len = strlen(ptr);
+
+        pd = opendir(dirname); 
+        if (!pd) { reterrno = errno; goto finish; }
+
+        while ((dptr = readdir(pd)) != 0) {
+                int diff;
+
+                if (!strcmp(dptr->d_name, ptr) ||
+                    strncmp(dptr->d_name, ptr, ptr_len)) {
+                        continue;
+                }
+
+                free(fpath);
+                fpath = malloc(strlen(dptr->d_name) + strlen(dirname) + 2);
+                if (!fpath) {
+                        result = LOCK_ENOMEM;
+                        goto finish;
+                }
+                strcpy(fpath, dirname);
+                strcat(fpath, "/");
+                strcat(fpath, dptr->d_name);
+
+                if (lstat(fpath, &statbuf) == -1) {
+                        reterrno = errno;
+                        goto finish;
+                }
+
+                diff = (int)statnow.st_mtime - (int)statbuf.st_mtime;
+                /* adjust diff if someone updated the lock
+                   between now and when we created the "now"
+                   file 
+                 */
+                diff = (diff < 0) ? 0 : diff;
+                result = diff < result ? diff : result;
+
+                if (!elt_established) {
+                        /* read final lock file and extract lease time;
+                           best effort, failures just leave it unset */
+                        fd = open(fpath, O_RDONLY, 0644); 
+                        if (fd != -1) {
+                                memset(tmpbuf, 0, sizeof(tmpbuf));
+                                /* last byte stays 0 to bound strrchr() */
+                                if (read(fd, tmpbuf, sizeof(tmpbuf) - 1) < 0) {
+                                        tmpbuf[0] = '\0';
+                                }
+                                close(fd);
+                                dotptr = strrchr(tmpbuf, '.');
+                                if (dotptr) {
+                                        *max_lease = atoi(dotptr+1);
+                                        elt_established = 1;
+                                }
+                        }
+                }
+        }
+
+finish:
+        if (pd) {
+                closedir(pd);
+        }
+        free(dirname);
+        free(fpath);
+
+        /* returns smallest lock time, or error */
+        if (result == INT_MAX) result = LOCK_ENOLOCK;
+
+        /* set lease time to -1 if error, so no one is apt to use it */
+        if ((result < 0) || reterrno) *max_lease = -1;
+        *ret_lease = result;
+        return reterrno;
+}
+
+#if defined(TEST)
+/*
+ * the following is for sanity testing.
+ */
+
+/* Print the command synopsis for the TEST build, with @prg as program name. */
+static void usage(char *prg)
+{
+        static const char *const legend[] = {
+                "        p : perf test lock take and reassert\n",
+                "        d : delta lock time\n",
+                "        t : test the file (after random locks)\n",
+                "        r : random lock tests (must ^C)\n",
+                "        u : unlock, readonly? uniqID (default is PID)\n",
+                "        l : lock, readonly? force?, uniqID (default is PID), lease time\n",
+        };
+        size_t idx;
+
+        printf("usage %s\n"
+               "    dtr <filename>]\n"
+               "    p <filename> [num iterations]\n"
+               "    u <filename> [0|1] [<uniqid>]\n"
+               "    l <filename> [0|1] [0|1] [<uniqid>] [<leasetime>]\n", prg);
+
+        /* the legend lines contain no conversions, so emit them verbatim */
+        for (idx = 0; idx < sizeof(legend) / sizeof(legend[0]); idx++) {
+                fputs(legend[idx], stdout);
+        }
+}
+
+/*
+ * Check a file written by the random lock test ('r' mode).
+ *
+ * Each line has the form "count pid time".  The first count must be 0 and
+ * every following line must carry the previous count plus one; each line
+ * that breaks the sequence is reported through LOG.
+ *
+ * fn: path of the file to check.
+ */
+static void test_file(char *fn)
+{
+        FILE *fptr;
+        int prev_count = 0;
+        int count, pid, time;
+
+        fptr = fopen(fn, "r");
+        if (!fptr) {
+                LOG("ERROR on file %s open, errno=%d\n", fn, errno);
+                return;
+        }
+
+        /* Loop on the fscanf() result rather than on feof(): this never
+           uses values from a failed conversion and cannot spin forever on
+           a line that does not parse. */
+        while (fscanf(fptr, "%d %d %d\n", &count, &pid, &time) == 3) {
+                if (prev_count != count) {
+                        LOG("ERROR: prev_count=%d, count=%d, pid=%d, time=%d\n",
+                                    prev_count, count, pid, time);
+                }
+                prev_count = count + 1;
+        }
+
+        /* stopping before end of file means a line was malformed */
+        if (!feof(fptr)) {
+                LOG("ERROR: unparsable data in file %s\n", fn);
+        }
+
+        fclose(fptr);
+}
+
+/*
+ * Endless lock stress test ('r' mode).
+ *
+ * Repeatedly takes the lock on fn as a randomly chosen reader or writer.
+ * A writer reads the last "count pid time" line of the file, appends a new
+ * line with the count incremented, and releases the lock again.  The
+ * resulting file can be checked afterwards with test_file().
+ *
+ * fn: path of the file that is both locked and appended to.
+ *
+ * Never returns; kill the process to stop it.
+ */
+static void random_locks(char *fn)
+{
+        enum { TAIL_LEN = 256 };  /* bytes inspected at the end of the file */
+        int pid = getpid();
+        int status;
+        /* one spare byte keeps the tail NUL-terminated for sscanf() */
+        char filebuf[TAIL_LEN + 1];
+        int count = 0;
+        int dummy;
+        int clstat;
+        char uuid[12];
+        int readonly;
+        int lease = DEFAULT_LEASE_TIME_SECS;
+        int err;
+
+        /* this will never return, kill to exit */
+
+        srandom((int)time(0) ^ pid);
+
+        LOG("pid: %d using file %s\n", pid, fn);
+        snprintf(uuid, sizeof(uuid), "%08d", pid);
+
+        while (1) {
+                XSLEEP;
+                readonly = random()  & 1;
+                /* lock() reports its lock status through the last pointer */
+                err = lock(fn, uuid, 0, readonly, &lease, &status);
+                if (status == LOCK_OK) {
+                        /* got lock, open, read, modify write close file */
+                        int fd = open(fn, O_RDWR, 0644);
+                        if (fd == -1) {
+                                LOG("pid: %d ERROR on file %s open, errno=%d\n",
+                                    pid, fn, errno);
+                        } else {
+                            if (!readonly) {
+                                /* ugly code to read data in test format */
+                                /* format is "%d %d %d" 'count pid time' */
+                                struct stat statbuf;
+                                int bytes;
+                                status = stat(fn, &statbuf);
+                                if (status != -1) {
+                                        if (statbuf.st_size > TAIL_LEN) {
+                                                lseek(fd, -TAIL_LEN, SEEK_END);
+                                        }
+                                        memset(filebuf, 0, sizeof(filebuf));
+                                        bytes = read(fd, filebuf, TAIL_LEN);
+                                        /* need at least two bytes: a read
+                                           error (-1) or a single byte would
+                                           index in front of the buffer */
+                                        if (bytes >= 2) {
+                                                int bw = bytes-2;
+                                                /* walk back to the newline
+                                                   ending the previous line */
+                                                while (bw && filebuf[bw]!='\n')
+                                                        bw--;
+                                                if (!bw) bw = -1;
+                                                sscanf(&filebuf[bw+1],
+                                                       "%d %d %d",
+                                                       &count, &dummy, &dummy);
+                                                count += 1;
+                                        }
+                                        lseek(fd, 0, SEEK_END);
+                                        snprintf(filebuf, sizeof(filebuf),
+                                                 "%d %d %d\n",
+                                                 count, pid, (int)time(0));
+                                        if (write(fd, filebuf,
+                                                  strlen(filebuf)) < 0) {
+                                                LOG("pid: %d ERROR on file %s "
+                                                    "write, errno=%d\n",
+                                                    pid, fn, errno);
+                                        }
+                                } else {
+                                        LOG("pid: %d ERROR on file %s stat, "
+                                            "errno=%d\n", pid, fn, errno);
+                                }
+                            }
+                            clstat = close(fd);
+                            if (unlikely(clstat == -1)) {
+                                    LOG("fail on close\n");
+                            }
+                        }
+                        XSLEEP;
+                        err = unlock(fn, uuid, readonly, &status);
+                        LOG("unlock status is %d (err=%d)\n", status, err);
+                }
+        }
+}
+
+/*
+ * Performance test ('p' mode): take (re-assert) the write lock on fn
+ * 'loops' times in a row, then release it once.
+ *
+ * fn:    path of the file to lock.
+ * loops: number of lock calls to issue; values <= 0 issue none.
+ *
+ * Stops early, without unlocking, at the first negative lock status.
+ */
+static void perf_lock(char *fn, int loops)
+{
+    int status;
+    /* same size as the uuid buffers used elsewhere in this file */
+    char buf[12];
+    int start = loops;
+    int lease = DEFAULT_LEASE_TIME_SECS;
+
+    snprintf(buf, sizeof(buf), "%08d", getpid());
+
+    while (loops-- > 0) {
+        /* only the lock status is checked; the return value is not used */
+        (void)lock(fn, buf, 0, 0, &lease, &status);
+        if (status < 0) {
+            printf("failed to get lock at iteration %d errno=%d\n",
+                   start - loops, errno);
+            return;
+        }
+    }
+    unlock(fn, buf, 0, &status);
+}
+
+/*
+ * Entry point of the TEST build.
+ *
+ * argv[1] selects the mode and argv[2] names the file; see usage() for the
+ * optional arguments of each mode.
+ *
+ * Returns the value returned by lock(), unlock() or lock_delta() for the
+ * l, u and d modes, and 0 for every other mode and for usage errors.
+ */
+int main(int argc, char *argv[])
+{
+        int status = 0;
+        char *ptr;
+        char uuid[12];
+        int force;
+        int readonly;
+        /* defensive default so a lease that was never set is not printed */
+        int max_lease = -1, cur_lease;
+        int intstatus;
+        int lease = DEFAULT_LEASE_TIME_SECS;
+
+        if (argc < 3) {
+                usage(argv[0]);
+                return 0;
+        }
+
+        /* default unique id is the PID */
+        snprintf(uuid, sizeof(uuid), "%08d", getpid());
+        ptr = uuid;
+
+        if (!strcmp(argv[1],"d")) {
+                status = lock_delta(argv[2], &cur_lease, &max_lease);
+
+                printf("lock delta for %s is %d seconds, max lease is %d\n",
+                       argv[2], cur_lease, max_lease);
+        } else if (!strcmp(argv[1],"t")) {
+                test_file(argv[2]);
+        } else if (!strcmp(argv[1],"r")) {
+                random_locks(argv[2]);
+        } else if (!strcmp(argv[1],"p")) {
+                /* the iteration count is optional: argv[3] only exists
+                   when argc is at least 4 */
+                perf_lock(argv[2], argc < 4 ? 100000 : atoi(argv[3]));
+        } else if (!strcmp(argv[1],"l")) {
+                /* argument order follows usage(): readonly, force, uuid,
+                   lease time */
+                if (argc < 4) readonly = 0; else readonly = atoi(argv[3]);
+                if (argc < 5) force = 0; else force = atoi(argv[4]);
+                if (argc >= 6) ptr = argv[5];
+                if (argc >= 7) lease = atoi(argv[6]);
+                status = lock(argv[2], ptr, force, readonly, &lease, &intstatus);
+                printf("lock status = %d\n", status);
+        } else if (!strcmp(argv[1],"u") ) {
+                /* readonly and uuid are each optional */
+                if (argc < 4) readonly = 0; else readonly = atoi(argv[3]);
+                if (argc >= 5) ptr = argv[4];
+                status = unlock(argv[2], ptr, readonly, &intstatus);
+                printf("unlock status = %d\n", intstatus);
+        } else {
+                usage(argv[0]);
+        }
+
+        return status;
+}
+#elif defined(UTIL)
+/*
+ * the following is used for the non-library, standalone
+ * program utility as a shell program
+ */
+
+/*
+ * Print the command-line synopsis of the UTIL build to stdout.
+ *
+ * prg: program name shown in the synopsis (callers pass argv[0]).
+ */
+static void usage(char *prg)
+{
+        /* fixed explanatory text that follows the synopsis */
+        static const char details[] =
+               "        delta : get time since lock last refreshed\n"
+               "                returns delta time and max lease time in seconds\n"
+               "        unlock: unlock request filename, r|w,  uniqID\n"
+               "                returns status (success is 0)\n"
+               "        lock  : lock request filename,  r|w, force?, uniqID, lease time request\n"
+               "                returns status (success is 0) and established lease time in seconds\n";
+
+        printf("usage %s\n"
+               "    delta <filename>\n"
+               "    unlock <filename> <r|w> <uniqid>\n"
+               "    lock <filename> <r|w> <0|1> <uniqid> <leasetime>\n", prg);
+        fputs(details, stdout);
+}
+
+/*
+ * Entry point of the UTIL build: a shell-callable front end to
+ * lock_delta(), lock() and unlock().
+ *
+ * Results are printed on stdout as bare numbers (see usage()); a malformed
+ * command line prints LOCK_EUSAGE.
+ *
+ * Returns 0 for usage errors and help, otherwise the value returned by the
+ * selected function.
+ */
+int main(int argc, char *argv[])
+{
+        int status = 0;
+        char *ptr;
+        int force;
+        int readonly;
+        /* defensive default so a lease that was never set is not printed */
+        int cur_lease, max_lease = -1, intstatus;
+        int lease = DEFAULT_LEASE_TIME_SECS;
+
+        if (argc < 3) {
+                if (argc == 2 && !strcmp(argv[1], "-h")) {
+                    usage(argv[0]);
+                } else {
+                    printf("%d\n", LOCK_EUSAGE);
+                }
+                return 0;
+        }
+
+        if (!strcmp(argv[1],"delta") && (argc == 3)) {
+                status = lock_delta(argv[2], &cur_lease, &max_lease);
+                printf("%d %d\n", cur_lease, max_lease);
+        } else if (!strcmp(argv[1],"lock") && (argc == 7)) {
+                /* anything other than "r" is treated as a write lock */
+                readonly = (strcmp(argv[3], "r") == 0) ? 1 : 0;
+                force = atoi(argv[4]);
+                ptr = argv[5];
+                lease = atoi(argv[6]);
+                status = lock(argv[2], ptr, force, readonly, &lease, &intstatus);
+                printf("%d %d\n", intstatus, lease);
+        } else if (!strcmp(argv[1],"unlock") && (argc == 5)) {
+                readonly = (strcmp(argv[3], "r") == 0) ? 1 : 0;
+                ptr = argv[4];
+                status = unlock(argv[2], ptr, readonly, &intstatus);
+                printf("%d\n", intstatus);
+        } else {
+                printf("%d\n", LOCK_EUSAGE);
+        }
+
+        /* this is either 0 or a system defined errno */
+        return status;
+}
+#endif
diff --git a/tools/blktap2/drivers/lock.h b/tools/blktap2/drivers/lock.h
new file mode 100644 (file)
index 0000000..98baaaa
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LOCK_H
+#define LOCK_H
+
+/* lease time, in seconds, used by callers that do not request one */
+#define DEFAULT_LEASE_TIME_SECS 30
+
+/*
+ * Take, or refresh, the lock on fn_to_lock on behalf of uuid.
+ *
+ * force and readonly are flags.  lease_time carries the requested lease in
+ * and the established lease out; *retstat receives the lock status.
+ * NOTE(review): the status is presumably a lock_error value -- confirm
+ * against lock.c.
+ */
+int lock(char *fn_to_lock, char *uuid, int force, int readonly, int *lease_time, int *retstat);
+
+/*
+ * Release the lock held on fn_to_unlock by uuid; *retstat receives the
+ * lock status.
+ */
+int unlock(char *fn_to_unlock, char *uuid, int readonly, int *retstat);
+
+/*
+ * Report the age of the lock on fn_to_check.
+ *
+ * *cur_lease_time receives the smallest lock age, or a negative lock_error
+ * such as LOCK_ENOLOCK; *max_lease_time is set to -1 on error.
+ */
+int lock_delta(char *fn_to_check, int *cur_lease_time, int *max_lease_time);
+
+/* status codes reported through the retstat / lease output parameters */
+typedef enum {
+    LOCK_OK          =  0,
+    LOCK_EBADPARM    = -1,
+    LOCK_ENOMEM      = -2,
+    LOCK_ESTAT       = -3,
+    LOCK_EHELD_WR    = -4,
+    LOCK_EHELD_RD    = -5,
+    LOCK_EOPEN       = -6,
+    LOCK_EXLOCK_OPEN = -7,
+    LOCK_EXLOCK_WRITE= -8,
+    LOCK_EINODE      = -9,
+    LOCK_EUPDATE     = -10,
+    LOCK_EREAD       = -11,
+    LOCK_EREMOVE     = -12,
+    LOCK_ENOLOCK     = -13,
+    LOCK_EUSAGE      = -14,
+} lock_error;
+
+#endif /* LOCK_H */
diff --git a/tools/blktap2/drivers/log.h b/tools/blktap2/drivers/log.h
new file mode 100644 (file)
index 0000000..8f00df4
--- /dev/null
@@ -0,0 +1,123 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* log.h: API for writelog communication */
+
+#ifndef __LOG_H__
+#define __LOG_H__ 1
+
+#include <inttypes.h>
+
+#include <xen/io/ring.h>
+/* for wmb et al */
+#include <xenctrl.h>
+
+/* Four-character command tags of the log control protocol.
+ * NOTE(review): presumably sent in log_ctlmsg.msg, which is exactly four
+ * bytes wide -- confirm against the client/server code. */
+#define LOGCMD_SHMP  "shmp"
+#define LOGCMD_PEEK  "peek"
+#define LOGCMD_CLEAR "clrw"
+#define LOGCMD_GET   "getw"
+#define LOGCMD_KICK  "kick"
+
+/* NOTE(review): presumably the length in bytes of the response to the
+ * LOGCMD_* command of the same name -- confirm against the server. */
+#define CTLRSPLEN_SHMP  256
+#define CTLRSPLEN_PEEK  4
+#define CTLRSPLEN_CLEAR 4
+#define CTLRSPLEN_GET   4
+#define CTLRSPLEN_KICK  0
+
+/* shmregion is arbitrarily capped at 8 megs for a minimum of
+ * 64 MB of data per read (if there are no contiguous regions)
+ * In the off-chance that there is more dirty data, multiple
+ * reads must be done */
+/* total size of the shared memory region, in bytes */
+#define SHMSIZE (8 * 1024 * 1024)
+/* bytes between sringstart() and sdatastart(): the request descriptor area */
+#define SRINGSIZE 4096
+
+/* The shared memory region is split up into 3 subregions:
+ * The first half is reserved for the dirty bitmap log.
+ * The second half begins with 1 page for read request descriptors,
+ * followed by a big area for supplying read data.
+ */
+/* Start of the dirty bitmap log: the base of the shared region itself. */
+static inline void* bmstart(void* shm)
+{
+  return shm;
+}
+
+/* End of the dirty bitmap log: the midpoint of the shared region. */
+static inline void* bmend(void* shm)
+{
+  /* arithmetic on void* is a GNU extension; step in bytes through char* */
+  return (char *)shm + SHMSIZE/2;
+}
+
+/* Start of the read request descriptor area: directly after the bitmap. */
+static inline void* sringstart(void* shm)
+{
+  return bmend(shm);
+}
+
+/* Start of the read data area: SRINGSIZE bytes past sringstart(). */
+static inline void* sdatastart(void* shm)
+{
+  /* arithmetic on void* is a GNU extension; step in bytes through char* */
+  return (char *)sringstart(shm) + SRINGSIZE;
+}
+
+/* End of the read data area: the end of the whole shared region. */
+static inline void* sdataend(void* shm)
+{
+  /* arithmetic on void* is a GNU extension; step in bytes through char* */
+  return (char *)shm + SHMSIZE;
+}
+
+/* format for messages between log client and server */
+struct log_ctlmsg {
+  /* NOTE(review): presumably one of the four-character LOGCMD_* tags,
+   * stored without a terminating NUL -- confirm against the users */
+  char msg[4];
+  /* command-specific arguments; layout not visible in this header */
+  char params[16];
+};
+
+/* extent descriptor: a run of sectors on the disk */
+struct disk_range {
+  uint64_t sector;  /* NOTE(review): presumably the first sector of the run */
+  uint32_t count;   /* NOTE(review): presumably the length in sectors */
+};
+
+/* dirty write logging space. This is an extent ring at the front,
+ * full of disk_ranges plus a pointer into the data area */
+/* I think I'd rather have the header in front of each data section to
+ * avoid having two separate spaces that can run out, but then I'd either
+ * lose page alignment on the data blocks or spend an entire page on the
+ * header */
+
+/* One ring entry: a disk extent plus the position of its bytes in the
+ * data area.  Used for both requests and responses on the log ring. */
+struct log_extent {
+  uint64_t sector;
+  uint32_t count;
+  uint32_t offset; /* offset from start of data area to start of extent */
+};
+
+/* struct above should be 16 bytes, or 256 extents/page */
+
+/* requests and responses share the log_extent layout */
+typedef struct log_extent log_request_t;
+typedef struct log_extent log_response_t;
+
+/* DEFINE_RING_TYPES comes from <xen/io/ring.h>; see that header for the
+ * ring types it declares for the "log" name */
+DEFINE_RING_TYPES(log, log_request_t, log_response_t);
+
+/* NOTE(review): presumably a number of pages reserved for a log header;
+ * not used in this header -- confirm against the users */
+#define LOG_HEADER_PAGES 4
+
+#endif
diff --git a/tools/blktap2/drivers/md5.c b/tools/blktap2/drivers/md5.c
new file mode 100644 (file)
index 0000000..e765832
--- /dev/null
@@ -0,0 +1,278 @@
+/* start - public domain MD5 implementation */
+/*
+ * This code implements the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest.  This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ */
+
+#include <string.h>
+#include <stdint.h>
+
+/* Running state of one MD5 computation. */
+struct MD5Context {
+        uint32_t buf[4];   /* chaining value, set by MD5Init and updated by
+                              MD5Transform */
+        uint32_t bits[2];  /* count of message bits so far; bits[0] is the
+                              low word, bits[1] takes the carry */
+        uint8_t in[64];    /* bytes of the current, not yet full, block */
+};
+
+/* File-local MD5 primitives; only md5_sum() is visible outside this file.
+ * The definitions further down omit 'static' but keep the internal linkage
+ * declared here. */
+static void MD5Init(struct MD5Context *context);
+static void MD5Update(struct MD5Context *context, unsigned char const *buf,
+                          unsigned len);
+static void MD5Final(unsigned char digest[16], struct MD5Context *context);
+static void MD5Transform(uint32_t buf[4], uint32_t const in[16]);
+
+
+/* short alias for the context type */
+typedef struct MD5Context MD5_CTX;
+
+
+/**
+ * md5_sum - MD5 hash for a data block
+ * @addr: Pointer to the data area
+ * @len: Length of the data block in bytes
+ * @mac: Buffer for the hash; receives the 16-byte digest
+ */
+void md5_sum(const uint8_t *addr, const size_t len, uint8_t *mac)
+{
+        /* MD5Update() takes an 'unsigned' length, so a size_t length that
+         * does not fit would be silently truncated.  Feed the data in
+         * bounded pieces instead. */
+        const size_t max_piece = (size_t)1 << 30;
+        size_t left = len;
+        MD5_CTX ctx;
+
+        MD5Init(&ctx);
+        while (left > 0) {
+                unsigned piece = (unsigned)(left < max_piece ? left : max_piece);
+
+                MD5Update(&ctx, addr, piece);
+                addr += piece;
+                left -= piece;
+        }
+        MD5Final(mac, &ctx);
+}
+
+
+/*
+ * byteReverse(buf, longs): rewrite 'longs' consecutive 32-bit words at buf
+ * so that each word holds the value of its four bytes read in little-endian
+ * order.  Without WORDS_BIGENDIAN it expands to nothing.
+ */
+#ifndef WORDS_BIGENDIAN
+#define byteReverse(buf, len)   /* Nothing */
+#else
+/*
+ * Note: this code is harmless on little-endian machines.
+ */
+static void byteReverse(unsigned char *buf, unsigned longs)
+{
+    uint32_t t;
+    /* do/while: callers must pass longs >= 1 */
+    do {
+        /* assemble the word from its bytes, buf[0] least significant */
+        t = (uint32_t) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
+            ((unsigned) buf[1] << 8 | buf[0]);
+        /* NOTE(review): stores through a uint32_t pointer, so buf is
+         * assumed to be 4-byte aligned -- confirm for all callers */
+        *(uint32_t *) buf = t;
+        buf += 4;
+    } while (--longs);
+}
+#endif
+
+/*
+ * Begin a new MD5 computation: load the four fixed starting words into the
+ * chaining value and reset the bit count to zero.
+ */
+void MD5Init(struct MD5Context *ctx)
+{
+    static const uint32_t initial_state[4] = {
+        0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
+    };
+
+    memcpy(ctx->buf, initial_state, sizeof(initial_state));
+    memset(ctx->bits, 0, sizeof(ctx->bits));
+}
+
+/*
+ * Update context to reflect the concatenation of another buffer full
+ * of bytes.
+ *
+ * ctx: context prepared by MD5Init.
+ * buf: bytes to add to the message.
+ * len: number of bytes at buf.
+ *
+ * Complete 64-byte blocks are hashed at once; a trailing partial block is
+ * kept in ctx->in for the next call or for MD5Final.
+ */
+void MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
+{
+    uint32_t t;
+
+    /* Update bitcount */
+
+    t = ctx->bits[0];
+    if ((ctx->bits[0] = t + ((uint32_t) len << 3)) < t)
+        ctx->bits[1]++;         /* Carry from low to high */
+    /* the top three bits of len do not fit in the low word once scaled by 8 */
+    ctx->bits[1] += len >> 29;
+
+    t = (t >> 3) & 0x3f;        /* Bytes already in ctx->in */
+
+    /* Handle any leading odd-sized chunks */
+
+    if (t) {
+        unsigned char *p = (unsigned char *) ctx->in + t;
+
+        /* t becomes the free space left in the pending block */
+        t = 64 - t;
+        if (len < t) {
+            /* still not a full block: just buffer the new bytes */
+            memcpy(p, buf, len);
+            return;
+        }
+        memcpy(p, buf, t);
+        byteReverse(ctx->in, 16);
+        MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+        buf += t;
+        len -= t;
+    }
+    /* Process data in 64-byte chunks */
+
+    while (len >= 64) {
+        memcpy(ctx->in, buf, 64);
+        byteReverse(ctx->in, 16);
+        MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+        buf += 64;
+        len -= 64;
+    }
+
+    /* Handle any remaining bytes of data. */
+
+    memcpy(ctx->in, buf, len);
+}
+
+/*
+ * Final wrapup - pad to 64-byte boundary with the bit pattern
+ * 1 0* (64-bit count of bits processed, low-order word first)
+ *
+ * digest: receives the 16-byte result.
+ * ctx:    context to finish; it is zeroed before returning and must be
+ *         passed to MD5Init again before any further use.
+ */
+void MD5Final(unsigned char digest[16], struct MD5Context *ctx)
+{
+    unsigned count;
+    unsigned char *p;
+
+    /* Compute number of bytes mod 64 */
+    count = (ctx->bits[0] >> 3) & 0x3F;
+
+    /* Set the first char of padding to 0x80.  This is safe since there is
+       always at least one byte free */
+    p = ctx->in + count;
+    *p++ = 0x80;
+
+    /* Bytes of padding needed to make 64 bytes */
+    count = 64 - 1 - count;
+
+    /* Pad out to 56 mod 64 */
+    if (count < 8) {
+        /* no room left for the 8-byte length in this block */
+        /* Two lots of padding:  Pad the first block to 64 bytes */
+        memset(p, 0, count);
+        byteReverse(ctx->in, 16);
+        MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+
+        /* Now fill the next block with 56 bytes */
+        memset(ctx->in, 0, 56);
+    } else {
+        /* Pad block to 56 bytes */
+        memset(p, 0, count - 8);
+    }
+    /* only the 14 data words are converted; the two length words are
+       stored below as native words */
+    byteReverse(ctx->in, 14);
+
+    /* Append length in bits and transform */
+    ((uint32_t *) ctx->in)[14] = ctx->bits[0];
+    ((uint32_t *) ctx->in)[15] = ctx->bits[1];
+
+    MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+    byteReverse((unsigned char *) ctx->buf, 4);
+    memcpy(digest, ctx->buf, 16);
+    memset(ctx, 0, sizeof(*ctx));     /* In case it's sensitive */
+}
+
+/* The four core functions - F1 is optimized somewhat */
+/* Each combines three 32-bit words bitwise; MD5Transform uses one per
+ * group of 16 steps. */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm. */
+/* Adds f(x, y, z) + data to w, rotates w left by s bits, then adds x.
+ * The arguments are not parenthesised and w and s are evaluated more than
+ * once, so pass only plain variables and constants. */
+#define MD5STEP(f, w, x, y, z, data, s) \
+        ( w += f(x, y, z) + data,  w = w<<s | w>>(32-s),  w += x )
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data.  MD5Update blocks
+ * the data and converts bytes into longwords for this routine.
+ *
+ * buf: the four-word chaining value, updated in place.
+ * in:  one 64-byte block as sixteen 32-bit words.
+ *
+ * The body is four groups of sixteen MD5STEP calls, one group per core
+ * function F1..F4; the constants and shift counts must stay exactly as
+ * written.
+ */
+static void MD5Transform(uint32_t buf[4], uint32_t const in[16])
+{
+    register uint32_t a, b, c, d;
+
+    /* work on a copy of the state; it is added back at the end */
+    a = buf[0];
+    b = buf[1];
+    c = buf[2];
+    d = buf[3];
+
+    /* group 1: F1, input words in order */
+    MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+    MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+    MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+    MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+    MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+    MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+    MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+    MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+    MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+    MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+    MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+    MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+    MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+    MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+    MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+    MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+    /* group 2: F2 */
+    MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+    MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+    MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+    MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+    MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+    MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+    MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+    MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+    MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+    MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+    MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+    MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+    MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+    MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+    MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+    MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+    /* group 3: F3 */
+    MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+    MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+    MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+    MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+    MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+    MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+    MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+    MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+    MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+    MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+    MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+    MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+    MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+    MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+    MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+    MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+    /* group 4: F4 */
+    MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+    MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+    MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+    MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+    MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+    MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+    MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+    MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+    MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+    MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+    MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+    MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+    MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+    MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+    MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+    MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+    /* fold the result of this block into the running state */
+    buf[0] += a;
+    buf[1] += b;
+    buf[2] += c;
+    buf[3] += d;
+}
diff --git a/tools/blktap2/drivers/md5.h b/tools/blktap2/drivers/md5.h
new file mode 100644 (file)
index 0000000..065bd93
--- /dev/null
@@ -0,0 +1,15 @@
+#ifndef MD5_H
+#define MD5_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+/**
+ * md5_sum - MD5 hash for a data block
+ * @addr: Pointer to the data area
+ * @len: Length of the data block in bytes
+ * @mac: Buffer for the hash; must have room for the 16-byte digest
+ */
+void md5_sum(const uint8_t *addr, const size_t len, uint8_t *mac);
+
+#endif
diff --git a/tools/blktap2/drivers/profile.h b/tools/blktap2/drivers/profile.h
new file mode 100644 (file)
index 0000000..f628ba2
--- /dev/null
@@ -0,0 +1,191 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __TAP_PROFILE_H__
+#define __TAP_PROFILE_H__
+
+#ifndef _GNU_SOURCE
+  #define _GNU_SOURCE
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syslog.h>
+#include <sys/time.h>
+#include <time.h>
+#include <fcntl.h>
+#include <inttypes.h>
+
+//#define PROFILING
+//#define LOGGING
+
+/* direction argument of tp_log(): TAPPROF_IN records a function entry,
+ * any other value is handled as an exit */
+#define TAPPROF_IN  1
+#define TAPPROF_OUT 2
+
+/* Timing record of one profiled function. */
+struct profile_times {
+       char    *fn_name;  /* strdup'ed function name; NULL while the slot
+                             is unused */
+       /* in:      counter value taken at the pending entry, 0 when none
+        * out_sum: sum of (exit - entry) over all completed calls
+        * cnt:     number of completed calls */
+       uint64_t in, out_sum, cnt;
+};
+
+/* State of one profiler instance, set up by tp_open(). */
+struct profile_info {
+       FILE                 *log;   /* event log; only opened when LOGGING
+                                       is defined */
+       int                   size;  /* number of slots in pt */
+       char                 *name;  /* strdup'ed copy of the tap name */
+       unsigned long long    seq;   /* next id handed out by tp_get_id() */
+       struct profile_times *pt;    /* table of per-function records */
+};
+
+#ifdef PROFILING
+
+/*
+ * Initialise a profiler instance.
+ *
+ * prof:     instance to set up; any previous content is overwritten.
+ * tap_name: name used as prefix in the tp_close() report; copied.
+ * log_name: path of the event log, opened only when LOGGING is defined.
+ * size:     number of distinct functions that can be tracked.
+ *
+ * If the table cannot be allocated the instance is left with zero slots.
+ */
+static inline void
+tp_open(struct profile_info *prof, char *tap_name, char *log_name, int size)
+{
+       memset(prof, 0, sizeof(struct profile_info));
+#ifdef LOGGING
+       prof->log  = fopen(log_name, "w");
+#endif
+       prof->name = strdup(tap_name);
+       /* calloc() hands back zeroed slots and checks the size product */
+       if (size > 0)
+               prof->pt = calloc(size, sizeof(struct profile_times));
+       /* advertise only slots that really exist, so that nothing walks a
+        * NULL table */
+       prof->size = prof->pt ? size : 0;
+}
+
+/*
+ * Report and release a profiler instance.
+ *
+ * Sends one syslog line (call count and average elapsed time) per tracked
+ * function, then frees everything tp_open() allocated.  The instance is
+ * left empty, so a second tp_close() on it is harmless.
+ */
+static inline void
+tp_close(struct profile_info *prof)
+{
+       int i;
+       struct profile_times *pt;
+
+       /* the table is NULL when its allocation failed in tp_open() */
+       for (i = 0; prof->pt && i < prof->size; i++) {
+               pt = &prof->pt[i];
+               if (pt->fn_name) {
+                       /* uint64_t need not be unsigned long long, so cast
+                        * to match the %llu conversions */
+                       syslog(LOG_DEBUG, "%s: %s: cnt: %llu, avg time: %llu\n",
+                              prof->name ? prof->name : "(null)", pt->fn_name,
+                              (unsigned long long)pt->cnt,
+                              (unsigned long long)
+                              ((pt->cnt) ? (pt->out_sum / pt->cnt) : 0));
+                       free(pt->fn_name);
+               }
+       }
+
+#ifdef LOGGING
+       if (prof->log)
+               fclose(prof->log);
+       prof->log = NULL;
+#endif
+       free(prof->name);
+       prof->name = NULL;
+       free(prof->pt);
+       prof->pt   = NULL;
+       prof->size = 0;
+}
+
+/* Hand out the next sequence number of this profiler instance. */
+static inline u64
+tp_get_id(struct profile_info *prof)
+{
+       unsigned long long issued = prof->seq;
+
+       prof->seq = issued + 1;
+       return issued;
+}
+
+/*
+ * Find the table slot for the function called 'name'.
+ *
+ * Returns the index of the slot already holding that name or, failing
+ * that, of the first unused slot.  When every slot is taken by other
+ * names the last slot (size - 1) is returned.
+ */
+static inline int
+tp_fn_id(struct profile_info *prof, const char *name)
+{
+       int slot = 0;
+
+       while (slot < prof->size) {
+               const char *known = prof->pt[slot].fn_name;
+
+               /* an unused slot ends the search just like a match does */
+               if (known == NULL || strcmp(known, name) == 0)
+                       return slot;
+               slot++;
+       }
+
+       return prof->size - 1;
+}
+
+/*
+ * Record the entry into function 'func': claim a table slot for it on
+ * first use and store the current time-stamp counter in the slot.
+ */
+static inline void
+__tp_in(struct profile_info *prof, const char *func)
+{
+       unsigned int lo, hi;
+       int idx = tp_fn_id(prof, func);
+       struct profile_times *pt;
+
+       /* tp_fn_id() returns -1 when the table has no slots */
+       if (idx < 0)
+               return;
+       pt = &prof->pt[idx];
+
+       if (!pt->fn_name)
+               pt->fn_name = strdup(func);
+
+       /* 0x0f 0x31 is rdtsc.  Read the two halves separately: the "A"
+        * constraint only names the edx:eax pair on 32-bit x86. */
+       asm volatile(".byte 0x0f, 0x31" : "=a" (lo), "=d" (hi));
+       pt->in = ((uint64_t)hi << 32) | lo;
+}
+
+#define tp_in(prof) __tp_in(prof, __func__)
+
+/*
+ * Record the exit from function 'func': add the time elapsed since the
+ * matching __tp_in() to the slot's total and count the call.  Does
+ * nothing when no entry is pending for 'func'.
+ */
+static inline void
+__tp_out(struct profile_info *prof, const char *func)
+{
+       unsigned int lo, hi;
+       uint64_t now;
+       int idx = tp_fn_id(prof, func);
+       struct profile_times *pt;
+
+       /* tp_fn_id() returns -1 when the table has no slots */
+       if (idx < 0)
+               return;
+       pt = &prof->pt[idx];
+
+       if (!pt->fn_name || !pt->in)
+               return;
+
+       /* 0x0f 0x31 is rdtsc.  Read the two halves separately: the "A"
+        * constraint only names the edx:eax pair on 32-bit x86. */
+       asm volatile(".byte 0x0f, 0x31" : "=a" (lo), "=d" (hi));
+       now = ((uint64_t)hi << 32) | lo;
+       pt->cnt++;
+       pt->out_sum += (now - pt->in);
+       pt->in       = 0;
+}
+
+#define tp_out(prof) __tp_out(prof, __func__)
+
+/*
+ * Record an entry (direction == TAPPROF_IN) or an exit (any other value)
+ * of function 'func' and, when LOGGING is defined and the log is open,
+ * write one line with the event id and time stamp to it.
+ */
+static inline void
+__tp_log(struct profile_info *prof, u64 id, const char *func, int direction)
+{
+       unsigned int lo, hi;
+       long long _time;
+
+       /* 0x0f 0x31 is rdtsc.  Read the two halves separately: the "A"
+        * constraint only names the edx:eax pair on 32-bit x86. */
+       asm volatile(".byte 0x0f, 0x31" : "=a" (lo), "=d" (hi));
+       _time = (long long)(((unsigned long long)hi << 32) | lo);
+
+       if (direction == TAPPROF_IN)
+               __tp_in(prof, func);
+       else
+               __tp_out(prof, func);
+
+#ifdef LOGGING
+       /* cast the id so that it matches %llu whatever u64 is defined as */
+       if (prof->log)
+               fprintf(prof->log, "%s: %s: %llu, %lld\n", func,
+                       ((direction == TAPPROF_IN) ? "in" : "out"),
+                       (unsigned long long)id, _time);
+#else
+       /* id and the time stamp are only consumed by the log line */
+       (void)id;
+       (void)_time;
+#endif
+}
+
+#define tp_log(prof, id, direction) __tp_log(prof, id, __func__, direction)
+
+#else
+/* Without PROFILING every hook expands to a no-op expression and its
+ * arguments are not evaluated.
+ * NOTE(review): tp_get_id() has no fallback here, so it is only usable
+ * when PROFILING is defined. */
+#define tp_open(prof, tname, lname, size)  ((void)0)
+#define tp_close(prof)                     ((void)0)
+#define tp_in(prof)                        ((void)0)
+#define tp_out(prof)                       ((void)0)
+#define tp_log(prof, sec, direction)       ((void)0)
+#endif
+
+#endif
diff --git a/tools/blktap2/drivers/qcow-create.c b/tools/blktap2/drivers/qcow-create.c
new file mode 100644 (file)
index 0000000..6a641af
--- /dev/null
@@ -0,0 +1,121 @@
+/* qcow-create.c
+ *
+ * Generates a qcow format disk.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include "tapdisk.h"
+#include "qcow.h"
+
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+#define MAX_NAME_LEN 1000
+
+/*
+ * Print the tool banner and the usage synopsis on stderr, then
+ * terminate the process with exit(-1).  Never returns.
+ */
+void help(void)
+{
+       fputs("Qcow-utils: v1.0.0\n", stderr);
+       fputs("usage: qcow-create [-h help] [-r reserve] <SIZE(MB)> <FILENAME> "
+             "[<BACKING_FILENAME>]\n",
+             stderr);
+       exit(-1);
+}
+
+/*
+ * qcow-create entry point.
+ *
+ * Usage: qcow-create [-h] [-r] <SIZE(MB)> <FILENAME> [<BACKING_FILENAME>]
+ *
+ *   -h  print usage and exit
+ *   -r  clear the 'sparse' flag handed to qcow_create()
+ *
+ * SIZE must be a plain non-negative decimal number of megabytes whose
+ * byte count fits in 64 bits; anything else is rejected instead of
+ * being silently misparsed.
+ *
+ * Returns 0 when qcow_create() succeeds and 1 when it fails; argument
+ * errors terminate through help()/exit(-1).
+ */
+int main(int argc, char *argv[])
+{
+       int ret = -1, c, backed = 0;
+       int sparse =  1;
+       uint64_t size;
+       const char *size_arg;
+       char *end;
+       char filename[MAX_NAME_LEN], bfilename[MAX_NAME_LEN];
+
+       for(;;) {
+               c = getopt(argc, argv, "hr");
+               if (c == -1)
+                       break;
+               switch(c) {
+               case 'h':
+                       help();
+                       exit(0);
+                       break;
+               case 'r':
+                       sparse = 0;
+                       break;
+               default:
+                       fprintf(stderr, "Unknown option\n");
+                       help();
+               }
+       }
+
+       printf("Optind %d, argc %d\n", optind, argc);
+       if ( !(optind == (argc - 2) || optind == (argc - 3)) )
+               help();
+
+       /*
+        * Parse the size strictly: strtoull() would happily accept a
+        * leading '-' (and negate), so insist on a leading digit, require
+        * the whole argument to be consumed, and make sure the MB -> bytes
+        * shift cannot overflow.
+        */
+       size_arg = argv[optind++];
+       errno = 0;
+       size = strtoull(size_arg, &end, 10);
+       if (size_arg[0] < '0' || size_arg[0] > '9' || *end != '\0' ||
+           errno == ERANGE || size > (~(uint64_t)0 >> 20)) {
+               fprintf(stderr, "Invalid size '%s'\n", size_arg);
+               exit(-1);
+       }
+       size = size << 20;
+
+       if (snprintf(filename, MAX_NAME_LEN, "%s",argv[optind++]) >=
+               MAX_NAME_LEN) {
+               fprintf(stderr,"Device name too long\n");
+               exit(-1);
+       }
+
+       if (optind != argc) {
+               /*Backing file argument*/
+               backed = 1;
+               if (snprintf(bfilename, MAX_NAME_LEN, "%s",argv[optind++]) >=
+                       MAX_NAME_LEN) {
+                       fprintf(stderr,"Device name too long\n");
+                       exit(-1);
+               }
+       }
+
+       DFPRINTF("Creating file size %"PRIu64", name %s\n",(uint64_t)size, filename);
+       if (!backed)
+               ret = qcow_create(filename,size,NULL,sparse);
+       else
+               ret = qcow_create(filename,size,bfilename,sparse);
+
+       if (ret < 0) {
+               DPRINTF("Unable to create QCOW file\n");
+               /* report the failure to the caller instead of exiting 0 */
+               return 1;
+       }
+
+       DPRINTF("QCOW file successfully created\n");
+       return 0;
+}
diff --git a/tools/blktap2/drivers/qcow.h b/tools/blktap2/drivers/qcow.h
new file mode 100644 (file)
index 0000000..a88f1d5
--- /dev/null
@@ -0,0 +1,131 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _QCOW_H_
+#define _QCOW_H_
+
+#include "aes.h"
+/**************************************************************/
+/* QEMU COW block driver with compression and encryption support */
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+#define XEN_MAGIC  (('X' << 24) | ('E' << 16) | ('N' << 8) | 0xfb)
+#define QCOW_VERSION 1
+
+#define QCOW_CRYPT_NONE 0x00
+#define QCOW_CRYPT_AES  0x01
+
+/* Bit 63 flag; unsigned so the shift does not overflow a signed type. */
+#define QCOW_OFLAG_COMPRESSED (1ULL << 63)
+#define SPARSE_FILE 0x01
+#define EXTHDR_L1_BIG_ENDIAN 0x02
+
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif
+
+/*
+ * Round l up to the next multiple of s; the result is cast to uint64_t.
+ * Both arguments are fully parenthesised so expressions such as
+ * ROUNDUP(a + b, c + d) expand correctly.  Each argument is evaluated
+ * more than once, so avoid arguments with side effects.  s must be
+ * non-zero.
+ */
+#define ROUNDUP(l, s) \
+    ((uint64_t)( \
+        ((l) + ((s) - 1)) - (((l) + ((s) - 1)) % (s))))
+
+/*
+ * Base qcow header.
+ * NOTE(review): presumably mirrors the header stored at the start of a
+ * qcow image, in which case field order and sizes must not change --
+ * confirm against the code that reads and writes it.
+ */
+typedef struct QCowHeader {
+       uint32_t magic;               /* presumably QCOW_MAGIC -- confirm */
+       uint32_t version;             /* presumably QCOW_VERSION -- confirm */
+       uint64_t backing_file_offset;
+       uint32_t backing_file_size;
+       uint32_t mtime;
+       uint64_t size; /* in bytes */
+       uint8_t cluster_bits;
+       uint8_t l2_bits;
+       uint32_t crypt_method;        /* presumably a QCOW_CRYPT_* value -- confirm */
+       uint64_t l1_table_offset;
+} QCowHeader;
+
+/*
+ * Extended header for Xen enhancements.
+ * NOTE(review): presumably stored alongside QCowHeader in the image
+ * file; confirm placement and byte order against the reader/writer.
+ */
+typedef struct QCowHeader_ext {
+        uint32_t xmagic;             /* presumably XEN_MAGIC -- confirm */
+        uint32_t cksum;              /* presumably produced by gen_cksum() -- confirm */
+        uint32_t min_cluster_alloc;
+        uint32_t flags;              /* presumably SPARSE_FILE / EXTHDR_L1_BIG_ENDIAN bits -- confirm */
+} QCowHeader_ext;
+
+uint32_t gen_cksum(char *ptr, int len);
+int get_filesize(char *filename, uint64_t *size, struct stat *st);
+int qtruncate(int fd, off_t length, int sparse);
+
+#define L2_CACHE_SIZE 16  /*Fixed allocation in Qemu*/
+
+/*
+ * Per-image state of the qcow driver: file descriptor and name, geometry
+ * taken from the file header, the L1 table and L2 cache, encryption keys
+ * and the pool of asynchronous I/O requests.
+ */
+struct tdqcow_state {
+        int fd;                        /*Main Qcow file descriptor */
+       uint64_t fd_end;               /*Store a local record of file length */
+       char *name;                    /*Record of the filename*/
+       uint32_t backing_file_size;
+       uint64_t backing_file_offset;
+       uint8_t extended;              /*File contains extended header*/
+       int encrypted;                 /*File contents are encrypted or plain*/
+       int cluster_bits;              /*Determines length of cluster as 
+                                       *indicated by file hdr*/
+       int cluster_size;              /*Length of cluster*/
+       int cluster_sectors;           /*Number of sectors per cluster*/
+       int cluster_alloc;             /*Blktap fix for allocating full 
+                                       *extents*/
+       int min_cluster_alloc;         /*Blktap historical extent alloc*/
+       int sparse;                    /*Indicates whether to preserve sparseness*/
+       int l2_bits;                   /*Size of L2 table entry*/
+       int l2_size;                   /*Full table size*/
+       int l1_size;                   /*L1 table size*/
+       uint64_t cluster_offset_mask;    
+       uint64_t l1_table_offset;      /*L1 table offset from beginning of 
+                                       *file*/
+       uint64_t *l1_table;            /*L1 table entries*/
+       uint64_t *l2_cache;            /*We maintain a cache of size 
+                                       *L2_CACHE_SIZE of most read entries*/
+       uint64_t l2_cache_offsets[L2_CACHE_SIZE];     /*L2 cache entries*/
+       uint32_t l2_cache_counts[L2_CACHE_SIZE];      /*Cache access record*/
+       uint8_t *cluster_cache;          
+       uint8_t *cluster_data;
+       uint64_t cluster_cache_offset; /* presumably the offset of the cluster
+                                       * held in cluster_cache -- confirm */
+       uint32_t crypt_method;         /*current crypt method, 0 if no 
+                                       *key yet */
+       uint32_t crypt_method_header;  /* presumably crypt_method as read from
+                                       * the file header -- confirm */
+       AES_KEY aes_encrypt_key;       /*AES key*/
+       AES_KEY aes_decrypt_key;       /*AES key*/
+
+        /* libaio state */
+       int                  aio_free_count;    
+       int                  max_aio_reqs;
+       struct qcow_request   *aio_requests;
+       struct qcow_request  **aio_free_list;
+
+};
+
+int qcow_create(const char *filename, uint64_t total_size,
+               const char *backing_file, int sparse);
+
+#endif //_QCOW_H_
diff --git a/tools/blktap2/drivers/qcow2raw.c b/tools/blktap2/drivers/qcow2raw.c
new file mode 100644 (file)
index 0000000..5ad7305
--- /dev/null
@@ -0,0 +1,443 @@
+/* qcow2raw.c
+ *
+ * Generates raw image data from an existing qcow image
+ *
+ * (c) 2006 Julian Chesterfield and Andrew Warfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+
+#include "bswap.h"
+#include "aes.h"
+#include "blk.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-disktype.h"
+#include "qcow.h"
+
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+
+/* *BSD has no O_LARGEFILE */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+#define BLOCK_PROCESSSZ 4096
+#define QCOW_VBD 0
+#define AIO_VBD 1
+#define WINDOW 32
+#define PROGRESS_QUANT 2
+
+static int running = 1, complete = 0; 
+static int returned_read_events = 0, returned_write_events = 0;
+static int submit_events = 0;
+static uint32_t read_idx = 0;
+td_driver_t *ddqcow, *ddaio;
+td_vbd_t* qcow_vbd, *aio_vbd;
+static uint64_t prev = 0, written = 0;
+static char output[(100/PROGRESS_QUANT) + 5];
+
+extern tapdisk_server_t server;
+
+/*
+ * Bookkeeping for one BLOCK_PROCESSSZ chunk as it is read from the qcow
+ * image and then written to the destination.  Allocated in main() and
+ * released by send_write_responses() once the write has fully completed.
+ */
+struct request_info {
+  void* buf;             /* data buffer for the chunk; freed with the request */
+  uint64_t logical_sec;  /* starting sector, used for both the read and the write */
+  int pending;           /* sectors still outstanding for the current phase */
+};
+
+/*
+ * Debug helper: hex-dump @length bytes starting at @ptr through
+ * DFPRINTF.  Every byte is printed as exactly two hex digits, with a
+ * space after each pair of bytes and a newline after every 16 bytes, so
+ * the dump can be read back unambiguously.
+ */
+static void print_bytes(void *ptr, int length)
+{
+       const unsigned char *p = ptr;
+       int k;
+
+       DFPRINTF("Buf dump, length %d:\n",length);
+       for (k = 0; k < length; k++) {
+               /* %02x: an unpadded %x makes 0x0a,0xbc look like 0xab,0x0c */
+               DFPRINTF("%02x", p[k]);
+               /* group on the number of bytes printed so far, not the index */
+               if ((k + 1) % 16 == 0)
+                       DFPRINTF("\n");
+               else if ((k + 1) % 2 == 0)
+                       DFPRINTF(" ");
+       }
+       DFPRINTF("\n");
+}
+
+/*
+ * Advance the textual progress bar held in 'output'.
+ *
+ * @progress: amount of work completed so far
+ * @size:     total amount of work, in the same unit as @progress
+ *
+ * The bar has 100/PROGRESS_QUANT steps.  Each call advances it by at
+ * most one step, when @progress has passed the next step boundary, and
+ * reprints the bar together with a percentage.
+ */
+static void debug_output(uint64_t progress, uint64_t size)
+{
+       const uint64_t steps = 100/PROGRESS_QUANT;
+       uint64_t blocks = size/steps;
+
+       /* totals smaller than the step count would divide by zero below */
+       if (blocks == 0)
+               blocks = 1;
+
+       /*
+        * progress/blocks can exceed 'steps' when size is not a multiple
+        * of it; never advance past the end of the bar, otherwise the
+        * "=>" marker overwrites the closing ']' and the terminator and
+        * eventually runs off the end of 'output'.
+        */
+       if (prev < steps && progress/blocks > prev) {
+               memcpy(output+prev+1,"=>",2);
+               prev++;
+               DFPRINTF("\r%s     %"PRIu64"%%",
+                       output, (uint64_t)((prev-1)*PROGRESS_QUANT));
+       }
+       return;
+}
+
+/*
+ * Completion callback for writes to the destination image.
+ *
+ * A negative @err is reported and otherwise ignored.  On success the
+ * completed sectors are added to the global write counters and deducted
+ * from the owning request; once nothing is pending the request and its
+ * data buffer are released and the progress bar is refreshed.
+ */
+static void send_write_responses(td_request_t treq, int err)
+{
+       struct request_info *info = (struct request_info *)treq.cb_data;
+
+       if (err < 0) {
+               DFPRINTF("AIO FAILURE: res [%d]!\n",err);
+               return;
+       }
+
+       returned_write_events += treq.secs;
+       written               += treq.secs;
+
+       /* Fragments may complete separately; wait for the last one. */
+       info->pending -= treq.secs;
+       if (info->pending == 0) {
+               /* Whole request done: buffers are no longer needed. */
+               free(info->buf);
+               free(info);
+
+               debug_output(written, ddaio->info.size);
+       }
+}
+
+/*
+ * Completion callback for reads from the qcow image.
+ *
+ * A negative @err is reported and otherwise ignored.  On success the
+ * completed sectors are counted and deducted from the owning request.
+ * When the whole read has completed, the read's VBD request is
+ * completed on qcow_vbd and the same buffer is queued as a write of
+ * BLOCK_PROCESSSZ bytes at the same logical sector on the aio image,
+ * with send_write_responses() as its completion callback.
+ *
+ * Exits the process if the tracking structure for the write cannot be
+ * allocated.
+ */
+static void send_read_responses(td_request_t treq, int err)
+{
+       struct request_info* req;
+       td_vbd_request_t* vreq;
+
+       if (err < 0)  {
+               DFPRINTF("AIO FAILURE: res [%d]!\n",err);
+               return;
+       }
+       returned_read_events+=treq.secs;
+
+       req= (struct request_info*)treq.cb_data;
+
+       //do nothing until all fragments complete.
+       req->pending-=treq.secs;
+
+       if(req->pending)
+               return;
+
+       //This read is done.
+       tapdisk_vbd_complete_vbd_request(qcow_vbd, treq.private);
+
+
+       treq.op      = TD_OP_WRITE;
+       treq.buf     = req->buf;
+       treq.sec     = req->logical_sec;
+       treq.secs    = BLOCK_PROCESSSZ>>9;
+       treq.image   = tapdisk_vbd_first_image(aio_vbd);
+       treq.cb      = send_write_responses;
+       treq.id      = 0;
+       treq.sidx    = 0;
+
+       req->pending = BLOCK_PROCESSSZ>>9;
+       treq.cb_data = req;
+
+       vreq         = calloc(1, sizeof(td_vbd_request_t));
+       if (!vreq) {
+               /* cannot track the write; bail out like main() does on OOM */
+               DFPRINTF("Unable to alloc memory (%d)\n", ENOMEM);
+               exit(-1);
+       }
+       treq.private = vreq;
+
+       //Put it in the VBD's queue, so we don't lose
+       //track of it.
+       vreq->submitting = 1;
+       INIT_LIST_HEAD(&vreq->next);
+       tapdisk_vbd_move_request(treq.private,
+                                &aio_vbd->pending_requests);
+
+       ddaio->ops->td_queue_write(ddaio,treq);
+       --vreq->submitting;
+
+       tapdisk_submit_all_tiocbs(&server.aio_queue);
+
+       return;
+}
+
+/*
+ * qcow2raw entry point: copy the contents of a qcow image into a raw
+ * file or block device.
+ *
+ *   argv[1]  destination path.  Created and sized if it does not exist;
+ *            otherwise the user is asked for confirmation, a block
+ *            device is checked for capacity and a regular file is
+ *            truncated to the image size.
+ *   argv[2]  source qcow image, opened read-only.
+ *
+ * The copy loop issues one BLOCK_PROCESSSZ read at a time through the
+ * tapdisk driver interface; send_read_responses() turns each completed
+ * read into a write, and the loop waits on the scheduler until that
+ * write has been accounted for before issuing the next read.
+ *
+ * Returns 0 on success, a tapdisk error code if setup fails, and exits
+ * with -1 on file, space or allocation errors.
+ */
+int main(int argc, const char *argv[])
+{
+       int ret = -1, fd;
+       uint64_t size;
+       uint64_t i;
+       char *buf = NULL;
+       struct stat finfo;
+       td_request_t treq;
+       td_vbd_request_t* vreq;
+       struct request_info* req;
+       int err;
+
+       if (argc != 3) {
+               fprintf(stderr, "Qcow-utils: v1.0.0\n");
+               fprintf(stderr, "usage: %s <Dest File descriptor> "
+                       "<Qcow SRC IMAGE>\n",
+                      argv[0]);
+               exit(-1);
+       }
+
+       err = tapdisk_server_initialize();
+       if( err ) {
+               DPRINTF("qcow2raw Couldn't initialize server instance.\n");
+               return err;
+       }
+
+       err=tapdisk_vbd_initialize(QCOW_VBD);
+       if( err ) {
+               DPRINTF("qcow2raw Couldn't initialize qcow vbd.\n");
+               return err;
+       }
+
+       qcow_vbd = tapdisk_server_get_vbd(QCOW_VBD);
+       if (!qcow_vbd) {
+               err = -ENODEV;
+               DPRINTF("qcow2raw Couldn't create qcow vbd.\n");
+               return err;
+       }
+
+       err = tapdisk_vbd_open_vdi(qcow_vbd, argv[2], DISK_TYPE_QCOW,
+                                  TAPDISK_STORAGE_TYPE_DEFAULT,
+                                  TD_OPEN_RDONLY);
+       if( err ) {
+               DPRINTF("qcow2raw Couldn't open qcow file.\n");
+               return err;
+       }
+
+       ddqcow=(tapdisk_vbd_first_image(qcow_vbd))->driver;
+
+       /*Setup aio destination file*/
+       ret = stat(argv[1],&finfo);
+       if (ret == -1) {
+               /*Check errno*/
+               switch(errno) {
+               case ENOENT:
+                       /*File doesn't exist, create*/
+                       fd = open(argv[1],
+                                 O_RDWR | O_LARGEFILE | O_CREAT, 0644);
+                       if (fd < 0) {
+                               DFPRINTF("ERROR creating file [%s] "
+                                        "(errno %d)\n",
+                                      argv[1], 0 - errno);
+                               exit(-1);
+                       }
+                       if (ftruncate(fd, (off_t)ddqcow->info.size<<9) < 0) {
+                               DFPRINTF("Unable to create file "
+                                       "[%s] of size %"PRIu64" (errno %d). "
+                                        "Exiting...\n",
+                                       argv[1],
+                                       (uint64_t)ddqcow->info.size<<9,
+                                       0 - errno);
+                               close(fd);
+                               exit(-1);
+                       }
+                       close(fd);
+                       break;
+               case  ENXIO:
+                       DFPRINTF("ERROR Device [%s] does not exist\n",argv[1]);
+                       exit(-1);
+               default:
+                       DFPRINTF("An error occurred opening Device [%s] "
+                                "(errno %d)\n",
+                              argv[1], 0 - errno);
+                       exit(-1);
+               }
+       } else {
+               fprintf(stderr, "WARNING: All existing data in "
+                       "%s will be overwritten.\nDo you wish to continue? "
+                       "(y or n)  ",
+                       argv[1]);
+               if (getchar() != 'y') {
+                       DFPRINTF("Exiting...\n");
+                       exit(-1);
+               }
+
+               /*TODO - Test the existing file or device for adequate space*/
+               fd = open(argv[1], O_RDWR | O_LARGEFILE);
+               if (fd < 0) {
+                       DFPRINTF("ERROR: opening file [%s] (errno %d)\n",
+                              argv[1], 0 - errno);
+                       exit(-1);
+               }
+
+               if (S_ISBLK(finfo.st_mode)) {
+                       if (blk_getimagesize(fd, &size) != 0) {
+                               close(fd);
+                               return -1;
+                       }
+
+                       if (size < ddqcow->info.size<<9) {
+                               DFPRINTF("ERROR: Not enough space on device "
+                                       "%s (%"PRIu64" bytes available, "
+                                       "%"PRIu64" bytes required\n",
+                                       argv[1], size,
+                                       (uint64_t)ddqcow->info.size<<9);
+                               close(fd);
+                               exit(-1);
+                       }
+               } else {
+                       if (ftruncate(fd, (off_t)ddqcow->info.size<<9) < 0) {
+                               DFPRINTF("Unable to create file "
+                                       "[%s] of size %"PRIu64" (errno %d). "
+                                        "Exiting...\n",
+                                       argv[1],
+                                       (uint64_t)ddqcow->info.size<<9,
+                                        0 - errno);
+                               close(fd);
+                               exit(-1);
+                       } else DFPRINTF("File [%s] truncated to length %"PRIu64" "
+                                       "(%"PRIu64")\n",
+                                      argv[1],
+                                      (uint64_t)ddqcow->info.size<<9,
+                                      (uint64_t)ddqcow->info.size);
+               }
+               close(fd);
+       }
+
+       //Now the output file should be there, reopen it as an aio VBD
+       err=tapdisk_vbd_initialize(AIO_VBD);
+       if( err ) {
+               DPRINTF("qcow2raw Couldn't initialize aio vbd.\n");
+               return err;
+       }
+
+       aio_vbd = tapdisk_server_get_vbd(AIO_VBD);
+       if (!aio_vbd) {
+               err = -ENODEV;
+               DPRINTF("qcow2raw Couldn't create aio vbd.\n");
+               return err;
+       }
+
+       err = tapdisk_vbd_open_vdi(aio_vbd, argv[1], DISK_TYPE_AIO,
+                                  TAPDISK_STORAGE_TYPE_DEFAULT,
+                                  0);
+       if( err ) {
+               DPRINTF("qcow2raw Couldn't open aio file.\n");
+               return err;
+       }
+
+       ddaio=(tapdisk_vbd_first_image(aio_vbd))->driver;
+
+       /*Initialise the output string*/
+       memset(output,0x20,(100/PROGRESS_QUANT)+5);
+       output[0] = '[';
+       output[(100/PROGRESS_QUANT)+2] = ']';
+       output[(100/PROGRESS_QUANT)+3] = '\0';
+       DFPRINTF("%s",output);
+
+       i = 0;
+       while (running) {
+               if (!complete) {
+                       /*Read Pages from qcow image*/
+                       if ( (ret = posix_memalign((void **)&buf,
+                                                  BLOCK_PROCESSSZ,
+                                                  BLOCK_PROCESSSZ))
+                            != 0) {
+                               DFPRINTF("Unable to alloc memory (%d)\n",ret);
+                               exit(-1);
+                       }
+
+                       /*Attempt to read 4k sized blocks*/
+                       submit_events+=BLOCK_PROCESSSZ>>9;
+
+                       //Set up the read request
+                       treq.op      = TD_OP_READ;
+                       treq.buf     = buf;
+                       treq.sec     = i;
+                       treq.secs    = BLOCK_PROCESSSZ>>9;
+                       treq.image   = tapdisk_vbd_first_image(qcow_vbd);
+                       treq.cb      = send_read_responses;
+                       treq.id      = 0;
+                       treq.sidx    = 0;
+
+                       req = calloc(1, sizeof(struct request_info));
+                       if (!req) {
+                               /* was dereferenced unchecked */
+                               DFPRINTF("Unable to alloc memory (%d)\n",
+                                        ENOMEM);
+                               exit(-1);
+                       }
+                       req->buf = buf;
+                       req->logical_sec = i;
+                       req->pending = BLOCK_PROCESSSZ>>9;
+                       treq.cb_data = req;
+
+                       vreq         = calloc(1, sizeof(td_vbd_request_t));
+                       if (!vreq) {
+                               /* was dereferenced unchecked */
+                               DFPRINTF("Unable to alloc memory (%d)\n",
+                                        ENOMEM);
+                               exit(-1);
+                       }
+                       treq.private = vreq;
+
+                       //Put it in the VBD's queue, so we don't lose
+                       //track of it.
+                       vreq->submitting = 1;
+                       INIT_LIST_HEAD(&vreq->next);
+                       tapdisk_vbd_move_request(treq.private,
+                                                &qcow_vbd->pending_requests);
+
+                       ddqcow->ops->td_queue_read(ddqcow, treq);
+                       --vreq->submitting;
+
+                       i += BLOCK_PROCESSSZ>>9;
+
+                       if (i >= ddqcow->info.size)
+                               complete = 1;
+
+
+                       tapdisk_submit_all_tiocbs(&server.aio_queue);
+               }
+
+
+               /* Drain: wait until every submitted sector has been written */
+               while(returned_write_events != submit_events) {
+                       ret = scheduler_wait_for_events(&server.scheduler);
+                       if (ret < 0) {
+                               DFPRINTF("server wait returned %d\n", ret);
+                               sleep(2);
+                       }
+               }
+               if (complete && (returned_write_events == submit_events))
+                       running = 0;
+       }
+       memcpy(output+prev+1,"=",1);
+       DFPRINTF("\r%s     100%%\nTRANSFER COMPLETE\n\n", output);
+
+       ddqcow->ops->td_close(ddqcow);
+       ddaio->ops->td_close(ddaio);
+       free(ddqcow->data);
+       free(ddaio->data);
+
+       return 0;
+}
diff --git a/tools/blktap2/drivers/scheduler.c b/tools/blktap2/drivers/scheduler.c
new file mode 100644 (file)
index 0000000..6b8d009
--- /dev/null
@@ -0,0 +1,265 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "scheduler.h"
+#include "tapdisk-log.h"
+
+#define DBG(_f, _a...)               tlog_write(TLOG_DBG, _f, ##_a)
+
+#define SCHEDULER_MAX_TIMEOUT        600
+#define SCHEDULER_POLL_FD           (SCHEDULER_POLL_READ_FD |  \
+                                    SCHEDULER_POLL_WRITE_FD |  \
+                                    SCHEDULER_POLL_EXCEPT_FD)
+
+#define MIN(a, b)                   ((a) <= (b) ? (a) : (b))
+#define MAX(a, b)                   ((a) >= (b) ? (a) : (b))
+
+#define scheduler_for_each_event(s, event, tmp)        \
+       list_for_each_entry_safe(event, tmp, &(s)->events, next)
+
+/*
+ * One registered event: a file descriptor to poll, a periodic timeout,
+ * or both, together with the callback to invoke when it fires.
+ */
+typedef struct event {
+       char                         mode;      /* SCHEDULER_POLL_* bit mask */
+       event_id_t                   id;        /* id handed out at registration */
+
+       int                          fd;        /* polled when an fd bit is set in mode */
+       int                          timeout;   /* interval in seconds */
+       int                          deadline;  /* next expiry (tv_sec + timeout) */
+
+       event_cb_t                   cb;        /* handler */
+       void                        *private;   /* opaque argument passed to cb */
+
+       struct list_head             next;      /* link in scheduler_t.events */
+} event_t;
+
+/*
+ * Rebuild the fd sets, the highest fd and the timeout for the next
+ * select() call from the list of registered events.
+ *
+ * The timeout starts at SCHEDULER_MAX_TIMEOUT, is lowered to the
+ * nearest pending deadline (0 if one has already passed) and is finally
+ * capped by s->max_timeout.
+ */
+static void
+scheduler_prepare_events(scheduler_t *s)
+{
+       struct timeval tv;
+       event_t *ev, *nxt;
+       int remaining;
+
+       s->timeout = SCHEDULER_MAX_TIMEOUT;
+       s->max_fd  = 0;
+
+       FD_ZERO(&s->except_fds);
+       FD_ZERO(&s->write_fds);
+       FD_ZERO(&s->read_fds);
+
+       gettimeofday(&tv, NULL);
+
+       scheduler_for_each_event(s, ev, nxt) {
+               if (ev->mode & SCHEDULER_POLL_READ_FD)
+                       FD_SET(ev->fd, &s->read_fds);
+
+               if (ev->mode & SCHEDULER_POLL_WRITE_FD)
+                       FD_SET(ev->fd, &s->write_fds);
+
+               if (ev->mode & SCHEDULER_POLL_EXCEPT_FD)
+                       FD_SET(ev->fd, &s->except_fds);
+
+               /* any fd placed in a set contributes to the select() bound */
+               if ((ev->mode & SCHEDULER_POLL_FD) && ev->fd > s->max_fd)
+                       s->max_fd = ev->fd;
+
+               if (!(ev->mode & SCHEDULER_POLL_TIMEOUT))
+                       continue;
+
+               remaining = ev->deadline - tv.tv_sec;
+               if (remaining <= 0)
+                       s->timeout = 0;
+               else if (remaining < s->timeout)
+                       s->timeout = remaining;
+       }
+
+       if (s->max_timeout < s->timeout)
+               s->timeout = s->max_timeout;
+}
+
+/*
+ * Invoke the handler of @event, reporting @mode as the reason.
+ * Events carrying a timeout get their deadline pushed forward by one
+ * interval before the handler runs.
+ */
+static void
+scheduler_event_callback(event_t *event, char mode)
+{
+       struct timeval tv;
+
+       if (event->mode & SCHEDULER_POLL_TIMEOUT) {
+               gettimeofday(&tv, NULL);
+               event->deadline = tv.tv_sec + event->timeout;
+       }
+
+       (*event->cb)(event->id, mode, event->private);
+}
+
+/*
+ * Dispatch callbacks for every event that is ready after select().
+ *
+ * For each event at most one condition is dispatched per visit, checked
+ * in the order read, write, except, timeout.  The fd's bit is cleared
+ * from the corresponding set before the callback runs, so it is not
+ * dispatched again if the list walk has to be restarted.
+ */
+static void
+scheduler_run_events(scheduler_t *s)
+{
+       struct timeval now;
+       event_t *event, *tmp;
+
+       /* sampled once; all deadline checks in this pass use this value */
+       gettimeofday(&now, NULL);
+
+ again:
+       s->restart = 0;
+
+       scheduler_for_each_event(s, event, tmp) {
+               if ((event->mode & SCHEDULER_POLL_READ_FD) &&
+                   FD_ISSET(event->fd, &s->read_fds)) {
+                       FD_CLR(event->fd, &s->read_fds);
+                       scheduler_event_callback(event, SCHEDULER_POLL_READ_FD);
+                       goto next;
+               }
+
+               if ((event->mode & SCHEDULER_POLL_WRITE_FD) &&
+                   FD_ISSET(event->fd, &s->write_fds)) {
+                       FD_CLR(event->fd, &s->write_fds);
+                       scheduler_event_callback(event, SCHEDULER_POLL_WRITE_FD);
+                       goto next;
+               }
+
+               if ((event->mode & SCHEDULER_POLL_EXCEPT_FD) &&
+                   FD_ISSET(event->fd, &s->except_fds)) {
+                       FD_CLR(event->fd, &s->except_fds);
+                       scheduler_event_callback(event, SCHEDULER_POLL_EXCEPT_FD);
+                       goto next;
+               }
+
+               if ((event->mode & SCHEDULER_POLL_TIMEOUT) &&
+                   (event->deadline <= now.tv_sec))
+                   scheduler_event_callback(event, SCHEDULER_POLL_TIMEOUT);
+
+       next:
+               /*
+                * scheduler_unregister_event() sets s->restart after freeing
+                * an event; the saved 'tmp' may then be stale, so start the
+                * walk over from the head of the list.
+                */
+               if (s->restart)
+                       goto again;
+       }
+}
+
+/*
+ * Register a new event with scheduler @s.
+ *
+ * @mode:    SCHEDULER_POLL_* bit mask; at least one bit must be set
+ * @fd:      descriptor to poll when an fd bit is set in @mode
+ * @timeout: interval in seconds, used to compute the first deadline
+ * @cb:      handler, must not be NULL
+ * @private: opaque pointer handed back to @cb
+ *
+ * Returns the id of the new event, -EINVAL for a missing callback, an
+ * empty mode or an fd that cannot be represented in an fd_set, and
+ * -ENOMEM if the event cannot be allocated.
+ */
+int
+scheduler_register_event(scheduler_t *s, char mode, int fd,
+                        int timeout, event_cb_t cb, void *private)
+{
+       event_t *event;
+       struct timeval now;
+
+       if (!cb)
+               return -EINVAL;
+
+       if (!(mode & SCHEDULER_POLL_TIMEOUT) && !(mode & SCHEDULER_POLL_FD))
+               return -EINVAL;
+
+       /*
+        * The fd ends up in FD_SET()/FD_ISSET(), whose behaviour is
+        * undefined outside [0, FD_SETSIZE); refuse it here rather than
+        * corrupt the scheduler's fd sets later.
+        */
+       if ((mode & SCHEDULER_POLL_FD) && (fd < 0 || fd >= FD_SETSIZE))
+               return -EINVAL;
+
+       event = calloc(1, sizeof(event_t));
+       if (!event)
+               return -ENOMEM;
+
+       gettimeofday(&now, NULL);
+
+       INIT_LIST_HEAD(&event->next);
+
+       event->mode     = mode;
+       event->fd       = fd;
+       event->timeout  = timeout;
+       event->deadline = now.tv_sec + timeout;
+       event->cb       = cb;
+       event->private  = private;
+       event->id       = s->uuid++;
+
+       /* id 0 is ignored by scheduler_unregister_event(); skip it on wrap */
+       if (!s->uuid)
+               s->uuid++;
+
+       list_add_tail(&event->next, &s->events);
+
+       return event->id;
+}
+
+/*
+ * Remove the event with identifier @id from @s and free it.
+ * An id of 0 is ignored.  Removing an event raises s->restart so that a
+ * dispatch pass in progress restarts its list walk.
+ */
+void
+scheduler_unregister_event(scheduler_t *s, event_id_t id)
+{
+       event_t *cur, *nxt;
+
+       if (id == 0)
+               return;
+
+       scheduler_for_each_event(s, cur, nxt) {
+               if (cur->id != id)
+                       continue;
+
+               list_del(&cur->next);
+               free(cur);
+               s->restart = 1;
+               return;
+       }
+}
+
+/*
+ * Lower the cap applied to the next wait to @timeout seconds.
+ * Negative values are ignored, and the cap is never raised here.
+ */
+void
+scheduler_set_max_timeout(scheduler_t *s, int timeout)
+{
+       if (timeout < 0)
+               return;
+
+       if (timeout < s->max_timeout)
+               s->max_timeout = timeout;
+}
+
+/*
+ * Run one scheduler iteration: build the fd sets and timeout, block in
+ * select(), then dispatch the callbacks of the events that are ready.
+ *
+ * Returns the value returned by select(); when that is negative no
+ * callbacks are run.  The restart flag and both timeout fields are
+ * reset after every select() call.
+ */
+int
+scheduler_wait_for_events(scheduler_t *s)
+{
+       struct timeval wait;
+       int nready;
+
+       scheduler_prepare_events(s);
+
+       wait.tv_usec = 0;
+       wait.tv_sec  = s->timeout;
+
+       DBG("timeout: %d, max_timeout: %d\n",
+           s->timeout, s->max_timeout);
+
+       nready = select(s->max_fd + 1, &s->read_fds,
+                       &s->write_fds, &s->except_fds, &wait);
+
+       /* per-iteration state starts fresh for the next call */
+       s->max_timeout = SCHEDULER_MAX_TIMEOUT;
+       s->timeout     = SCHEDULER_MAX_TIMEOUT;
+       s->restart     = 0;
+
+       if (nready >= 0)
+               scheduler_run_events(s);
+
+       return nready;
+}
+
+/*
+ * Put @s into its initial state: everything zeroed, an empty event
+ * list, empty fd sets, and the id counter starting at 1.
+ */
+void
+scheduler_initialize(scheduler_t *s)
+{
+       memset(s, 0, sizeof(*s));
+
+       INIT_LIST_HEAD(&s->events);
+
+       FD_ZERO(&s->except_fds);
+       FD_ZERO(&s->write_fds);
+       FD_ZERO(&s->read_fds);
+
+       s->uuid = 1;
+}
diff --git a/tools/blktap2/drivers/scheduler.h b/tools/blktap2/drivers/scheduler.h
new file mode 100644 (file)
index 0000000..ea37e8f
--- /dev/null
@@ -0,0 +1,65 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _SCHEDULER_H_
+#define _SCHEDULER_H_
+
+#include <sys/select.h>
+
+#include "list.h"
+
+#define SCHEDULER_POLL_READ_FD       0x1
+#define SCHEDULER_POLL_WRITE_FD      0x2
+#define SCHEDULER_POLL_EXCEPT_FD     0x4
+#define SCHEDULER_POLL_TIMEOUT       0x8
+
+typedef int                          event_id_t;
+typedef void (*event_cb_t)          (event_id_t id, char mode, void *private);
+
+/* State of one select()-based event scheduler. */
+typedef struct scheduler {
+       /* fd sets handed to select() by scheduler_wait_for_events() */
+       fd_set                       read_fds;
+       fd_set                       write_fds;
+       fd_set                       except_fds;
+
+       /* list of registered events */
+       struct list_head             events;
+
+       /* id given to the next registered event; starts at 1 */
+       int                          uuid;
+       /* select() is called with max_fd + 1 as its nfds argument */
+       int                          max_fd;
+       /* seconds select() may block in the next iteration */
+       int                          timeout;
+       /* set when an event is unregistered; cleared after each select() */
+       int                          restart;
+       /* cap lowered via scheduler_set_max_timeout(); reset after each select() */
+       int                          max_timeout;
+} scheduler_t;
+
+void scheduler_initialize(scheduler_t *);
+event_id_t scheduler_register_event(scheduler_t *, char mode,
+                                   int fd, int timeout,
+                                   event_cb_t cb, void *private);
+void scheduler_unregister_event(scheduler_t *,  event_id_t);
+void scheduler_set_max_timeout(scheduler_t *, int);
+int scheduler_wait_for_events(scheduler_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-client.c b/tools/blktap2/drivers/tapdisk-client.c
new file mode 100644 (file)
index 0000000..c9bda85
--- /dev/null
@@ -0,0 +1,496 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* client harness for tapdisk log */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "log.h"
+
+#define BDPRINTF(_f, _a...) fprintf (stderr, "log: " _f "\n", ## _a)
+
+#define BWPRINTF(_f, _a...) fprintf (stderr, "log: " _f "\n", ## _a)
+
+/* client-side view of the tapdisk write log shared memory area */
+struct writelog {
+  /* shared memory object name and size, as reported by the SHMP command */
+  char* shmpath;
+  uint32_t shmsize;
+  /* mapping of the shared memory object (see writelog_map) */
+  void* shm;
+
+  /* next unprocessed item in the writelog */
+  void* cur;
+  /* requests placed on the ring whose responses have not been consumed */
+  unsigned int inflight;
+
+  /* pointer to start and end of free data space for requests */
+  void* dhd;
+  void* dtl;
+
+  /* shared ring inside shm and the front-end state initialised over it */
+  log_sring_t* sring;
+  log_front_ring_t fring;
+};
+
+/* number of free bytes on the data ring; one byte is always kept
+ * unused so that an empty ring can be told apart from a full one */
+static inline unsigned int dring_avail(struct writelog* wl)
+{
+  /* head behind tail: the free space is the single gap between them */
+  if (wl->dhd < wl->dtl)
+    return wl->dtl - wl->dhd - 1;
+
+  /* head equals tail: the ring is empty */
+  if (wl->dhd == wl->dtl)
+    return sdataend(wl->shm) - sdatastart(wl->shm) - 1;
+
+  /* head ahead of tail: free space wraps around the end of the area */
+  return (sdataend(wl->shm) - wl->dhd) + (wl->dtl - sdatastart(wl->shm)) - 1;
+}
+
+/* advance ring pointer by len bytes, wrapping at the end of the data area.
+ * sdataend() is one past the last data byte (dring_avail computes the
+ * area size as sdataend - sdatastart), so a pointer that lands exactly on
+ * it must wrap back to the start as well. */
+static inline void* dring_advance(struct writelog* wl, void* start, size_t len)
+{
+  void* next;
+  int dsz = sdataend(wl->shm) - sdatastart(wl->shm);
+
+  next = start + (len % dsz);
+  if (next >= sdataend(wl->shm))
+    next -= dsz;
+
+  return next;
+}
+
+/* print the command-line synopsis on stderr */
+static void usage(void)
+{
+  fputs("usage: tapdisk-client <sock>\n", stderr);
+}
+
+/* connect to the UNIX stream socket at sockpath.
+ * returns the connected socket file descriptor, or -1 on error
+ * (including a path too long to fit in sockaddr_un.sun_path) */
+static int tdctl_open(const char* sockpath)
+{
+  struct sockaddr_un saddr;
+  size_t pathlen = strlen(sockpath);
+  int fd;
+
+  /* the copy below must fit in sun_path and leave room for the NUL */
+  if (pathlen >= sizeof(saddr.sun_path)) {
+    BWPRINTF("socket path too long: %s", sockpath);
+    return -1;
+  }
+
+  if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+    BWPRINTF("error creating socket: %s", strerror(errno));
+    return -1;
+  }
+
+  /* zero-fill so the copied path is NUL-terminated */
+  memset(&saddr, 0, sizeof(saddr));
+  saddr.sun_family = AF_UNIX;
+  memcpy(saddr.sun_path, sockpath, pathlen);
+
+  if (connect(fd, (struct sockaddr *)&saddr, sizeof(saddr)) < 0) {
+    BWPRINTF("error connecting to socket %s: %s", sockpath, strerror(errno));
+    close(fd);
+    return -1;
+  }
+
+  return fd;
+}
+
+/* send one control message on fd and, if rsplen is non-zero, read exactly
+ * rsplen bytes of response into rsp.
+ * a short write or short read is treated as an error.
+ * returns 0 on success, -1 on error */
+static int ctl_talk(int fd, struct log_ctlmsg* msg, char* rsp, int rsplen)
+{
+  int rc;
+
+  if ((rc = write(fd, msg, sizeof(*msg))) < 0) {
+    BWPRINTF("error sending ctl request: %s", strerror(errno));
+    return -1;
+  } else if ((size_t)rc < sizeof(*msg)) {
+    /* rc is non-negative here, so the cast is safe; %zu matches size_t */
+    BWPRINTF("short ctl write (%d/%zu bytes)", rc, sizeof(*msg));
+    return -1;
+  }
+
+  /* no response expected for this command */
+  if (!rsplen)
+    return 0;
+
+  if ((rc = read(fd, rsp, rsplen)) < 0) {
+    BWPRINTF("error reading ctl response: %s", strerror(errno));
+    return -1;
+  } else if (rc < rsplen) {
+    BWPRINTF("short ctl read (%d/%d bytes)", rc, rsplen);
+    return -1;
+  }
+
+  return 0;
+}
+
+/* ask the server for its shared memory parameters and store them in wl.
+ * the response holds the size (sizeof(wl->shmsize) bytes) followed by the
+ * path; wl->shmpath is heap-allocated and released by writelog_free().
+ * returns 0 on success, -1 on error */
+static int ctl_get_shmem(int fd, struct writelog* wl)
+{
+  struct log_ctlmsg req;
+  /* one extra byte, zeroed below, so the path is always NUL-terminated */
+  char rsp[CTLRSPLEN_SHMP + 1];
+  int rc;
+
+  memset(&req, 0, sizeof(req));
+  memset(rsp, 0, sizeof(rsp));
+
+  memcpy(req.msg, LOGCMD_SHMP, 4);
+  if ((rc = ctl_talk(fd, &req, rsp, CTLRSPLEN_SHMP)) < 0) {
+    BWPRINTF("error getting shared memory parameters");
+    return -1;
+  }
+
+  memcpy(&wl->shmsize, rsp, sizeof(wl->shmsize));
+  wl->shmpath = strdup(rsp + sizeof(wl->shmsize));
+  if (!wl->shmpath) {
+    BWPRINTF("error copying shared memory path: %s", strerror(errno));
+    return -1;
+  }
+
+  BDPRINTF("shared memory parameters: size: %u, path: %s",
+          wl->shmsize, wl->shmpath);
+
+  return 0;
+}
+
+/* zero msg and copy the first 4 bytes of cmd into msg->msg */
+static void ctlmsg_init(struct log_ctlmsg* msg, const char* cmd)
+{
+  memset(msg, 0, sizeof(*msg));
+  memcpy(msg->msg, cmd, 4);
+}
+
+/* send LOGCMD_GET and read its fixed-size response (contents unused).
+ * returns 0 on success, -1 on error */
+static int ctl_get_writes(int fd)
+{
+  char reply[CTLRSPLEN_GET];
+  struct log_ctlmsg cmd;
+
+  ctlmsg_init(&cmd, LOGCMD_GET);
+
+  if (ctl_talk(fd, &cmd, reply, CTLRSPLEN_GET) >= 0)
+    return 0;
+
+  BWPRINTF("error getting writes");
+  return -1;
+}
+
+/* send LOGCMD_PEEK and read its fixed-size response (contents unused).
+ * returns 0 on success, -1 on error */
+static int ctl_peek_writes(int fd)
+{
+  char reply[CTLRSPLEN_PEEK];
+  struct log_ctlmsg cmd;
+
+  ctlmsg_init(&cmd, LOGCMD_PEEK);
+
+  if (ctl_talk(fd, &cmd, reply, CTLRSPLEN_PEEK) >= 0)
+    return 0;
+
+  BWPRINTF("error peeking writes");
+  return -1;
+}
+
+/* send LOGCMD_KICK to submit pending requests; no response is read.
+ * returns 0 on success, -1 on error */
+static int ctl_kick(int fd)
+{
+  struct log_ctlmsg cmd;
+
+  ctlmsg_init(&cmd, LOGCMD_KICK);
+
+  if (ctl_talk(fd, &cmd, NULL, 0) >= 0)
+    return 0;
+
+  BWPRINTF("error kicking ring");
+  return -1;
+}
+
+/* send LOGCMD_CLEAR and read its fixed-size response (contents unused).
+ * returns 0 on success, -1 on error */
+static int ctl_clear_writes(int fd)
+{
+  char reply[CTLRSPLEN_CLEAR];
+  struct log_ctlmsg cmd;
+
+  ctlmsg_init(&cmd, LOGCMD_CLEAR);
+
+  if (ctl_talk(fd, &cmd, reply, CTLRSPLEN_CLEAR) >= 0)
+    return 0;
+
+  BWPRINTF("error clearing writes");
+  return -1;
+}
+
+/* map the shared memory object named by wl->shmpath (wl->shmsize bytes)
+ * and initialise the cursor, data ring pointers and front ring over it.
+ * on failure wl->shm is left NULL and errno holds the failing call's error.
+ * returns 0 on success, -1 on error */
+static int writelog_map(struct writelog* wl)
+{
+  int fd;
+  int saved_errno;
+
+  if ((fd = shm_open(wl->shmpath, O_RDWR, 0750)) < 0) {
+    BWPRINTF("could not open shared memory at %s: %s", wl->shmpath,
+            strerror(errno));
+    return -1;
+  }
+
+  wl->shm = mmap(NULL, wl->shmsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+  /* close() may overwrite errno, so capture mmap's error first */
+  saved_errno = errno;
+  close(fd);
+  if (wl->shm == MAP_FAILED) {
+    /* MAP_FAILED is non-NULL; do not let writelog_free() munmap it */
+    wl->shm = NULL;
+    BWPRINTF("could not mmap write log shm: %s", strerror(saved_errno));
+    errno = saved_errno;
+    return -1;
+  }
+  wl->cur = wl->shm;
+  wl->inflight = 0;
+  wl->dhd = wl->dtl = sdatastart(wl->shm);
+
+  BDPRINTF("shm cookie: 0x%x, data size: %u", *((uint32_t*)wl->shm),
+          dring_avail(wl));
+
+  wl->sring = sringstart(wl->shm);
+  /* need some thought about what to do on reconnect */
+  FRONT_RING_INIT(&wl->fring, wl->sring, SRINGSIZE);
+
+  return 0;
+}
+
+/* print every dirty extent from the start of the shared area up to the
+ * first entry with a zero count or the end of the bitmap. always returns 0 */
+static int writelog_dump(struct writelog* wl)
+{
+  struct disk_range* extent = wl->shm;
+
+  while ((void*)extent < bmend(wl->shm) && extent->count) {
+    BDPRINTF("dirty extent: %"PRIu64":%u",
+            extent->sector, extent->count);
+    extent++;
+  }
+
+  return 0;
+}
+
+/* walk dirty map from wl->cur and enqueue read requests.
+ * wl->cur is advanced to the first entry that was not enqueued.
+ * returns:  0 when entire bitmap has been enqueued,
+ *           1 when the ring is full
+ *          -1 on error
+ */
+static int writelog_enqueue_requests(struct writelog* wl)
+{
+  struct disk_range* range;
+  log_request_t* req;
+
+  for (range = wl->cur; (void*)range < bmend(wl->shm); range++) {
+    /* a zero count marks the end of the dirty extents */
+    if (!range->count)
+      break;
+
+    if (RING_FULL(&wl->fring))
+      break;
+
+    /* insert range into request stream */
+    /* 1. get next request slot from ring */
+    /* 2. ensure enough shm space is available */
+
+    BDPRINTF("enqueueing dirty extent: %"PRIu64":%u (ring space: %d/%d)",
+            range->sector, range->count, RING_FREE_REQUESTS(&wl->fring),
+            RING_SIZE(&wl->fring));
+
+    req = RING_GET_REQUEST(&wl->fring, wl->fring.req_prod_pvt);
+
+    req->sector = range->sector;
+    req->count = range->count;
+    /* ... */
+    req->offset = 0;
+
+    wl->fring.req_prod_pvt++;
+    wl->inflight++;
+  }
+
+  wl->cur = range;
+
+  /* range may be one past the bitmap if the walk ran off its end; only
+   * look at the entry when it is still inside the bitmap */
+  if ((void*)range < bmend(wl->shm) && range->count)
+    return 1;
+
+  return 0;
+}
+
+/* consume every response between the front ring's consumer index and the
+ * shared ring's producer index, logging each one and decrementing
+ * wl->inflight. always returns 0 */
+static int writelog_dequeue_responses(struct writelog* wl)
+{
+  RING_IDX cons = wl->fring.rsp_cons;
+  RING_IDX prod = wl->sring->rsp_prod;
+  log_response_t rsp;
+
+  BDPRINTF("ring kicked (start = %u, end = %u)", cons, prod);
+
+  while (cons != prod) {
+    memcpy(&rsp, RING_GET_RESPONSE(&wl->fring, cons), sizeof(rsp));
+    BDPRINTF("ctl: read response %"PRIu64":%u", rsp.sector, rsp.count);
+    cons++;
+    wl->fring.rsp_cons = cons;
+    wl->inflight--;
+  }
+
+  return 0;
+}
+
+/* release the path string and unmap the shared memory held by wl, leaving
+ * both pointers NULL. always returns 0 */
+static int writelog_free(struct writelog* wl)
+{
+  /* free(NULL) is a no-op, so no guard is needed */
+  free(wl->shmpath);
+  wl->shmpath = NULL;
+
+  if (wl->shm != NULL) {
+    munmap(wl->shm, wl->shmsize);
+    wl->shm = NULL;
+  }
+
+  return 0;
+}
+
+/* issue a PEEK (peek != 0) or GET (peek == 0) command and, on success,
+ * rewind wl->cur to the start of the shared area.
+ * returns 0 on success, or the negative result of the failed command */
+int get_writes(struct writelog* wl, int fd, int peek)
+{
+  int rc = peek ? ctl_peek_writes(fd) : ctl_get_writes(fd);
+
+  if (rc >= 0) {
+    wl->cur = wl->shm;
+    rc = 0;
+  }
+
+  return rc;
+}
+
+/* block on the control socket until a message arrives, require it to be a
+ * complete KICK message, then drain the response ring.
+ * returns 0 on success, -1 on error */
+int await_responses(struct writelog* wl, int fd)
+{
+  struct log_ctlmsg msg;
+  int nread;
+
+  /* sit on socket waiting for kick */
+  nread = read(fd, &msg, sizeof(msg));
+  if (nread < 0) {
+    BWPRINTF("error reading from control socket: %s", strerror(errno));
+    return -1;
+  }
+  if (nread == 0) {
+    BWPRINTF("EOF on control socket");
+    return -1;
+  }
+  if (nread < sizeof(msg)) {
+    BWPRINTF("short reply (%d/%d bytes)", nread, (int) sizeof(msg));
+    return -1;
+  }
+
+  if (strncmp(msg.msg, LOGCMD_KICK, 4) != 0) {
+    BWPRINTF("Unknown message received: %.4s", msg.msg);
+    return -1;
+  }
+
+  return (writelog_dequeue_responses(wl) < 0) ? -1 : 0;
+}
+
+/* read_loop:
+ * 1. extract dirty bitmap (using the PEEK command) and dump it
+ * 2. feed as much as possible onto ring
+ * 3. kick
+ * 4. as responses come back, feed more of the dirty bitmap
+ *    into the ring
+ * 5. when entire bitmap has been queued, go to 1?
+ *
+ * returns 0 once the whole bitmap has been enqueued, -1 on error
+ */
+int read_loop(struct writelog* wl, int fd)
+{
+  int rc;
+
+  if (get_writes(wl, fd, 1) < 0)
+    return -1;
+  writelog_dump(wl);
+
+  do {
+    /* rc > 0 means the ring filled up before the bitmap was exhausted */
+    rc = writelog_enqueue_requests(wl);
+
+    /* publish requests only if the ring is not completely free */
+    if (RING_FREE_REQUESTS(&wl->fring) < RING_SIZE(&wl->fring))
+      RING_PUSH_REQUESTS(&wl->fring);
+    /* the kick is sent on every iteration, whether or not anything was pushed */
+    if (ctl_kick(fd) < 0)
+      return -1;
+
+    /* collect responses */
+    if (wl->inflight && await_responses(wl, fd) < 0)
+      return -1;
+  } while (rc > 0);
+
+  return rc;
+}
+
+/* usage: tapdisk-client <sock> [cmd]
+ * cmd is selected by the first character of the optional second argument:
+ *   p  peek at the write log and dump it (default)
+ *   g  get the write log and dump it
+ *   c  clear the write log
+ *   r  run the read loop
+ * exits with 0 on success, 1 on any error */
+int main(int argc, char* argv[])
+{
+  int fd;
+  struct writelog wl;
+  char cmd;
+
+  if (argc < 2) {
+    usage();
+    return 1;
+  }
+
+  if (argc < 3)
+    cmd = 'p';
+  else
+    cmd = argv[2][0];
+
+  /* start from a known state so no field is ever read uninitialised */
+  memset(&wl, 0, sizeof(wl));
+
+  fd = tdctl_open(argv[1]);
+  /* tdctl_open has already reported the reason */
+  if (fd < 0)
+    return 1;
+
+  if (ctl_get_shmem(fd, &wl) < 0)
+    return 1;
+
+  if (writelog_map(&wl) < 0) {
+    BWPRINTF("Error mapping write log: %s", strerror(errno));
+    return 1;
+  }
+
+  switch (cmd) {
+  case 'p':
+    if (get_writes(&wl, fd, 1) < 0)
+      return 1;
+    writelog_dump(&wl);
+    break;
+  case 'c':
+    if (ctl_clear_writes(fd) < 0)
+      return 1;
+    break;
+  case 'g':
+    if (get_writes(&wl, fd, 0) < 0)
+      return 1;
+    writelog_dump(&wl);
+    break;
+  case 'r':
+    if (read_loop(&wl, fd) < 0)
+      return 1;
+    break;
+  default:
+    usage();
+    return 1;
+  }
+
+  writelog_free(&wl);
+  close(fd);
+
+  return 0;
+}
diff --git a/tools/blktap2/drivers/tapdisk-control.c b/tools/blktap2/drivers/tapdisk-control.c
new file mode 100644 (file)
index 0000000..0b5cf3c
--- /dev/null
@@ -0,0 +1,837 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/un.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+
+#include "list.h"
+#include "tapdisk.h"
+#include "blktap2.h"
+#include "blktaplib.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-server.h"
+#include "tapdisk-message.h"
+#include "tapdisk-disktype.h"
+
+/* State of the control endpoint (single static instance, td_control). */
+struct tapdisk_control {
+       /* heap-allocated path; unlinked and freed by tapdisk_control_close() */
+       char              *path;
+       /* socket descriptor, -1 when not open */
+       int                socket;
+       /* initialised to -1 by tapdisk_control_initialize() */
+       int                event_id;
+};
+
+/* One control connection; released by tapdisk_control_close_connection(). */
+struct tapdisk_control_connection {
+       /* connection descriptor, closed when the connection is released */
+       int                socket;
+       /* event id passed to tapdisk_server_unregister_event() on release */
+       event_id_t         event_id;
+};
+
+static struct tapdisk_control td_control;
+
+/*
+ * Mark the control socket and its event as unset (-1) and ignore SIGPIPE.
+ */
+static void
+tapdisk_control_initialize(void)
+{
+       td_control.event_id = -1;
+       td_control.socket   = -1;
+
+       signal(SIGPIPE, SIG_IGN);
+}
+
+/*
+ * Tear down the control endpoint: unlink and free the path if one is set,
+ * and close the socket if one is open. Safe to call more than once, since
+ * both fields are reset after use.
+ */
+void
+tapdisk_control_close(void)
+{
+       if (td_control.path) {
+               unlink(td_control.path);
+               free(td_control.path);
+               td_control.path = NULL;
+       }
+
+       if (td_control.socket != -1) {
+               close(td_control.socket);
+               td_control.socket = -1;
+       }
+}
+
+/*
+ * Allocate a zeroed connection object for the descriptor @fd.
+ * Returns NULL (after logging) if the allocation fails; otherwise the
+ * caller releases it with tapdisk_control_close_connection().
+ */
+static struct tapdisk_control_connection *
+tapdisk_control_allocate_connection(int fd)
+{
+       struct tapdisk_control_connection *connection;
+
+       connection = calloc(1, sizeof(*connection));
+       if (!connection) {
+               EPRINTF("calloc");
+               return NULL;
+       }
+
+       connection->socket = fd;
+       return connection;
+}
+
+/*
+ * Release @connection: unregister its event, close its socket and free
+ * the object. @connection must not be used afterwards.
+ */
+static void
+tapdisk_control_close_connection(struct tapdisk_control_connection *connection)
+{
+       tapdisk_server_unregister_event(connection->event_id);
+       close(connection->socket);
+       free(connection);
+}
+
+/*
+ * Read exactly one tapdisk_message_t from @fd into @message.
+ *
+ * @timeout is in seconds; 0 means wait indefinitely. The same timeval is
+ * reused across select() calls without being reinitialised.
+ *
+ * Returns 0 on success, -EIO if the full message could not be read
+ * (select/read error, EOF, or timeout).
+ */
+static int
+tapdisk_control_read_message(int fd, tapdisk_message_t *message, int timeout)
+{
+       fd_set readfds;
+       int ret, len, offset;
+       struct timeval tv, *t;
+
+       t      = NULL;
+       offset = 0;
+       len    = sizeof(tapdisk_message_t);
+
+       if (timeout) {
+               tv.tv_sec  = timeout;
+               tv.tv_usec = 0;
+               t = &tv;
+       }
+
+       memset(message, 0, sizeof(tapdisk_message_t));
+
+       while (offset < len) {
+               FD_ZERO(&readfds);
+               FD_SET(fd, &readfds);
+
+               ret = select(fd + 1, &readfds, NULL, NULL, t);
+               if (ret == -1)
+                       break;
+               else if (FD_ISSET(fd, &readfds)) {
+                       /* offset counts bytes, so index the buffer as
+                        * bytes rather than as whole messages */
+                       ret = read(fd, (char *)message + offset,
+                                  len - offset);
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break;
+       }
+
+       if (offset != len) {
+               EPRINTF("failure reading message (wanted %d but got %d)\n",
+                       len, offset);
+               return -EIO;
+       }
+
+       DPRINTF("received '%s' message (uuid = %u)\n",
+               tapdisk_message_name(message->type), message->cookie);
+
+       return 0;
+}
+
+/*
+ * Write exactly one tapdisk_message_t from @message to @fd.
+ *
+ * @timeout is in seconds; 0 means wait indefinitely.
+ *
+ * Returns 0 on success, -EIO if the full message could not be written
+ * (select/write error or timeout).
+ */
+static int
+tapdisk_control_write_message(int fd, tapdisk_message_t *message, int timeout)
+{
+       fd_set writefds;
+       int ret, len, offset;
+       struct timeval tv, *t;
+
+       t      = NULL;
+       offset = 0;
+       len    = sizeof(tapdisk_message_t);
+
+       if (timeout) {
+               tv.tv_sec  = timeout;
+               tv.tv_usec = 0;
+               t = &tv;
+       }
+
+       DPRINTF("sending '%s' message (uuid = %u)\n",
+               tapdisk_message_name(message->type), message->cookie);
+
+       while (offset < len) {
+               FD_ZERO(&writefds);
+               FD_SET(fd, &writefds);
+
+               /* we don't bother reinitializing tv. at worst, it will wait a
+                * bit more time than expected. */
+
+               ret = select(fd + 1, NULL, &writefds, NULL, t);
+               if (ret == -1)
+                       break;
+               else if (FD_ISSET(fd, &writefds)) {
+                       /* offset counts bytes, so index the buffer as
+                        * bytes rather than as whole messages */
+                       ret = write(fd, (char *)message + offset,
+                                   len - offset);
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break;
+       }
+
+       if (offset != len) {
+               EPRINTF("failure writing message\n");
+               return -EIO;
+       }
+
+       return 0;
+}
+
+/*
+ * Check that the path carried by @request is NUL-terminated within
+ * TAPDISK_MESSAGE_MAX_PATH_LENGTH bytes.
+ * Returns 0 if so, or EINVAL (a positive value) otherwise.
+ */
+static int
+tapdisk_control_validate_request(tapdisk_message_t *request)
+{
+       size_t pathlen;
+
+       pathlen = strnlen(request->u.params.path,
+                         TAPDISK_MESSAGE_MAX_PATH_LENGTH);
+
+       return (pathlen >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) ? EINVAL : 0;
+}
+
+/*
+ * Reply to a LIST_MINORS request with the minor number of every VBD, then
+ * close the connection.
+ *
+ * Up to TAPDISK_MESSAGE_MAX_MINORS minors fit in one response. If there
+ * are more, the reply is a TAPDISK_MESSAGE_ERROR carrying ERANGE.
+ */
+static void
+tapdisk_control_list_minors(struct tapdisk_control_connection *connection,
+                           tapdisk_message_t *request)
+{
+       int i;
+       td_vbd_t *vbd;
+       struct list_head *head;
+       tapdisk_message_t response;
+
+       i = 0;
+       memset(&response, 0, sizeof(response));
+
+       response.type = TAPDISK_MESSAGE_LIST_MINORS_RSP;
+       response.cookie = request->cookie;
+
+       head = tapdisk_server_get_all_vbds();
+
+       list_for_each_entry(vbd, head, next) {
+               /* fail only when an entry does not fit, not when the
+                * list becomes exactly full */
+               if (i >= TAPDISK_MESSAGE_MAX_MINORS) {
+                       response.type = TAPDISK_MESSAGE_ERROR;
+                       response.u.response.error = ERANGE;
+                       break;
+               }
+               response.u.minors.list[i++] = vbd->minor;
+       }
+
+       /* NOTE(review): u looks like a union (confirm in tapdisk-message.h);
+        * if so, writing count in the error case would clobber the error */
+       if (response.type != TAPDISK_MESSAGE_ERROR)
+               response.u.minors.count = i;
+
+       tapdisk_control_write_message(connection->socket, &response, 2);
+       tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Reply to a LIST request with one LIST_RSP message per VBD, followed by
+ * a terminating message whose minor is -1, then close the connection.
+ *
+ * Each per-VBD message carries a count that starts at the total number of
+ * VBDs and decreases by one per message; the terminator carries whatever
+ * count remains after the walk.
+ */
+static void
+tapdisk_control_list(struct tapdisk_control_connection *connection,
+                    tapdisk_message_t *request)
+{
+       td_vbd_t *vbd;
+       struct list_head *head;
+       tapdisk_message_t response;
+       int count, i;
+
+       memset(&response, 0, sizeof(response));
+       response.type = TAPDISK_MESSAGE_LIST_RSP;
+       response.cookie = request->cookie;
+
+       head = tapdisk_server_get_all_vbds();
+
+       /* first pass: count the VBDs */
+       count = 0;
+       list_for_each_entry(vbd, head, next)
+               count++;
+
+       /* second pass: one message per VBD */
+       list_for_each_entry(vbd, head, next) {
+               response.u.list.count   = count--;
+               response.u.list.minor   = vbd->minor;
+               response.u.list.state   = vbd->state;
+               response.u.list.path[0] = 0;
+
+               /* path is "<type name>:<image name>" of the first image,
+                * or empty when the VBD has no images */
+               if (!list_empty(&vbd->images)) {
+                       td_image_t *image = list_entry(vbd->images.next,
+                                                      td_image_t, next);
+                       snprintf(response.u.list.path,
+                                sizeof(response.u.list.path),
+                                "%s:%s",
+                                tapdisk_disk_types[image->type]->name,
+                                image->name);
+               }
+
+               tapdisk_control_write_message(connection->socket, &response, 2);
+       }
+
+       /* terminator */
+       response.u.list.count   = count;
+       response.u.list.minor   = -1;
+       response.u.list.path[0] = 0;
+
+       tapdisk_control_write_message(connection->socket, &response, 2);
+       tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Reply to a PID request with this process's pid and close the connection.
+ */
+static void
+tapdisk_control_get_pid(struct tapdisk_control_connection *connection,
+                       tapdisk_message_t *request)
+{
+       tapdisk_message_t reply;
+
+       memset(&reply, 0, sizeof(reply));
+       reply.u.tapdisk_pid = getpid();
+       reply.cookie        = request->cookie;
+       reply.type          = TAPDISK_MESSAGE_PID_RSP;
+
+       tapdisk_control_write_message(connection->socket, &reply, 2);
+       tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Handle an ATTACH request: create a VBD for the minor given in the
+ * request cookie, attach it to its ring device and register it with the
+ * server. Always sends an ATTACH_RSP (error field is the positive errno,
+ * 0 on success) and closes the connection.
+ */
+static void
+tapdisk_control_attach_vbd(struct tapdisk_control_connection *connection,
+                          tapdisk_message_t *request)
+{
+       tapdisk_message_t response;
+       char *devname;
+       td_vbd_t *vbd;
+       struct blktap2_params params;
+       image_t image;
+       int minor, err;
+
+       /*
+        * TODO: check for max vbds per process
+        */
+
+       /* refuse to attach a minor that already has a VBD */
+       vbd = tapdisk_server_get_vbd(request->cookie);
+       if (vbd) {
+               err = -EEXIST;
+               goto out;
+       }
+
+       minor = request->cookie;
+       if (minor < 0) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       vbd = tapdisk_vbd_create(minor);
+       if (!vbd) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       /* device name is BLKTAP2_RING_DEVICE followed by the minor number */
+       err = asprintf(&devname, BLKTAP2_RING_DEVICE"%d", minor);
+       if (err == -1) {
+               err = -ENOMEM;
+               goto fail_vbd;
+       }
+
+       err = tapdisk_vbd_attach(vbd, devname, minor);
+       free(devname);
+       if (err)
+               goto fail_vbd;
+
+       tapdisk_server_add_vbd(vbd);
+
+out:
+       memset(&response, 0, sizeof(response));
+       response.type = TAPDISK_MESSAGE_ATTACH_RSP;
+       response.cookie = request->cookie;
+       response.u.response.error = -err;
+
+       tapdisk_control_write_message(connection->socket, &response, 2);
+       tapdisk_control_close_connection(connection);
+
+       return;
+
+fail_vbd:
+       /* NOTE(review): this path is also taken when asprintf fails, i.e.
+        * before tapdisk_vbd_attach was ever called - confirm detach is
+        * safe on a VBD that was never attached */
+       tapdisk_vbd_detach(vbd);
+       free(vbd);
+       goto out;
+}
+
+
+/*
+ * Handle a DETACH request: detach the VBD named by the request cookie
+ * and, if it has no images left, remove it from the server and free it.
+ * Always sends a DETACH_RSP (EINVAL if no such VBD) and closes the
+ * connection.
+ */
+static void
+tapdisk_control_detach_vbd(struct tapdisk_control_connection *connection,
+                          tapdisk_message_t *request)
+{
+       tapdisk_message_t reply;
+       td_vbd_t *vbd;
+       int err = -EINVAL;
+
+       vbd = tapdisk_server_get_vbd(request->cookie);
+       if (vbd) {
+               tapdisk_vbd_detach(vbd);
+
+               if (list_empty(&vbd->images)) {
+                       tapdisk_server_remove_vbd(vbd);
+                       free(vbd);
+               }
+
+               err = 0;
+       }
+
+       memset(&reply, 0, sizeof(reply));
+       reply.type = TAPDISK_MESSAGE_DETACH_RSP;
+       reply.cookie = request->cookie;
+       reply.u.response.error = -err;
+
+       tapdisk_control_write_message(connection->socket, &reply, 2);
+       tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Handle an OPEN request: open the image stack named by the request path
+ * on the VBD given by the request cookie and create the block device.
+ *
+ * Replies with OPEN_RSP carrying the image geometry on success, or with
+ * TAPDISK_MESSAGE_ERROR carrying the positive errno on failure, then
+ * closes the connection. On any failure after vbd->name has been set the
+ * name is released again, so that a later OPEN is not rejected with
+ * EALREADY.
+ */
+static void
+tapdisk_control_open_image(struct tapdisk_control_connection *connection,
+                          tapdisk_message_t *request)
+{
+       int err;
+       image_t image;
+       td_vbd_t *vbd;
+       td_flag_t flags;
+       tapdisk_message_t response;
+       struct blktap2_params params;
+
+       vbd = tapdisk_server_get_vbd(request->cookie);
+       if (!vbd) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       if (vbd->minor == -1) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       /* a non-NULL name means an image is already open on this VBD */
+       if (vbd->name) {
+               err = -EALREADY;
+               goto out;
+       }
+
+       /* translate message flags into open flags */
+       flags = 0;
+       if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_RDONLY)
+               flags |= TD_OPEN_RDONLY;
+       if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_SHARED)
+               flags |= TD_OPEN_SHAREABLE;
+       if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_ADD_CACHE)
+               flags |= TD_OPEN_ADD_CACHE;
+       if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_VHD_INDEX)
+               flags |= TD_OPEN_VHD_INDEX;
+       if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_LOG_DIRTY)
+               flags |= TD_OPEN_LOG_DIRTY;
+
+       vbd->name = strndup(request->u.params.path,
+                           sizeof(request->u.params.path));
+       if (!vbd->name) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       err = tapdisk_vbd_parse_stack(vbd, request->u.params.path);
+       if (err)
+               goto fail_name;
+
+       err = tapdisk_vbd_open_stack(vbd, request->u.params.storage, flags);
+       if (err)
+               goto fail_name;
+
+       err = tapdisk_vbd_get_image_info(vbd, &image);
+       if (err)
+               goto fail_close;
+
+       /* zero first: no stack garbage reaches the ioctl, and the name
+        * stays NUL-terminated even if strncpy truncates it */
+       memset(&params, 0, sizeof(params));
+       params.capacity = image.size;
+       params.sector_size = image.secsize;
+       strncpy(params.name, vbd->name, BLKTAP2_MAX_MESSAGE_LEN - 1);
+
+       /* an already existing device (EEXIST) is not treated as an error */
+       err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_CREATE_DEVICE, &params);
+       if (err && errno != EEXIST) {
+               err = -errno;
+               EPRINTF("create device failed: %d\n", err);
+               goto fail_close;
+       }
+
+       err = 0;
+
+out:
+       memset(&response, 0, sizeof(response));
+       response.cookie = request->cookie;
+
+       if (err) {
+               response.type                = TAPDISK_MESSAGE_ERROR;
+               response.u.response.error    = -err;
+       } else {
+               response.u.image.sectors     = image.size;
+               response.u.image.sector_size = image.secsize;
+               response.u.image.info        = image.info;
+               response.type                = TAPDISK_MESSAGE_OPEN_RSP;
+       }
+
+       tapdisk_control_write_message(connection->socket, &response, 2);
+       tapdisk_control_close_connection(connection);
+
+       return;
+
+fail_close:
+       tapdisk_vbd_close_vdi(vbd);
+fail_name:
+       free(vbd->name);
+       vbd->name = NULL;
+       goto out;
+}
+
+/*
+ * Handle a CLOSE request for the VBD given by the request cookie.
+ *
+ * Fails with EINVAL if there is no such VBD and with EAGAIN while the VBD
+ * still has pending requests. Otherwise the VDI is closed, the name is
+ * released and, if the VBD has no minor (-1), the VBD itself is removed
+ * and freed. Always sends a CLOSE_RSP and closes the connection.
+ */
+static void
+tapdisk_control_close_image(struct tapdisk_control_connection *connection,
+                           tapdisk_message_t *request)
+{
+       tapdisk_message_t reply;
+       td_vbd_t *vbd;
+       int err;
+
+       vbd = tapdisk_server_get_vbd(request->cookie);
+
+       if (!vbd) {
+               err = -EINVAL;
+       } else if (!list_empty(&vbd->pending_requests)) {
+               err = -EAGAIN;
+       } else {
+               tapdisk_vbd_close_vdi(vbd);
+
+               /* NB. vbd->name free should probably belong into close_vdi,
+                  but the current blktap1 reopen-stuff likely depends on a
+                  lifetime extended until shutdown. */
+               free(vbd->name);
+               vbd->name = NULL;
+
+               if (vbd->minor == -1) {
+                       tapdisk_server_remove_vbd(vbd);
+                       tapdisk_vbd_free(vbd);
+               }
+
+               err = 0;
+       }
+
+       memset(&reply, 0, sizeof(reply));
+       reply.type = TAPDISK_MESSAGE_CLOSE_RSP;
+       reply.cookie = request->cookie;
+       reply.u.response.error = -err;
+
+       tapdisk_control_write_message(connection->socket, &reply, 2);
+       tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Handle a PAUSE request for the VBD given by the request cookie.
+ *
+ * While tapdisk_vbd_pause() reports -EAGAIN the server loop is iterated
+ * and the pause retried. Always sends a PAUSE_RSP (EINVAL if no such VBD)
+ * and closes the connection.
+ */
+static void
+tapdisk_control_pause_vbd(struct tapdisk_control_connection *connection,
+                         tapdisk_message_t *request)
+{
+       tapdisk_message_t reply;
+       td_vbd_t *vbd;
+       int err;
+
+       memset(&reply, 0, sizeof(reply));
+       reply.type = TAPDISK_MESSAGE_PAUSE_RSP;
+
+       vbd = tapdisk_server_get_vbd(request->cookie);
+       if (!vbd) {
+               err = -EINVAL;
+       } else {
+               for (;;) {
+                       err = tapdisk_vbd_pause(vbd);
+                       if (err != -EAGAIN)
+                               break;
+
+                       tapdisk_server_iterate();
+               }
+       }
+
+       reply.cookie = request->cookie;
+       reply.u.response.error = -err;
+       tapdisk_control_write_message(connection->socket, &reply, 2);
+       tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Handle a RESUME request for the VBD named by request->cookie.
+ *
+ * The VBD must exist and be in the TD_VBD_PAUSED state.  When the request
+ * carries a non-empty path it replaces vbd->name; otherwise the existing
+ * name is reused (and must be set).  The name is then parsed into a driver
+ * stack and the VBD resumed.  A RESUME_RSP with the (positive) error code
+ * is always sent and the connection closed.
+ */
+static void
+tapdisk_control_resume_vbd(struct tapdisk_control_connection *connection,
+                          tapdisk_message_t *request)
+{
+       tapdisk_message_t response;
+       td_vbd_t *vbd;
+       int err;
+
+       memset(&response, 0, sizeof(response));
+       response.type = TAPDISK_MESSAGE_RESUME_RSP;
+
+       vbd = tapdisk_server_get_vbd(request->cookie);
+
+       /* unknown VBD and "not paused" are both reported as EINVAL */
+       if (!vbd || !td_flag_test(vbd->state, TD_VBD_PAUSED)) {
+               err = -EINVAL;
+               goto reply;
+       }
+
+       if (request->u.params.path[0] != '\0') {
+               /* caller supplied a new image path: swap it in */
+               free(vbd->name);
+               vbd->name = strndup(request->u.params.path,
+                                   sizeof(request->u.params.path));
+               if (vbd->name == NULL) {
+                       err = -ENOMEM;
+                       goto reply;
+               }
+       } else if (vbd->name == NULL) {
+               /* no new path and nothing remembered from before */
+               err = -EINVAL;
+               goto reply;
+       }
+
+       err = tapdisk_vbd_parse_stack(vbd, vbd->name);
+       if (!err)
+               err = tapdisk_vbd_resume(vbd, NULL, -1);
+
+reply:
+       response.cookie = request->cookie;
+       response.u.response.error = -err;
+       tapdisk_control_write_message(connection->socket, &response, 2);
+       tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Scheduler callback for an accepted control connection (passed in
+ * @private): read one message, validate it and dispatch it to the handler
+ * for its type.
+ *
+ * If the message cannot be read the connection is simply closed.  If
+ * validation fails, or the type has no handler, a TAPDISK_MESSAGE_ERROR
+ * response is sent (carrying the validation error, or EINVAL for an
+ * unhandled type) and the connection is closed here.
+ */
+static void
+tapdisk_control_handle_request(event_id_t id, char mode, void *private)
+{
+       int err;
+       tapdisk_message_t message;
+       tapdisk_message_t response;
+       struct tapdisk_control_connection *connection =
+               (struct tapdisk_control_connection *)private;
+
+       if (tapdisk_control_read_message(connection->socket, &message, 2)) {
+               EPRINTF("failed to read message from %d\n", connection->socket);
+               tapdisk_control_close_connection(connection);
+               return;
+       }
+
+       err = tapdisk_control_validate_request(&message);
+       if (err)
+               goto fail;
+
+       /*
+        * The handlers return void, so call them and return separately
+        * rather than writing "return handler(...)".
+        */
+       switch (message.type) {
+       case TAPDISK_MESSAGE_PID:
+               tapdisk_control_get_pid(connection, &message);
+               return;
+       case TAPDISK_MESSAGE_LIST_MINORS:
+               tapdisk_control_list_minors(connection, &message);
+               return;
+       case TAPDISK_MESSAGE_LIST:
+               tapdisk_control_list(connection, &message);
+               return;
+       case TAPDISK_MESSAGE_ATTACH:
+               tapdisk_control_attach_vbd(connection, &message);
+               return;
+       case TAPDISK_MESSAGE_DETACH:
+               tapdisk_control_detach_vbd(connection, &message);
+               return;
+       case TAPDISK_MESSAGE_OPEN:
+               tapdisk_control_open_image(connection, &message);
+               return;
+       case TAPDISK_MESSAGE_PAUSE:
+               tapdisk_control_pause_vbd(connection, &message);
+               return;
+       case TAPDISK_MESSAGE_RESUME:
+               tapdisk_control_resume_vbd(connection, &message);
+               return;
+       case TAPDISK_MESSAGE_CLOSE:
+               tapdisk_control_close_image(connection, &message);
+               return;
+       default:
+               /* err is 0 here; reported as EINVAL below */
+               break;
+       }
+
+fail:
+       EPRINTF("received unsupported message '%s'\n",
+               tapdisk_message_name(message.type));
+
+       memset(&response, 0, sizeof(response));
+
+       response.type = TAPDISK_MESSAGE_ERROR;
+       response.u.response.error = (err ? -err : EINVAL);
+       tapdisk_control_write_message(connection->socket, &response, 2);
+
+       tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Scheduler callback for the listening control socket: accept one pending
+ * connection, wrap it in a connection object and register a read event
+ * that feeds it to tapdisk_control_handle_request().
+ *
+ * Every failure is logged and leaves no descriptor or connection behind.
+ */
+static void
+tapdisk_control_accept(event_id_t id, char mode, void *private)
+{
+       int err, fd;
+       struct tapdisk_control_connection *connection;
+
+       fd = accept(td_control.socket, NULL, NULL);
+       if (fd == -1) {
+               EPRINTF("failed to accept new control connection: %d\n", errno);
+               return;
+       }
+
+       connection = tapdisk_control_allocate_connection(fd);
+       if (!connection) {
+               close(fd);
+               EPRINTF("failed to allocate new control connection\n");
+               /* must stop here: the code below dereferences connection */
+               return;
+       }
+
+       err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                           connection->socket, 0,
+                                           tapdisk_control_handle_request,
+                                           connection);
+       /* any negative value is a failure, as in the listening-socket setup */
+       if (err < 0) {
+               close(fd);
+               free(connection);
+               EPRINTF("failed to register new control event: %d\n", err);
+               /* connection has been freed; do not store the event id */
+               return;
+       }
+
+       connection->event_id = err;
+}
+
+/*
+ * Make sure directory @dir exists, creating each missing path component
+ * with mode 0755 (like "mkdir -p").
+ *
+ * Returns 0 if @dir is already readable and writable or once every
+ * component exists, -ENOMEM if the path cannot be duplicated, or -errno
+ * from the failing mkdir(2).
+ */
+static int
+tapdisk_control_mkdir(const char *dir)
+{
+       int err;
+       char *ptr, *name, *start;
+
+       err = access(dir, W_OK | R_OK);
+       if (!err)
+               return 0;
+
+       name = strdup(dir);
+       if (!name)
+               return -ENOMEM;
+
+       start = name;
+
+       for (;;) {
+               /*
+                * Find the end of the next component.  Skipping the first
+                * character keeps a leading '/' from yielding an empty
+                * name; when start already sits on the terminator (path
+                * ended in '/') there is nothing further to search.
+                */
+               ptr = *start ? strchr(start + 1, '/') : NULL;
+               if (ptr)
+                       *ptr = '\0';
+
+               err = mkdir(name, 0755);
+               if (err) {
+                       if (errno != EEXIST) {
+                               err = -errno;
+                               EPRINTF("failed to create directory %s: %d\n",
+                                         name, err);
+                               break;
+                       }
+                       /* component already exists: not an error */
+                       err = 0;
+               }
+
+               if (!ptr)
+                       break;
+
+               /* restore the separator and move on to the next component */
+               *ptr = '/';
+               start = ptr + 1;
+       }
+
+       free(name);
+       return err;
+}
+
+/*
+ * Create the per-process control socket: make sure BLKTAP2_CONTROL_DIR
+ * exists, build the socket path from the directory, BLKTAP2_CONTROL_SOCKET
+ * and our pid, remove any stale file at that path, then bind, listen and
+ * register tapdisk_control_accept() as the read handler.
+ *
+ * On success returns 0 and stores the path in *socket_path; the string
+ * stays owned by td_control.  On failure returns non-zero (the sign of the
+ * code is not uniform across the failure paths) after calling
+ * tapdisk_control_close().
+ */
+static int
+tapdisk_control_create_socket(char **socket_path)
+{
+       int err;
+       struct sockaddr_un saddr;
+
+       err = tapdisk_control_mkdir(BLKTAP2_CONTROL_DIR);
+       if (err) {
+               EPRINTF("failed to create directory %s: %d\n",
+                       BLKTAP2_CONTROL_DIR, err);
+               return err;
+       }
+
+       err = asprintf(&td_control.path, "%s/%s%d",
+                      BLKTAP2_CONTROL_DIR, BLKTAP2_CONTROL_SOCKET, getpid());
+       if (err == -1) {
+               td_control.path = NULL;
+               err = (errno ? errno : ENOMEM);
+               goto fail;
+       }
+
+       /*
+        * Refuse paths that do not fit in sun_path together with their
+        * terminator; truncating would bind to a different name than the
+        * one reported to the caller.
+        */
+       if (strlen(td_control.path) >= sizeof(saddr.sun_path)) {
+               err = ENAMETOOLONG;
+               EPRINTF("control socket path %s too long: %d\n",
+                       td_control.path, err);
+               goto fail;
+       }
+
+       if (unlink(td_control.path) && errno != ENOENT) {
+               err = errno;
+               EPRINTF("failed to unlink %s: %d\n", td_control.path, errno);
+               goto fail;
+       }
+
+       td_control.socket = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (td_control.socket == -1) {
+               err = errno;
+               EPRINTF("failed to create control socket: %d\n", err);
+               goto fail;
+       }
+
+       /* length checked above, so this copy includes the terminator */
+       memset(&saddr, 0, sizeof(saddr));
+       memcpy(saddr.sun_path, td_control.path, strlen(td_control.path) + 1);
+       saddr.sun_family = AF_UNIX;
+
+       err = bind(td_control.socket,
+                  (const struct sockaddr *)&saddr, sizeof(saddr));
+       if (err == -1) {
+               err = errno;
+               EPRINTF("failed to bind to %s: %d\n", saddr.sun_path, err);
+               goto fail;
+       }
+
+       err = listen(td_control.socket, 10);
+       if (err == -1) {
+               err = errno;
+               EPRINTF("failed to listen: %d\n", err);
+               goto fail;
+       }
+
+       err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                           td_control.socket, 0,
+                                           tapdisk_control_accept, NULL);
+       if (err < 0) {
+               EPRINTF("failed to add watch: %d\n", err);
+               goto fail;
+       }
+
+       /* a non-negative return value is the id of the registered event */
+       td_control.event_id = err;
+       *socket_path = td_control.path;
+
+       return 0;
+
+fail:
+       tapdisk_control_close();
+       return err;
+}
+
+/*
+ * Public entry point: initialize the control state and create the control
+ * socket.  Returns the result of tapdisk_control_create_socket(): 0 on
+ * success with the socket path stored in *path, non-zero on failure.
+ */
+int
+tapdisk_control_open(char **path)
+{
+       tapdisk_control_initialize();
+
+       return tapdisk_control_create_socket(path);
+}
diff --git a/tools/blktap2/drivers/tapdisk-control.h b/tools/blktap2/drivers/tapdisk-control.h
new file mode 100644 (file)
index 0000000..10c1811
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __TAPDISK_CONTROL_H__
+#define __TAPDISK_CONTROL_H__
+
+/*
+ * Initialize the control state and create the listening control socket.
+ * Returns 0 on success and stores the socket's filesystem path in *path
+ * (the string is not a copy; it remains owned by the control module).
+ * Returns non-zero on failure.
+ */
+int tapdisk_control_open(char **path);
+/*
+ * NOTE(review): presumably tears down what tapdisk_control_open() set up
+ * (it is what the socket-creation failure path calls) -- confirm against
+ * the definition in tapdisk-control.c.
+ */
+void tapdisk_control_close(void);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-diff.c b/tools/blktap2/drivers/tapdisk-diff.c
new file mode 100644 (file)
index 0000000..056d4c9
--- /dev/null
@@ -0,0 +1,802 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <libgen.h>    /* for basename(3) */
+#include <unistd.h>
+
+#include "list.h"
+#include "scheduler.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-server.h"
+#include "tapdisk-disktype.h"
+#include "tapdisk-utils.h"
+#include "libvhd.h"
+
+/* indices into tapdisk_stream_poll.pipe[]: read end and write end */
+#define POLL_READ                        0
+#define POLL_WRITE                       1
+
+/* shift converting between a sector number and a VHD block number */
+#define SPB_SHIFT (VHD_BLOCK_SHIFT - SECTOR_SHIFT)
+
+/*
+ * Self-pipe used to wake a stream's enqueue handler from the scheduler.
+ */
+struct tapdisk_stream_poll {
+       /* pipe descriptors, indexed by POLL_READ / POLL_WRITE; -1 when closed */
+       int                              pipe[2];
+       /* non-zero once a wakeup has been written and not yet cleared */
+       int                              set;
+};
+
+/*
+ * One read request issued on a stream.  Instances live in
+ * tapdisk_stream.requests[] and are linked onto exactly one of the
+ * stream's free, pending or completed lists via @next.
+ */
+struct tapdisk_stream_request {
+       /* first sector covered by the request */
+       uint64_t                         sec;
+       /* number of sectors covered */
+       uint32_t                         secs;
+       /* issue order; matching requests on both streams share a seqno */
+       uint64_t                         seqno;
+       /* the blkif request handed to the VBD */
+       blkif_request_t                  blkif_req;
+       /* list linkage */
+       struct list_head                 next;
+};
+
+/*
+ * State of one of the two image streams being compared.
+ */
+struct tapdisk_stream {
+       /* VBD the image is opened on */
+       td_vbd_t                        *vbd;
+
+       /* id used to create and look up the VBD (from tapdisk_stream_count) */
+       unsigned int                     id;
+
+       /* first error recorded for this stream; 0 if none */
+       int                              err;
+
+       /* next sector to issue, and the [start, end) sector range to cover */
+       uint64_t                         cur;
+       uint64_t                         start;
+       uint64_t                         end;
+
+       /* count of requests issued (source of seqno) and of requests compared */
+       uint64_t                         started;
+       uint64_t                         completed;
+
+       /* wakeup pipe and the scheduler event registered on its read end */
+       struct tapdisk_stream_poll       poll;
+       event_id_t                       enqueue_event_id;
+
+       /* request slots by state: unused, in flight, finished awaiting compare */
+       struct list_head                 free_list;
+       struct list_head                 pending_list;
+       struct list_head                 completed_list;
+
+       /* backing storage for the request slots */
+       struct tapdisk_stream_request    requests[MAX_REQUESTS];
+};
+
+/* number of streams opened so far; supplies each stream's id */
+static unsigned int tapdisk_stream_count;
+
+static void tapdisk_stream_close_image(struct tapdisk_stream *);
+
+/* basename of argv[0], for the usage message */
+static char *program;
+/* the two images being compared; stream1 drives, stream2 mirrors its requests */
+static struct tapdisk_stream stream1, stream2;
+/* VHD context of the first image; its BAT is used to skip unallocated blocks */
+static vhd_context_t vhd1;
+
+/*
+ * Print the command-line synopsis to @stream (stdout for -h, stderr for
+ * invalid invocations).
+ */
+static void
+usage(FILE *stream)
+{
+       fprintf(stream,
+               "usage: %s <-n type:/path/to/image> <-m type:/path/to/image>\n",
+               program);
+}
+
+/*
+ * Open the VHD at @path read-only into @vhd and load its block allocation
+ * table.  Returns 0 on success; otherwise prints a diagnostic, leaves the
+ * VHD closed and returns the error from libvhd.
+ */
+static int
+open_vhd(const char *path, vhd_context_t *vhd)
+{
+       int rc;
+
+       rc = vhd_open(vhd, path, VHD_OPEN_RDONLY);
+       if (rc != 0) {
+               printf("error opening %s: %d\n", path, rc);
+               return rc;
+       }
+
+       rc = vhd_get_bat(vhd);
+       if (rc == 0)
+               return 0;
+
+       /* opened but unusable without its BAT: report and close again */
+       printf("error reading BAT for %s: %d\n", path, rc);
+       vhd_close(vhd);
+       return rc;
+}
+
+/*
+ * Reset @p to the "closed, nothing signalled" state: both pipe ends -1
+ * and the set flag cleared.  Does not close any descriptor.
+ */
+static inline void
+tapdisk_stream_poll_initialize(struct tapdisk_stream_poll *p)
+{
+       p->pipe[POLL_WRITE] = -1;
+       p->pipe[POLL_READ]  = -1;
+       p->set              = 0;
+}
+
+/*
+ * Create the wakeup pipe for @p and set O_NONBLOCK on both ends.
+ *
+ * Returns 0 on success or -errno of the failing pipe(2)/fcntl(2) call.
+ * On failure no descriptor is left open and @p is back in its
+ * initialized state.
+ */
+static int
+tapdisk_stream_poll_open(struct tapdisk_stream_poll *p)
+{
+       int err, saved_errno;
+
+       tapdisk_stream_poll_initialize(p);
+
+       err = pipe(p->pipe);
+       if (err)
+               return -errno;
+
+       err = fcntl(p->pipe[POLL_READ], F_SETFL, O_NONBLOCK);
+       if (err)
+               goto out;
+
+       err = fcntl(p->pipe[POLL_WRITE], F_SETFL, O_NONBLOCK);
+       if (err)
+               goto out;
+
+       return 0;
+
+out:
+       /* capture errno first: close() below may overwrite it */
+       saved_errno = errno;
+       close(p->pipe[POLL_READ]);
+       close(p->pipe[POLL_WRITE]);
+       tapdisk_stream_poll_initialize(p);
+       return -saved_errno;
+}
+
+/*
+ * Close whichever ends of @p's pipe are open (read end first) and reset
+ * @p to its initialized state.
+ */
+static void
+tapdisk_stream_poll_close(struct tapdisk_stream_poll *p)
+{
+       int end;
+
+       for (end = POLL_READ; end <= POLL_WRITE; end++) {
+               if (p->pipe[end] == -1)
+                       continue;
+               close(p->pipe[end]);
+       }
+
+       tapdisk_stream_poll_initialize(p);
+}
+
+/*
+ * Consume one int-sized wakeup token from the read end of @p's pipe and
+ * mark @p as no longer signalled.  The result of the read is not checked.
+ */
+static inline void
+tapdisk_stream_poll_clear(struct tapdisk_stream_poll *p)
+{
+       int token;
+       int rfd = p->pipe[POLL_READ];
+
+       read_exact(rfd, &token, sizeof(token));
+       p->set = 0;
+}
+
+/*
+ * Signal @p by writing one int-sized token to its pipe, unless it is
+ * already marked as signalled.  The result of the write is not checked.
+ */
+static inline void
+tapdisk_stream_poll_set(struct tapdisk_stream_poll *p)
+{
+       int token = 0;
+
+       /* at most one outstanding token per poll object */
+       if (p->set)
+               return;
+
+       write_exact(p->pipe[POLL_WRITE], &token, sizeof(token));
+       p->set = 1;
+}
+
+/*
+ * Return 1 when stream @s has nothing left to do: it has either reached
+ * its end sector or recorded an error, and both its pending and completed
+ * lists are empty.  Return 0 otherwise.
+ */
+static inline int
+tapdisk_stream_stop(struct tapdisk_stream *s)
+{
+       /* still has sectors to issue and no error to cut it short */
+       if (s->cur != s->end && !s->err)
+               return 0;
+
+       /* requests still in flight */
+       if (!list_empty(&s->pending_list))
+               return 0;
+
+       /* finished requests still waiting to be compared? */
+       return list_empty(&s->completed_list) ? 1 : 0;
+}
+
+/*
+ * Zero request slot @req and make its list linkage an empty, self-linked
+ * node.
+ */
+static inline void
+tapdisk_stream_initialize_request(struct tapdisk_stream_request *req)
+{
+       memset(req, 0, sizeof(struct tapdisk_stream_request));
+       INIT_LIST_HEAD(&req->next);
+}
+
+/*
+ * Return the index of @req within @s->requests[].  @req must point into
+ * that array.
+ */
+static inline int
+tapdisk_stream_request_idx(struct tapdisk_stream *s,
+                          struct tapdisk_stream_request *req)
+{
+       struct tapdisk_stream_request *base = s->requests;
+
+       return (req - base);
+}
+
+/*
+ * Take the first slot off @s's free list, reinitialize it and return it.
+ * Returns NULL when no free slot is available.
+ */
+static inline struct tapdisk_stream_request *
+tapdisk_stream_get_request(struct tapdisk_stream *s)
+{
+       struct list_head *pool = &s->free_list;
+       struct tapdisk_stream_request *slot = NULL;
+
+       if (!list_empty(pool)) {
+               slot = list_entry(pool->next,
+                                 struct tapdisk_stream_request, next);
+
+               /* unlink first, then wipe whatever the last user left */
+               list_del_init(&slot->next);
+               tapdisk_stream_initialize_request(slot);
+       }
+
+       return slot;
+}
+
+/*
+ * Insert @sreq into @s's completed list, keeping the list sorted by
+ * ascending seqno: it goes in front of the first entry with a larger
+ * seqno, or at the tail if there is none.
+ */
+static inline void
+tapdisk_stream_queue_completed(struct tapdisk_stream *s,
+                              struct tapdisk_stream_request *sreq)
+{
+       struct tapdisk_stream_request *cursor;
+       /* default insertion point: before the list head, i.e. at the tail */
+       struct list_head *before = &s->completed_list;
+
+       list_for_each_entry(cursor, &s->completed_list, next) {
+               if (sreq->seqno < cursor->seqno) {
+                       before = &cursor->next;
+                       break;
+               }
+       }
+
+       list_add_tail(&sreq->next, before);
+}
+
+/*
+ * Compare the data read for @sreq1 (a stream1 request) with that for
+ * @sreq2 (a stream2 request).  Both must carry the same seqno and sector
+ * count.  Each request's buffer is located in its stream's ring area from
+ * the request's slot index.  Returns the memcmp() result over the
+ * request's byte length: 0 when the data is identical.
+ */
+static int
+tapdisk_result_compare(struct tapdisk_stream_request *sreq1,
+               struct tapdisk_stream_request  *sreq2)
+{
+       unsigned long slot1, slot2;
+       char *data1, *data2;
+
+       assert(sreq1->seqno == sreq2->seqno);
+       assert(sreq1->secs == sreq2->secs);
+
+       slot1 = (unsigned long)tapdisk_stream_request_idx(&stream1, sreq1);
+       slot2 = (unsigned long)tapdisk_stream_request_idx(&stream2, sreq2);
+
+       data1 = (char *)MMAP_VADDR(stream1.vbd->ring.vstart, slot1, 0);
+       data2 = (char *)MMAP_VADDR(stream2.vbd->ring.vstart, slot2, 0);
+
+       return memcmp(data1, data2, sreq1->secs << SECTOR_SHIFT);
+}
+
+/*
+ * Walk the completed lists of stream1 and stream2 in lock-step and compare
+ * every pair of requests that share a seqno.  Compared requests are moved
+ * back to their stream's free list and both streams' completed counters
+ * are incremented.  Requests without a counterpart on the other list yet
+ * are stepped over and left in place.
+ *
+ * Returns 0 if every compared pair matched, otherwise the non-zero result
+ * of the first mismatching comparison (the walk stops there).
+ */
+static int
+tapdisk_stream_process_data(void)
+{
+       struct tapdisk_stream_request *sreq1, *sreq2, *tmp1, *tmp2;
+       int advance_both;
+       int result = 0;
+
+       sreq1 = list_entry(stream1.completed_list.next,
+                       struct tapdisk_stream_request, next);
+       sreq2 = list_entry(stream2.completed_list.next,
+                       struct tapdisk_stream_request, next);
+       /* remember each successor up front so the current entry can be unlinked */
+       tmp1 = list_entry(sreq1->next.next,
+                       struct tapdisk_stream_request, next);
+       tmp2 = list_entry(sreq2->next.next,
+                       struct tapdisk_stream_request, next);
+       /* stop on the first mismatch or when either list has been exhausted */
+       while (result == 0 &&
+                       &sreq1->next != &stream1.completed_list &&
+                       &sreq2->next != &stream2.completed_list) {
+               //printf("checking: %llu|%llu\n", sreq1->seqno, sreq2->seqno);
+               advance_both = 1;
+               /* stream1 entry is older than stream2's: step stream1 only */
+               if (sreq1->seqno < sreq2->seqno) {
+                       advance_both = 0;
+                       goto advance1;
+               }
+               /* stream2 entry is older than stream1's: step stream2 only */
+               if (sreq1->seqno > sreq2->seqno)
+                       goto advance2;
+
+               /* same seqno on both sides: compare the data that was read */
+               result = tapdisk_result_compare(sreq1, sreq2);
+
+               stream1.completed++;
+               stream2.completed++;
+               
+               /* recycle both request slots */
+               list_del_init(&sreq1->next);
+               list_add_tail(&sreq1->next, &stream1.free_list);
+               list_del_init(&sreq2->next);
+               list_add_tail(&sreq2->next, &stream2.free_list);
+
+advance1:
+               sreq1 = tmp1;
+               tmp1 = list_entry(tmp1->next.next, 
+                               struct tapdisk_stream_request, next);
+               if (!advance_both)
+                       continue;
+advance2:
+               sreq2 = tmp2;
+               tmp2 = list_entry(tmp2->next.next, 
+                               struct tapdisk_stream_request, next);
+       }
+
+       return result;
+}
+
+/*
+ * VBD completion callback for the stream passed in @arg.  The finished
+ * request is identified by rsp->id, which is its slot index.
+ *
+ * A successful request moves to the stream's completed list; a failed one
+ * is returned to the free list and records EIO on the stream.  Then any
+ * comparable completed pairs are processed (a mismatch records EINVAL on
+ * both streams) and both streams' enqueue handlers are woken.
+ */
+static void
+tapdisk_stream_dequeue(void *arg, blkif_response_t *rsp)
+{
+       struct tapdisk_stream *s = arg;
+       struct tapdisk_stream_request *sreq = &s->requests[rsp->id];
+
+       /* off the pending list, whatever the outcome */
+       list_del_init(&sreq->next);
+
+       if (rsp->status != BLKIF_RSP_OKAY) {
+               s->err = EIO;
+               list_add_tail(&sreq->next, &s->free_list);
+               fprintf(stderr, "error reading sector 0x%"PRIx64"\n", sreq->sec);
+       } else {
+               tapdisk_stream_queue_completed(s, sreq);
+       }
+
+       if (tapdisk_stream_process_data() != 0) {
+               fprintf(stderr, "mismatch at sector 0x%"PRIx64"\n",
+                               sreq->sec);
+               stream1.err = EINVAL;
+               stream2.err = EINVAL;
+       }
+
+       tapdisk_stream_poll_set(&stream1.poll);
+       tapdisk_stream_poll_set(&stream2.poll);
+}
+
+/*
+ * Queue on stream @s a read covering the same sectors and segments as
+ * request @r, reusing @r's seqno so the two results can be paired later.
+ * The request is placed on the VBD's new_requests list (it is not issued
+ * here) and on @s's pending list, and @s->cur advances past it.
+ *
+ * Returns 0 once queued, or 1 when @s has no free request slot.
+ */
+static inline int
+tapdisk_stream_enqueue_copy(struct tapdisk_stream *s,
+               struct tapdisk_stream_request *r)
+{
+       struct tapdisk_stream_request *copy;
+       td_vbd_request_t *slot;
+       blkif_request_t *out;
+       td_vbd_t *vbd;
+       int idx, seg;
+
+       /* note: the VBD is always stream2's, not taken from @s */
+       vbd = stream2.vbd;
+
+       copy = tapdisk_stream_get_request(s);
+       if (copy == NULL)
+               return 1;
+
+       idx = tapdisk_stream_request_idx(s, copy);
+
+       copy->sec   = r->sec;
+       copy->secs  = r->secs;
+       copy->seqno = r->seqno;
+
+       out                = &copy->blkif_req;
+       out->id            = idx;
+       out->nr_segments   = r->blkif_req.nr_segments;
+       out->sector_number = r->blkif_req.sector_number;
+       out->operation     = BLKIF_OP_READ;
+
+       /* replicate the sector span of every segment */
+       for (seg = 0; seg < r->blkif_req.nr_segments; seg++) {
+               out->seg[seg].first_sect = r->blkif_req.seg[seg].first_sect;
+               out->seg[seg].last_sect  = r->blkif_req.seg[seg].last_sect;
+       }
+
+       s->cur += copy->secs;
+
+       /* the VBD request slot with the same index must be idle */
+       slot = &vbd->request_list[idx];
+       assert(list_empty(&slot->next));
+       assert(slot->secs_pending == 0);
+
+       memcpy(&slot->req, out, sizeof(*out));
+       vbd->received++;
+       slot->vbd = vbd;
+
+       tapdisk_vbd_move_request(slot, &vbd->new_requests);
+       list_add_tail(&copy->next, &s->pending_list);
+
+       return 0;
+}
+
+/*
+ * Issue read requests on stream1 for as long as sectors remain, no error
+ * has been recorded and free request slots are available.
+ *
+ * Blocks marked DD_BLK_UNUSED in vhd1's BAT are skipped.  Each request is
+ * built from up to BLKIF_MAX_SEGMENTS_PER_REQUEST segments of at most one
+ * page each and never crosses a block boundary or the end of the image.
+ * Requests are numbered from s->started, queued on the VBD and on the
+ * stream's pending list, and finally issued in one batch.
+ */
+static void
+tapdisk_stream_enqueue1(void)
+{
+       td_vbd_t *vbd;
+       int i, idx, psize, blk;
+       struct tapdisk_stream *s = &stream1;
+
+       vbd = s->vbd;
+       psize = getpagesize();
+
+       while (s->cur < s->end && !s->err) {
+               blkif_request_t *breq;
+               td_vbd_request_t *vreq;
+               struct tapdisk_stream_request *sreq;
+
+               /* skip any blocks that are not present in this image */
+               blk = s->cur >> SPB_SHIFT;
+               while (s->cur < s->end && vhd1.bat.bat[blk] == DD_BLK_UNUSED) {
+                       //printf("skipping block %d\n", blk);
+                       blk++;
+                       /*
+                        * Widen before shifting: the sector number of a
+                        * high block does not fit in an int.
+                        */
+                       s->cur = (uint64_t)blk << SPB_SHIFT;
+               }
+
+               if (s->cur >= s->end)
+                       break;
+
+               sreq = tapdisk_stream_get_request(s);
+               if (!sreq)
+                       break;
+
+               idx                 = tapdisk_stream_request_idx(s, sreq);
+
+               sreq->sec           = s->cur;
+               sreq->secs          = 0;
+               sreq->seqno         = s->started++;
+
+               breq                = &sreq->blkif_req;
+               breq->id            = idx;
+               breq->nr_segments   = 0;
+               breq->sector_number = sreq->sec;
+               breq->operation     = BLKIF_OP_READ;
+
+               for (i = 0; i < BLKIF_MAX_SEGMENTS_PER_REQUEST; i++) {
+                       uint32_t secs;
+                       struct blkif_request_segment *seg = breq->seg + i;
+
+                       /* one page at most, clipped to the image end ... */
+                       secs = MIN(s->end - s->cur, psize >> SECTOR_SHIFT);
+                       /* ... and to the end of the current block */
+                       secs = MIN(((uint64_t)(blk + 1) << SPB_SHIFT) - s->cur,
+                                  secs);
+                       if (!secs)
+                               break;
+
+                       sreq->secs += secs;
+                       s->cur     += secs;
+
+                       seg->first_sect = 0;
+                       seg->last_sect  = secs - 1;
+                       breq->nr_segments++;
+               }
+
+               /* the VBD request slot with the same index must be idle */
+               vreq = vbd->request_list + idx;
+
+               assert(list_empty(&vreq->next));
+               assert(vreq->secs_pending == 0);
+
+               memcpy(&vreq->req, breq, sizeof(*breq));
+               vbd->received++;
+               vreq->vbd = vbd;
+
+               tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+               list_add_tail(&sreq->next, &s->pending_list);
+       }
+
+       tapdisk_vbd_issue_requests(vbd);
+}
+
+/*
+ * Mirror onto stream2 the requests already issued on stream1: first those
+ * on stream1's completed list, then those on its pending list.  Requests
+ * starting below stream2's cursor are skipped.  If stream2 runs out of
+ * free slots the walk stops early; otherwise stream2's cursor is brought
+ * level with stream1's.  Whatever was queued is then issued.
+ */
+static void
+tapdisk_stream_enqueue2(void)
+{
+       td_vbd_t *vbd;
+       struct tapdisk_stream_request *itr;
+       struct tapdisk_stream *s = &stream2;
+
+       vbd = s->vbd;
+
+       /* issue the same requests that we issued on stream1 */
+       list_for_each_entry(itr, &stream1.completed_list, next) {
+               if (itr->sec < s->cur)
+                       continue;
+               if (tapdisk_stream_enqueue_copy(s, itr))
+                       goto done;
+       }
+
+       list_for_each_entry(itr, &stream1.pending_list, next) {
+               if (itr->sec < s->cur)
+                       continue;
+               if (tapdisk_stream_enqueue_copy(s, itr))
+                       goto done;
+       }
+
+       /* every outstanding stream1 request has now been mirrored */
+       stream2.cur = stream1.cur;
+
+done:
+       tapdisk_vbd_issue_requests(vbd);
+}
+
+/*
+ * Return 1 when both streams have nothing left to do, 0 otherwise.
+ * stream2 is only consulted if stream1 is already finished.
+ */
+static inline int
+tapdisk_diff_done(void)
+{
+       if (!tapdisk_stream_stop(&stream1))
+               return 0;
+
+       return tapdisk_stream_stop(&stream2) ? 1 : 0;
+}
+
+/*
+ * Close the images of both streams, stream1 first.
+ */
+static void
+tapdisk_diff_stop(void)
+{
+       struct tapdisk_stream *both[2] = { &stream1, &stream2 };
+       int n;
+
+       for (n = 0; n < 2; n++)
+               tapdisk_stream_close_image(both[n]);
+}
+
+/*
+ * Scheduler callback on a stream's wakeup pipe; @arg is the stream.
+ * Consumes the wakeup, then either shuts the comparison down (when both
+ * streams are finished) or lets the stream queue more requests.  Because
+ * queueing can itself finish the run (e.g. stream1 has no blocks at all),
+ * completion is checked again afterwards.
+ */
+static void
+tapdisk_stream_enqueue(event_id_t id, char mode, void *arg)
+{
+       struct tapdisk_stream *s = arg;
+
+       tapdisk_stream_poll_clear(&s->poll);
+
+       if (!tapdisk_diff_done()) {
+               if (s == &stream1)
+                       tapdisk_stream_enqueue1();
+               else if (s == &stream2)
+                       tapdisk_stream_enqueue2();
+               else
+                       assert(0);
+
+               /* still work outstanding: wait for the next wakeup */
+               if (!tapdisk_diff_done())
+                       return;
+       }
+
+       tapdisk_diff_stop();
+}
+
+/*
+ * Give stream @s a fresh id, create a VBD for it, install
+ * tapdisk_stream_dequeue() as the VBD's completion callback and open the
+ * image at @path (disk type @type) read-only.  On success the stream's
+ * sector range is set to [0, image size).
+ *
+ * Returns 0 on success, non-zero on failure after printing a diagnostic
+ * to stderr.
+ */
+static int
+tapdisk_stream_open_image(struct tapdisk_stream *s, const char *path, int type)
+{
+       image_t image;
+       int rc;
+
+       s->id = tapdisk_stream_count++;
+
+       rc = tapdisk_vbd_initialize(s->id);
+       if (rc)
+               goto fail;
+
+       s->vbd = tapdisk_server_get_vbd(s->id);
+       if (s->vbd == NULL) {
+               rc = ENODEV;
+               goto fail;
+       }
+
+       tapdisk_vbd_set_callback(s->vbd, tapdisk_stream_dequeue, s);
+
+       rc = tapdisk_vbd_open_vdi(s->vbd, path, type,
+                                 TAPDISK_STORAGE_TYPE_DEFAULT,
+                                 TD_OPEN_RDONLY);
+       if (rc)
+               goto fail;
+
+       s->vbd->reopened = 1;
+
+       rc = tapdisk_vbd_get_image_info(s->vbd, &image);
+       if (rc) {
+               /* this path has its own message and skips the generic one */
+               fprintf(stderr, "failed getting image size: %d\n", rc);
+               return rc;
+       }
+
+       s->start = 0;
+       s->cur   = s->start;
+       s->end   = image.size;
+
+       return 0;
+
+fail:
+       fprintf(stderr, "failed to open image %s: %d\n", path, rc);
+       return rc;
+}
+
+/*
+ * Tear down the VBD belonging to stream @s, if the server still knows it:
+ * close the image, remove the VBD from the server, free the buffer area
+ * hanging off ring.vstart, the VBD's name and the VBD itself, and clear
+ * s->vbd.  Does nothing when no VBD is registered under s->id.
+ */
+static void
+tapdisk_stream_close_image(struct tapdisk_stream *s)
+{
+       td_vbd_t *vbd = tapdisk_server_get_vbd(s->id);
+
+       if (!vbd)
+               return;
+
+       tapdisk_vbd_close_vdi(vbd);
+       tapdisk_server_remove_vbd(vbd);
+
+       free((void *)vbd->ring.vstart);
+       free(vbd->name);
+       free(vbd);
+
+       s->vbd = NULL;
+}
+
+/*
+ * Allocate the page-aligned data buffer area for stream @s, hang it off
+ * the VBD's ring.vstart, and put every request slot on the free list.
+ *
+ * Returns 0 on success or the posix_memalign() error code, in which case
+ * ring.vstart is left as 0.
+ */
+static int
+tapdisk_stream_initialize_requests(struct tapdisk_stream *s)
+{
+       td_ring_t *ring = &s->vbd->ring;
+       int psize = getpagesize();
+       size_t size = psize * BLKTAP_MMAP_REGION_SIZE;
+       int err, i;
+
+       /* sneaky -- set up ring->vstart so tapdisk_vbd will use our buffers */
+       err = posix_memalign((void **)&ring->vstart, psize, size);
+       if (err != 0) {
+               fprintf(stderr, "failed to allocate buffers: %d\n", err);
+               ring->vstart = 0;
+               return err;
+       }
+
+       for (i = 0; i < MAX_REQUESTS; i++) {
+               tapdisk_stream_initialize_request(&s->requests[i]);
+               list_add_tail(&s->requests[i].next, &s->free_list);
+       }
+
+       return 0;
+}
+
+/*
+ * Open stream @s's wakeup pipe and register tapdisk_stream_enqueue() with
+ * the server as the read handler for it, remembering the event id.
+ *
+ * Returns 0 on success; otherwise prints a diagnostic and returns the
+ * error from whichever step failed.
+ */
+static int
+tapdisk_stream_register_enqueue_event(struct tapdisk_stream *s)
+{
+       struct tapdisk_stream_poll *p = &s->poll;
+       int rc;
+
+       rc = tapdisk_stream_poll_open(p);
+       if (!rc) {
+               rc = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                                  p->pipe[POLL_READ], 0,
+                                                  tapdisk_stream_enqueue, s);
+               if (rc >= 0) {
+                       /* non-negative return value is the event id */
+                       s->enqueue_event_id = rc;
+                       return 0;
+               }
+       }
+
+       fprintf(stderr, "failed to register event: %d\n", rc);
+       return rc;
+}
+
+/*
+ * Undo tapdisk_stream_register_enqueue_event(): unregister the enqueue
+ * event if one is recorded (non-zero id), then close the wakeup pipe.
+ */
+static void
+tapdisk_stream_unregister_enqueue_event(struct tapdisk_stream *s)
+{
+       if (s->enqueue_event_id != 0) {
+               tapdisk_server_unregister_event(s->enqueue_event_id);
+               s->enqueue_event_id = 0;
+       }
+
+       tapdisk_stream_poll_close(&s->poll);
+}
+
+/*
+ * Zero stream @s and make its free, pending and completed lists empty.
+ */
+static inline void
+tapdisk_stream_initialize(struct tapdisk_stream *s)
+{
+       struct list_head *lists[3] = {
+               &s->free_list, &s->pending_list, &s->completed_list
+       };
+       int n;
+
+       memset(s, 0, sizeof(*s));
+
+       /* zeroed heads are not valid empty lists; link each to itself */
+       for (n = 0; n < 3; n++)
+               INIT_LIST_HEAD(lists[n]);
+}
+
+/*
+ * Fully set up stream @s from a "type:/path" argument @arg: parse the
+ * argument, reset the stream, open the image, allocate request buffers,
+ * register the enqueue event, and run the enqueue handler once to get
+ * things going.
+ *
+ * Returns 0 on success, the (negative) parse result for a bad argument,
+ * or the error of the first setup step that failed.
+ */
+static int
+tapdisk_stream_open(struct tapdisk_stream *s, const char *arg)
+{
+       const char *path;
+       int type, err;
+
+       type = tapdisk_disktype_parse_params(arg, &path);
+       if (type < 0)
+               return type;
+
+       tapdisk_stream_initialize(s);
+
+       /* each step runs only if everything before it succeeded */
+       err = tapdisk_stream_open_image(s, path, type);
+       if (!err)
+               err = tapdisk_stream_initialize_requests(s);
+       if (!err)
+               err = tapdisk_stream_register_enqueue_event(s);
+       if (err)
+               return err;
+
+       tapdisk_stream_enqueue(s->enqueue_event_id,
+                              SCHEDULER_POLL_READ_FD, s);
+
+       return 0;
+}
+
+/*
+ * Release everything stream @s holds: first its image and VBD, then its
+ * enqueue event and wakeup pipe.
+ */
+static void
+tapdisk_stream_release(struct tapdisk_stream *s)
+{
+       /* image/VBD first ... */
+       tapdisk_stream_close_image(s);
+
+       /* ... then the scheduler event and its pipe */
+       tapdisk_stream_unregister_enqueue_event(s);
+}
+
+/*
+ * Kick stream @s's enqueue handler once, run the server loop until it
+ * returns, and report the error recorded on the stream (0 if none).
+ */
+static int
+tapdisk_stream_run(struct tapdisk_stream *s)
+{
+       int rc;
+
+       tapdisk_stream_enqueue(s->enqueue_event_id, SCHEDULER_POLL_READ_FD, s);
+       tapdisk_server_run();
+
+       rc = s->err;
+       return rc;
+}
+
+/*
+ * tapdisk-diff: compare the contents of two images.
+ *
+ *   -n type:/path   first image; must be a VHD, and only blocks allocated
+ *                   in its BAT are compared
+ *   -m type:/path   second image
+ *   -h              print usage and exit
+ *
+ * Exit status is 0 when the images were opened and no error or mismatch
+ * was recorded, 1 for a usage error, and otherwise the error code of the
+ * step that failed or the error recorded on stream1.
+ */
+int
+main(int argc, char *argv[])
+{
+       int c, err, type1;
+       const char *arg1 = NULL, *arg2 = NULL;
+       const char *path1;
+
+       err    = 0;
+
+       program = basename(argv[0]);
+
+       while ((c = getopt(argc, argv, "n:m:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       arg1 = optarg;
+                       break;
+               case 'm':
+                       arg2 = optarg;
+                       break;
+               case 'h':
+                       usage(stdout);
+                       return 0;
+               default:
+                       goto fail_usage;
+               }
+       }
+
+       if (!arg1 || !arg2)
+               goto fail_usage;
+
+       /* the first image is opened directly as a VHD to get at its BAT */
+       type1 = tapdisk_disktype_parse_params(arg1, &path1);
+       if (type1 < 0)
+               return type1;
+
+       if (type1 != DISK_TYPE_VHD) {
+               printf("error: first VDI is not VHD\n");
+               return EINVAL;
+       }
+
+       err = open_vhd(path1, &vhd1);
+       if (err)
+               return err;
+
+       tapdisk_start_logging("tapdisk-diff");
+
+       err = tapdisk_server_initialize();
+       if (err)
+               goto out;
+
+       err = tapdisk_stream_open(&stream1, arg1);
+       if (err) {
+               fprintf(stderr, "Failed to open %s: %s\n", 
+                       arg1, strerror(-err));
+               goto out;
+       }
+
+       err = tapdisk_stream_open(&stream2, arg2);
+       if (err) {
+               fprintf(stderr, "Failed to open %s: %s\n", 
+                       arg2, strerror(-err));
+               goto out1;
+       }
+
+       if (stream1.end != stream2.end) {
+               fprintf(stderr, "Image sizes differ: %"PRIu64" != %"PRIu64"\n",
+                               stream1.end, stream2.end);
+               err = EINVAL;
+               goto out2;
+       }
+
+       /* the comparison itself is driven by the scheduler callbacks */
+       tapdisk_server_run();
+
+out2:
+       tapdisk_stream_release(&stream2);
+out1:
+       tapdisk_stream_release(&stream1);
+out:
+       vhd_close(&vhd1);
+       tapdisk_stop_logging();
+
+       /* setup errors take precedence over errors recorded while running */
+       return err ? : stream1.err;
+
+fail_usage:
+       usage(stderr);
+       return 1;
+}
diff --git a/tools/blktap2/drivers/tapdisk-disktype.c b/tools/blktap2/drivers/tapdisk-disktype.c
new file mode 100644 (file)
index 0000000..e89d364
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2007, 2010, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <errno.h>
+
+#include "tapdisk-disktype.h"
+#include "tapdisk-message.h"
+
+#define ARRAY_SIZE(a) (sizeof (a) / sizeof *(a))
+
+/*
+ * Static descriptors for every disk type known to tapdisk: the short
+ * driver name, a human-readable description, and a flags word.
+ *
+ * NOTE(review): a flags value of 1 presumably corresponds to
+ * DISK_TYPE_SINGLE_CONTROLLER (1<<0) -- confirm against the users of
+ * disk_info_t.flags.
+ */
+static const disk_info_t aio_disk = {
+       .name  = "aio",
+       .desc  = "raw image (aio)",
+       .flags = 0,
+};
+
+static const disk_info_t sync_disk = {
+       .name  = "sync",
+       .desc  = "raw image (sync)",
+       .flags = 0,
+};
+
+static const disk_info_t vmdk_disk = {
+       .name  = "vmdk",
+       .desc  = "vmware image (vmdk)",
+       .flags = 1,
+};
+
+static const disk_info_t vhdsync_disk = {
+       .name  = "vhdsync",
+       .desc  = "virtual server image (vhd) - synchronous",
+       .flags = 1,
+};
+
+static const disk_info_t vhd_disk = {
+       .name  = "vhd",
+       .desc  = "virtual server image (vhd)",
+       .flags = 0,
+};
+
+static const disk_info_t ram_disk = {
+       .name  = "ram",
+       .desc  = "ramdisk image (ram)",
+       .flags = 1,
+};
+
+static const disk_info_t qcow_disk = {
+       .name  = "qcow",
+       .desc  = "qcow disk (qcow)",
+       .flags = 0,
+};
+
+static const disk_info_t block_cache_disk = {
+       .name  = "bc",
+       .desc  = "block cache image (bc)",
+       .flags = 1,
+};
+
+static const disk_info_t vhd_index_disk = {
+       .name  = "vhdi",
+       .desc  = "vhd index image (vhdi)",
+       .flags = 1,
+};
+
+static const disk_info_t log_disk = {
+       .name  = "log",
+       .desc  = "write logger (log)",
+       .flags = 0,
+};
+
+static const disk_info_t remus_disk = {
+       .name  = "remus",
+       .desc  = "remus disk replicator (remus)",
+       .flags = 0,
+};
+
+/*
+ * Descriptor table indexed by DISK_TYPE_* value.  Entries are listed in
+ * ascending index order; the designators make the order irrelevant to
+ * the resulting array.
+ */
+const disk_info_t *tapdisk_disk_types[] = {
+       [DISK_TYPE_AIO]         = &aio_disk,
+       [DISK_TYPE_SYNC]        = &sync_disk,
+       [DISK_TYPE_VMDK]        = &vmdk_disk,
+       [DISK_TYPE_VHDSYNC]     = &vhdsync_disk,
+       [DISK_TYPE_VHD]         = &vhd_disk,
+       [DISK_TYPE_RAM]         = &ram_disk,
+       [DISK_TYPE_QCOW]        = &qcow_disk,
+       [DISK_TYPE_BLOCK_CACHE] = &block_cache_disk,
+       [DISK_TYPE_LOG]         = &log_disk,
+       [DISK_TYPE_REMUS]       = &remus_disk,
+       [DISK_TYPE_VINDEX]      = &vhd_index_disk,
+};
+
+extern struct tap_disk tapdisk_aio;
+extern struct tap_disk tapdisk_vhdsync;
+extern struct tap_disk tapdisk_vhd;
+extern struct tap_disk tapdisk_ram;
+extern struct tap_disk tapdisk_qcow;
+extern struct tap_disk tapdisk_block_cache;
+extern struct tap_disk tapdisk_log;
+extern struct tap_disk tapdisk_remus;
+
+/*
+ * Driver table, same size and indexing as tapdisk_disk_types[].
+ * Types without an entry here (sync, vmdk, vhdsync, vhdi) stay NULL, so
+ * tapdisk_disktype_find() reports -ENOSYS for them.
+ */
+const struct tap_disk *tapdisk_disk_drivers[ARRAY_SIZE(tapdisk_disk_types)] = {
+       [DISK_TYPE_AIO] = &tapdisk_aio,
+       [DISK_TYPE_VHD] = &tapdisk_vhd,
+       [DISK_TYPE_RAM] = &tapdisk_ram,
+       [DISK_TYPE_QCOW] = &tapdisk_qcow,
+       [DISK_TYPE_BLOCK_CACHE] = &tapdisk_block_cache,
+       [DISK_TYPE_LOG] = &tapdisk_log,
+       [DISK_TYPE_REMUS] = &tapdisk_remus,
+};
+
+/*
+ * Look up a disk type by its driver name (e.g. "aio", "vhd").
+ *
+ * Returns the index into tapdisk_disk_types[] on success, -ENOSYS when
+ * the name is known but tapdisk_disk_drivers[] has no driver for it, and
+ * -ENOENT when no descriptor carries that name.
+ */
+int
+tapdisk_disktype_find(const char *name)
+{
+       int idx;
+
+       for (idx = 0; idx < ARRAY_SIZE(tapdisk_disk_types); idx++) {
+               const disk_info_t *entry = tapdisk_disk_types[idx];
+
+               /* skip holes in the table and non-matching names */
+               if (entry == NULL || strcmp(name, entry->name) != 0)
+                       continue;
+
+               return tapdisk_disk_drivers[idx] ? idx : -ENOSYS;
+       }
+
+       return -ENOENT;
+}
+
+/*
+ * Split a "<type>:<path>" parameter string.
+ *
+ * The text before the first ':' is looked up with
+ * tapdisk_disktype_find().  On success the type index is returned and
+ * *_path points just past the ':' inside @params (no copy is made).
+ *
+ * Returns -EINVAL if there is no ':', -ENAMETOOLONG if the type name
+ * does not fit in DISK_TYPE_NAME_MAX - 1 characters, or the negative
+ * error from tapdisk_disktype_find(); *_path is left untouched on error.
+ */
+int
+tapdisk_disktype_parse_params(const char *params, const char **_path)
+{
+       char name[DISK_TYPE_NAME_MAX];
+       const char *colon;
+       size_t len;
+       int type;
+
+       colon = strchr(params, ':');
+       if (colon == NULL)
+               return -EINVAL;
+
+       len = colon - params;
+       if (len >= sizeof(name))
+               return -ENAMETOOLONG;
+
+       /* len < sizeof(name), so the terminator always fits */
+       memcpy(name, params, len);
+       name[len] = '\0';
+
+       type = tapdisk_disktype_find(name);
+       if (type < 0)
+               return type;
+
+       *_path = colon + 1;
+       return type;
+}
+
+/*
+ * Wrapper around tapdisk_disktype_parse_params() that reports the type
+ * through *_type and returns 0 on success, or the negative error code
+ * (leaving *_type untouched) on failure.
+ */
+int
+tapdisk_parse_disk_type(const char *params, const char **_path, int *_type)
+{
+       int rv = tapdisk_disktype_parse_params(params, _path);
+
+       if (rv >= 0) {
+               *_type = rv;
+               rv = 0;
+       }
+
+       return rv;
+}
diff --git a/tools/blktap2/drivers/tapdisk-disktype.h b/tools/blktap2/drivers/tapdisk-disktype.h
new file mode 100644 (file)
index 0000000..b697eea
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2007, 2010, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __DISKTYPES_H__
+#define __DISKTYPES_H__
+
+/*
+ * Disk type identifiers.  tapdisk-disktype.c uses these values as array
+ * indices into tapdisk_disk_types[] and tapdisk_disk_drivers[].
+ */
+#define DISK_TYPE_AIO         0
+#define DISK_TYPE_SYNC        1
+#define DISK_TYPE_VMDK        2
+#define DISK_TYPE_VHDSYNC     3
+#define DISK_TYPE_VHD         4
+#define DISK_TYPE_RAM         5
+#define DISK_TYPE_QCOW        6
+#define DISK_TYPE_BLOCK_CACHE 7
+#define DISK_TYPE_LOG         8
+#define DISK_TYPE_REMUS       9
+#define DISK_TYPE_VINDEX      10
+
+#define DISK_TYPE_NAME_MAX    32
+
+/* Static description of one disk type; see tapdisk_disk_types[]. */
+typedef struct disk_info {
+       const char     *name; /* driver name, e.g. 'aio'; matched by tapdisk_disktype_find() */
+       char           *desc;  /* human-readable text, e.g. "raw image" */
+       unsigned int    flags; /* NOTE(review): presumably DISK_TYPE_SINGLE_CONTROLLER bits -- confirm */
+} disk_info_t;
+
+extern const disk_info_t     *tapdisk_disk_types[];
+extern const struct tap_disk *tapdisk_disk_drivers[];
+
+/* one single controller for all instances of disk type */
+#define DISK_TYPE_SINGLE_CONTROLLER (1<<0)
+
+int tapdisk_disktype_find(const char *name);
+int tapdisk_disktype_parse_params(const char *params, const char **_path);
+int tapdisk_parse_disk_type(const char *, const char **, int *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-driver.c b/tools/blktap2/drivers/tapdisk-driver.c
new file mode 100644 (file)
index 0000000..aa1ed15
--- /dev/null
@@ -0,0 +1,101 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdlib.h>
+
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-disktype.h"
+
+/*
+ * Allocate a driver handle for disk type @type.
+ *
+ * The handle gets its own copy of @name (via tapdisk_namedup) and a
+ * zeroed private area of ops->private_data_size bytes.  If @flags has
+ * TD_OPEN_RDONLY set, TD_DRIVER_RDONLY is set in the handle's state.
+ *
+ * Returns NULL if the type has no registered driver or if any
+ * allocation fails.
+ *
+ * NOTE(review): @type indexes tapdisk_disk_drivers[] without a range
+ * check here -- callers presumably validate it first; confirm.
+ */
+td_driver_t *
+tapdisk_driver_allocate(int type, char *name, td_flag_t flags, int storage)
+{
+       const struct tap_disk *ops = tapdisk_disk_drivers[type];
+       td_driver_t *drv;
+
+       if (ops == NULL)
+               return NULL;
+
+       drv = calloc(1, sizeof(*drv));
+       if (drv == NULL)
+               return NULL;
+
+       if (tapdisk_namedup(&drv->name, name) != 0)
+               goto fail;
+
+       drv->data = calloc(1, ops->private_data_size);
+       if (drv->data == NULL)
+               goto fail;
+
+       drv->ops     = ops;
+       drv->type    = type;
+       drv->storage = storage;
+
+       if (td_flag_test(flags, TD_OPEN_RDONLY))
+               td_flag_set(drv->state, TD_DRIVER_RDONLY);
+
+       return drv;
+
+fail:
+       /* members not yet set are NULL thanks to calloc; free(NULL) is a no-op */
+       free(drv->name);
+       free(drv->data);
+       free(drv);
+       return NULL;
+}
+
+/*
+ * Release a driver handle together with its name and private data.
+ *
+ * Does nothing for a NULL handle or while refcnt is non-zero.  A handle
+ * still flagged TD_DRIVER_OPEN is freed anyway, after logging an error.
+ */
+void
+tapdisk_driver_free(td_driver_t *driver)
+{
+       if (driver == NULL || driver->refcnt != 0)
+               return;
+
+       if (td_flag_test(driver->state, TD_DRIVER_OPEN)) {
+               EPRINTF("freeing open driver %s (state 0x%08x)\n",
+                       driver->name, driver->state);
+       }
+
+       free(driver->data);
+       free(driver->name);
+       free(driver);
+}
+
+/*
+ * Queue @tiocb on the tapdisk server.  @driver is accepted for the sake
+ * of the interface but is not consulted.
+ */
+void
+tapdisk_driver_queue_tiocb(td_driver_t *driver, struct tiocb *tiocb)
+{
+       (void)driver;
+
+       tapdisk_server_queue_tiocb(tiocb);
+}
+
+/* Invoke the driver's optional td_debug hook, if the driver provides one. */
+void
+tapdisk_driver_debug(td_driver_t *driver)
+{
+       if (!driver->ops->td_debug)
+               return;
+
+       driver->ops->td_debug(driver);
+}
diff --git a/tools/blktap2/drivers/tapdisk-driver.h b/tools/blktap2/drivers/tapdisk-driver.h
new file mode 100644 (file)
index 0000000..42de05d
--- /dev/null
@@ -0,0 +1,62 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_DRIVER_H_
+#define _TAPDISK_DRIVER_H_
+
+#include "tapdisk.h"
+#include "scheduler.h"
+#include "tapdisk-queue.h"
+
+#define TD_DRIVER_OPEN               0x0001
+#define TD_DRIVER_RDONLY             0x0002
+
+/*
+ * Per-driver handle, created by tapdisk_driver_allocate() and released
+ * by tapdisk_driver_free().
+ */
+struct td_driver_handle {
+       /* DISK_TYPE_* index the handle was allocated for */
+       int                          type;
+       /* copy of the name passed to tapdisk_driver_allocate(); freed with the handle */
+       char                        *name;
+
+       /* storage value passed through from the allocator */
+       int                          storage;
+
+       /* tapdisk_driver_free() is a no-op while this is non-zero */
+       int                          refcnt;
+       /* TD_DRIVER_* flag bits */
+       td_flag_t                    state;
+
+       td_disk_info_t               info;
+
+       /* zeroed private area of ops->private_data_size bytes; freed with the handle */
+       void                        *data;
+       /* entry of tapdisk_disk_drivers[] for this type */
+       const struct tap_disk       *ops;
+
+       struct list_head             next;
+};
+
+td_driver_t *tapdisk_driver_allocate(int, char *, td_flag_t, int);
+void tapdisk_driver_free(td_driver_t *);
+
+void tapdisk_driver_queue_tiocb(td_driver_t *, struct tiocb *);
+
+void tapdisk_driver_debug(td_driver_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-filter.c b/tools/blktap2/drivers/tapdisk-filter.c
new file mode 100644 (file)
index 0000000..dcd1802
--- /dev/null
@@ -0,0 +1,272 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <libaio.h>
+#include <syslog.h>
+#include <sys/time.h>
+
+#include "tapdisk-log.h"
+#include "tapdisk-filter.h"
+
+#define RSEED      7
+#define PRE_CHECK  0
+#define POST_CHECK 1
+
+#define WRITE_INTEGRITY   "buffer integrity failure after write"
+#define READ_INTEGRITY    "disk integrity failure after read"
+
+#define DBG(f, a...) tlog_write(TLOG_WARN, f, ##a)
+
+/*
+ * Simulate an IO error: take a free fiocb slot, stash the request's
+ * real byte count and data cookie in it, then submit the request with a
+ * zero length and the slot as its cookie.  recover_fault() undoes this
+ * before the request is returned.  Does nothing when no slot is free.
+ */
+static inline void
+inject_fault(struct tfilter *filter, struct iocb *io)
+{
+       struct fiocb *slot;
+
+       if (filter->ffree == 0)
+               return;
+
+       filter->ffree--;
+       slot = filter->flist[filter->ffree];
+
+       slot->data     = io->data;
+       slot->bytes    = io->u.c.nbytes;
+       io->data       = slot;
+       io->u.c.nbytes = 0;
+}
+
+/*
+ * Return non-zero if @io was tampered with by inject_fault(), i.e. its
+ * data cookie points into the filter's fiocbs array.
+ */
+static inline int
+fault_injected(struct tfilter *filter, struct iocb *io)
+{
+       unsigned long base  = (unsigned long)filter->fiocbs;
+       unsigned long limit = base + (filter->iocbs * sizeof(struct fiocb));
+       unsigned long addr  = (unsigned long)io->data;
+
+       if (addr < base)
+               return 0;
+
+       return addr < limit;
+}
+
+/*
+ * Undo inject_fault(): restore the request's original byte count and
+ * data cookie from its fiocb slot, clear the slot and push it back on
+ * the free list.
+ */
+static inline void
+recover_fault(struct tfilter *filter, struct iocb *io)
+{
+       struct fiocb *slot = (struct fiocb *)io->data;
+
+       io->data       = slot->data;
+       io->u.c.nbytes = slot->bytes;
+
+       memset(slot, 0, sizeof(*slot));
+
+       filter->flist[filter->ffree] = slot;
+       filter->ffree++;
+}
+
+/*
+ * Checksum one 512-byte sector: the wrapping sum of its 64 consecutive
+ * 64-bit words.
+ */
+static inline uint64_t
+chksum(char *buf)
+{
+       uint64_t *word  = (uint64_t *)buf;
+       uint64_t *end   = word + (512 >> 3);
+       uint64_t  total = 0;
+
+       while (word < end)
+               total += *word++;
+
+       return total;
+}
+
+/*
+ * Compare the checksum of sector @sec's data in @buf against the value
+ * recorded in the filter's hash table, and log a warning tagged with
+ * @type on mismatch.  Sectors that were never recorded (timestamp with
+ * tv_sec == 0) are skipped.
+ */
+static inline void
+check_hash(struct tfilter *filter, uint64_t sec, char *buf, char *type)
+{
+       uint64_t sum;
+       struct dhash *hash;
+       struct timeval now;
+
+       hash = filter->dhash + sec;
+       if (!hash->time.tv_sec)
+               return;
+
+       /* checksum once; the same value is compared and reported */
+       sum = chksum(buf);
+       if (hash->hash == sum)
+               return;
+
+       gettimeofday(&now, NULL);
+
+       /* time_t / suseconds_t are cast to match the %lu / %llu conversions */
+       DBG("%s: hash table: 0x%020" PRIx64 " at %012lu.%06llu, "
+           "from disk: 0x%020" PRIx64 " at %012lu.%06llu\n",
+           type, hash->hash, (unsigned long)hash->time.tv_sec,
+           (unsigned long long)hash->time.tv_usec, sum,
+           (unsigned long)now.tv_sec, (unsigned long long)now.tv_usec);
+}
+
+/*
+ * Record the checksum of sector @sec's data in @buf, stamped with the
+ * current time, in the filter's hash table.
+ */
+static inline void
+insert_hash(struct tfilter *filter, uint64_t sec, char *buf)
+{
+       struct dhash *slot = &filter->dhash[sec];
+
+       slot->hash = chksum(buf);
+       gettimeofday(&slot->time, NULL);
+}
+
+/*
+ * Integrity bookkeeping for a single sector.
+ *
+ *   write, PRE_CHECK : record the checksum of the data about to be written
+ *   write, otherwise : verify the buffer still matches what was recorded
+ *   read,  POST_CHECK: verify the data read against the record, then
+ *                      record it
+ *   read,  otherwise : nothing to do
+ *
+ * Sectors at or beyond filter->secs are outside the hash table and are
+ * ignored.
+ */
+static void
+check_sector(struct tfilter *filter, int type, int rw, uint64_t sec, char *buf)
+{
+       if (sec >= filter->secs)
+               return;
+
+       if (rw) {
+               if (type == PRE_CHECK)
+                       insert_hash(filter, sec, buf);
+               else
+                       check_hash(filter, sec, buf, WRITE_INTEGRITY);
+       } else if (type == POST_CHECK) {
+               check_hash(filter, sec, buf, READ_INTEGRITY);
+               insert_hash(filter, sec, buf);
+       }
+}
+
+/*
+ * Run check_sector() over every 512-byte sector covered by @io.
+ * @type is PRE_CHECK or POST_CHECK; the request counts as a write when
+ * its opcode is IO_CMD_PWRITE.
+ */
+static void
+check_data(struct tfilter *filter, int type, struct iocb *io)
+{
+       int rw;
+       uint64_t i;
+       char *base = io->u.c.buf;   /* byte pointer: no arithmetic on void * */
+
+       rw = (io->aio_lio_opcode == IO_CMD_PWRITE);
+
+       for (i = 0; i < io->u.c.nbytes; i += 512) {
+               uint64_t sec = (io->u.c.offset + i) >> 9;
+
+               check_sector(filter, type, rw, sec, base + i);
+       }
+}
+
+/*
+ * Allocate and initialise an I/O filter.
+ *
+ * @mode:  bitmask of TD_INJECT_FAULTS and/or TD_CHECK_INTEGRITY;
+ *         0 means no filter is wanted and NULL is returned.
+ * @iocbs: number of fault-injection slots to allocate.
+ * @secs:  number of sectors tracked by the integrity hash table.
+ *
+ * If the memory for a feature cannot be allocated, that feature's bit
+ * is cleared from filter->mode and the filter is still returned.
+ * Returns NULL only when @mode is 0 or the filter itself cannot be
+ * allocated.  Release with tapdisk_free_tfilter().
+ */
+struct tfilter *
+tapdisk_init_tfilter(int mode, int iocbs, uint64_t secs)
+{
+       int i;
+       struct tfilter *filter;
+
+       if (!mode)
+               return NULL;
+
+       filter = calloc(1, sizeof(struct tfilter));
+       if (!filter)
+               return NULL;
+
+       filter->mode  = mode;
+       filter->secs  = secs;
+       filter->iocbs = iocbs;
+
+       if (filter->mode & TD_INJECT_FAULTS) {
+               filter->fiocbs = calloc(iocbs, sizeof(struct fiocb));
+               filter->flist  = calloc(iocbs, sizeof(struct fiocb *));
+               if (!filter->fiocbs || !filter->flist)
+                       filter->mode &= ~TD_INJECT_FAULTS;
+               else {
+                       /*
+                        * tapdisk_filter_iocbs() draws from random(), so
+                        * seed that generator; srand() only seeds rand().
+                        */
+                       srandom(RSEED);
+                       filter->ffree = iocbs;
+                       for (i = 0; i < iocbs; i++)
+                               filter->flist[i] = filter->fiocbs + i;
+               }
+       }
+
+       if (filter->mode & TD_CHECK_INTEGRITY) {
+               filter->dhash = calloc(secs, sizeof(struct dhash));
+               if (!filter->dhash)
+                       filter->mode &= ~TD_CHECK_INTEGRITY;
+       }
+
+       syslog(LOG_WARNING, "WARNING: "
+              "FILTERING IN MODE 0x%04x\n", filter->mode);
+
+       return filter;
+}
+
+/* Free a filter and all tables hanging off it; NULL is accepted. */
+void
+tapdisk_free_tfilter(struct tfilter *filter)
+{
+       if (filter == NULL)
+               return;
+
+       free(filter->fiocbs);
+       free(filter->flist);
+       free(filter->dhash);
+       free(filter);
+}
+
+/*
+ * Filter hook for @num requests in @iocbs; a NULL filter is a no-op.
+ *
+ * With TD_INJECT_FAULTS, a request is picked for fault injection when
+ * random() % 100 <= TD_FAULT_RATE; picked requests skip the integrity
+ * pre-check.  With TD_CHECK_INTEGRITY, the remaining requests get a
+ * PRE_CHECK pass over their data.
+ */
+void
+tapdisk_filter_iocbs(struct tfilter *filter, struct iocb **iocbs, int num)
+{
+       int idx;
+
+       if (filter == NULL)
+               return;
+
+       for (idx = 0; idx < num; idx++) {
+               struct iocb *req = iocbs[idx];
+
+               /* random() is only consulted when fault injection is on */
+               if ((filter->mode & TD_INJECT_FAULTS) &&
+                   (random() % 100) <= TD_FAULT_RATE)
+                       inject_fault(filter, req);
+               else if (filter->mode & TD_CHECK_INTEGRITY)
+                       check_data(filter, PRE_CHECK, req);
+       }
+}
+
+/*
+ * Filter hook for @num completion events; a NULL filter is a no-op.
+ *
+ * Requests that carry an injected fault are restored with
+ * recover_fault() and skip the integrity check; with
+ * TD_CHECK_INTEGRITY, all others get a POST_CHECK pass over their data.
+ */
+void
+tapdisk_filter_events(struct tfilter *filter, struct io_event *events, int num)
+{
+       int idx;
+
+       if (filter == NULL)
+               return;
+
+       for (idx = 0; idx < num; idx++) {
+               struct iocb *req = events[idx].obj;
+
+               if ((filter->mode & TD_INJECT_FAULTS) &&
+                   fault_injected(filter, req))
+                       recover_fault(filter, req);
+               else if (filter->mode & TD_CHECK_INTEGRITY)
+                       check_data(filter, POST_CHECK, req);
+       }
+}
diff --git a/tools/blktap2/drivers/tapdisk-filter.h b/tools/blktap2/drivers/tapdisk-filter.h
new file mode 100644 (file)
index 0000000..c4e977e
--- /dev/null
@@ -0,0 +1,67 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef TAPDISK_FILTER_H
+#define TAPDISK_FILTER_H
+
+#include <libaio.h>
+#include <inttypes.h>
+#include <time.h>
+
+#define TD_INJECT_FAULTS     0x00001  /* simulate random IO failures */
+#define TD_CHECK_INTEGRITY   0x00002  /* check data integrity */
+
+#define TD_FAULT_RATE        5
+
+/* Integrity record for one sector (one entry of tfilter.dhash[]). */
+struct dhash {
+       /* checksum of the sector data as computed by chksum() */
+       uint64_t             hash;
+       /* when the checksum was recorded; tv_sec == 0 means "never recorded" */
+       struct timeval       time;
+};
+
+/* Saved state of a request while a fault is injected into it. */
+struct fiocb {
+       /* original io->u.c.nbytes */
+       size_t               bytes;
+       /* original io->data cookie */
+       void                *data;
+};
+
+/* Filter state, created by tapdisk_init_tfilter(). */
+struct tfilter {
+       /* TD_INJECT_FAULTS / TD_CHECK_INTEGRITY bits currently active */
+       int                  mode;
+       /* number of entries in dhash[] */
+       uint64_t             secs;
+       /* number of entries in fiocbs[] and flist[] */
+       int                  iocbs;
+
+       /* per-sector integrity records, indexed by sector number */
+       struct dhash        *dhash;
+
+       /* number of free slots currently on flist[] */
+       int                  ffree;
+       /* backing array of fault slots */
+       struct fiocb        *fiocbs;
+       /* stack of pointers to free entries of fiocbs[] */
+       struct fiocb       **flist;
+};
+
+struct tfilter *tapdisk_init_tfilter(int mode, int iocbs, uint64_t secs);
+void tapdisk_free_tfilter(struct tfilter *);
+void tapdisk_filter_iocbs(struct tfilter *, struct iocb **, int);
+void tapdisk_filter_events(struct tfilter *, struct io_event *, int);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-image.c b/tools/blktap2/drivers/tapdisk-image.c
new file mode 100644 (file)
index 0000000..042cdd5
--- /dev/null
@@ -0,0 +1,169 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#ifdef MEMSHR
+#include <memshr.h>
+#endif
+
+#include "tapdisk-image.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/*
+ * Allocate an image handle for @file.
+ *
+ * The handle keeps its own copy of the file name (via tapdisk_namedup)
+ * and stores @type, @storage, @flags and @private as given.  Its list
+ * node is initialised empty.  Returns NULL on allocation failure.
+ */
+td_image_t *
+tapdisk_image_allocate(const char *file, int type, int storage,
+                      td_flag_t flags, void *private)
+{
+       td_image_t *img = calloc(1, sizeof(*img));
+
+       if (img == NULL)
+               return NULL;
+
+       if (tapdisk_namedup(&img->name, file) != 0) {
+               free(img);
+               return NULL;
+       }
+
+       INIT_LIST_HEAD(&img->next);
+
+       img->private   = private;
+       img->storage   = storage;
+       img->flags     = flags;
+       img->type      = type;
+#ifdef MEMSHR
+       img->memshr_id = memshr_vbd_image_get(file);
+#endif
+
+       return img;
+}
+
+/*
+ * Unlink an image handle from its list and release it together with its
+ * name; the attached driver is handed to tapdisk_driver_free().  NULL is
+ * accepted.
+ */
+void
+tapdisk_image_free(td_image_t *image)
+{
+       if (image == NULL)
+               return;
+
+       list_del(&image->next);
+
+#ifdef MEMSHR
+       memshr_vbd_image_put(image->memshr_id);
+#endif
+
+       free(image->name);
+       tapdisk_driver_free(image->driver);
+
+       free(image);
+}
+
+/*
+ * Validate a td request against @image.
+ *
+ * Returns 0 if the request is acceptable, -ENODEV if the image has no
+ * driver, and -EINVAL (after logging) if the operation is neither read
+ * nor write, is a write to a read-only image, has a non-positive sector
+ * count, or ends beyond the driver-reported size.
+ */
+int
+tapdisk_image_check_td_request(td_image_t *image, td_request_t treq)
+{
+       td_driver_t *drv = image->driver;
+       td_disk_info_t *info;
+       int rdonly, bad;
+
+       if (drv == NULL)
+               return -ENODEV;
+
+       info   = &drv->info;
+       rdonly = td_flag_test(image->flags, TD_OPEN_RDONLY);
+
+       bad = (treq.op != TD_OP_READ && treq.op != TD_OP_WRITE) ||
+             (treq.op == TD_OP_WRITE && rdonly) ||
+             treq.secs <= 0 ||
+             treq.sec + treq.secs > info->size;
+       if (!bad)
+               return 0;
+
+       ERR(-EINVAL, "bad td request on %s (%s, %"PRIu64"): %d at %"PRIu64,
+           image->name, (rdonly ? "ro" : "rw"), info->size, treq.op,
+           treq.sec + treq.secs);
+       return -EINVAL;
+}
+
+/*
+ * Validate a blkif ring request against @image.
+ *
+ * Returns 0 if the request is acceptable, -ENODEV if the image has no
+ * driver, and -EINVAL (after logging) if:
+ *   - the operation is neither BLKIF_OP_READ nor BLKIF_OP_WRITE,
+ *   - it is a write to a read-only image,
+ *   - the segment count is 0 or exceeds MAX_SEGMENTS_PER_REQ,
+ *   - a segment ends outside its page or has last_sect < first_sect,
+ *   - the sectors of all segments together, starting at
+ *     req->sector_number, extend beyond the driver-reported size.
+ */
+int
+tapdisk_image_check_ring_request(td_image_t *image, blkif_request_t *req)
+{
+       td_driver_t *driver;
+       td_disk_info_t *info;
+       int i, psize, rdonly;
+       uint64_t total;
+
+       driver = image->driver;
+       if (!driver)
+               return -ENODEV;
+
+       total  = 0;
+       info   = &driver->info;
+
+       rdonly = td_flag_test(image->flags, TD_OPEN_RDONLY);
+
+       if (req->operation != BLKIF_OP_READ &&
+           req->operation != BLKIF_OP_WRITE)
+               goto fail;
+
+       if (req->operation == BLKIF_OP_WRITE && rdonly)
+               goto fail;
+
+       if (!req->nr_segments || req->nr_segments > MAX_SEGMENTS_PER_REQ)
+               goto fail;
+
+       psize = getpagesize();
+
+       for (i = 0; i < req->nr_segments; i++) {
+               /*
+                * Reject inverted segments before computing the count:
+                * the difference would otherwise wrap to a huge unsigned
+                * value instead of going negative.
+                */
+               if (req->seg[i].last_sect >= psize >> 9 ||
+                   req->seg[i].last_sect < req->seg[i].first_sect)
+                       goto fail;
+
+               total += req->seg[i].last_sect - req->seg[i].first_sect + 1;
+       }
+
+       /*
+        * Bound the whole request, not just its last segment; written so
+        * that sector_number + total cannot wrap around.
+        */
+       if (total > info->size || req->sector_number > info->size - total)
+               goto fail;
+
+       return 0;
+
+fail:
+       ERR(-EINVAL, "bad request on %s (%s, %"PRIu64"): id: %"PRIu64": %d at %"PRIu64,
+           image->name, (rdonly ? "ro" : "rw"), info->size, req->id,
+           req->operation, req->sector_number + total);
+       return -EINVAL;
+}
diff --git a/tools/blktap2/drivers/tapdisk-image.h b/tools/blktap2/drivers/tapdisk-image.h
new file mode 100644 (file)
index 0000000..60375ae
--- /dev/null
@@ -0,0 +1,56 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_IMAGE_H_
+#define _TAPDISK_IMAGE_H_
+
+#include "tapdisk.h"
+#include <xen/io/blkif.h>
+
+/*
+ * Per-image state. The td_* interface functions reach the backing driver
+ * through ->driver; a NULL ->driver is treated by them as "no device"
+ * (-ENODEV).
+ */
+struct td_image_handle {
+       int                          type;      /* passed to tapdisk_driver_allocate() on open */
+       char                        *name;      /* passed to the driver's td_open hook; used in log messages */
+    uint16_t                     memshr_id;
+
+       td_flag_t                    flags;     /* open flags, e.g. TD_OPEN_RDONLY */
+       int                          storage;   /* passed to tapdisk_driver_allocate() on open */
+
+       td_driver_t                 *driver;    /* set by td_load()/__td_open(); NULL until then */
+       td_disk_info_t               info;      /* copy of driver->info taken at load/open time */
+
+       void                        *private;
+
+       struct list_head             next;      /* list linkage; owner of the list not visible here */
+};
+
+td_image_t *tapdisk_image_allocate(const char *, int, int, td_flag_t, void *);
+void tapdisk_image_free(td_image_t *);
+
+int tapdisk_image_check_td_request(td_image_t *, td_request_t);
+int tapdisk_image_check_ring_request(td_image_t *, blkif_request_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-interface.c b/tools/blktap2/drivers/tapdisk-interface.c
new file mode 100644 (file)
index 0000000..2e51883
--- /dev/null
@@ -0,0 +1,259 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+
+#include "tapdisk.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-image.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+
+/*
+ * Attach @image to the driver of an already known shared image instead of
+ * opening a driver of its own.
+ *
+ * On success the shared driver's reference count is incremented and the
+ * image receives the driver pointer plus a copy of its disk info.
+ *
+ * Returns 0 on success, -ENODEV if the server has no shared image for
+ * @image, and -EBADF if that shared image has no driver.
+ */
+int
+td_load(td_image_t *image)
+{
+       td_image_t *shared;
+       td_driver_t *driver;
+
+       shared = tapdisk_server_get_shared_image(image);
+       if (!shared)
+               return -ENODEV;
+
+       driver = shared->driver;
+       if (!driver)
+               return -EBADF;
+
+       driver->refcnt++;
+       image->driver = driver;
+       image->info   = driver->info;
+
+       DPRINTF("loaded shared image %s (%d users, state: 0x%08x, type: %d)\n",
+               driver->name, driver->refcnt, driver->state, driver->type);
+       return 0;
+}
+
+/*
+ * Open the driver behind @image, allocating it first if the image does
+ * not have one yet.
+ *
+ * @info, when non-NULL, is copied into a freshly allocated driver before
+ * its td_open hook runs. A driver that is already flagged TD_DRIVER_OPEN
+ * is not opened again; the image merely takes another reference.
+ *
+ * Returns 0 on success, -ENOMEM if the driver cannot be allocated, or the
+ * error reported by the driver's td_open hook.
+ */
+int
+__td_open(td_image_t *image, td_disk_info_t *info)
+{
+       td_driver_t *drv = image->driver;
+       int rc;
+
+       if (drv == NULL) {
+               drv = tapdisk_driver_allocate(image->type, image->name,
+                                             image->flags, image->storage);
+               if (drv == NULL)
+                       return -ENOMEM;
+
+               /* virtual drivers get their disk info handed in up front */
+               if (info != NULL)
+                       drv->info = *info;
+       }
+
+       if (!td_flag_test(drv->state, TD_DRIVER_OPEN)) {
+               rc = drv->ops->td_open(drv, image->name, image->flags);
+
+               if (rc != 0) {
+                       /* only release a driver this call allocated */
+                       if (!image->driver)
+                               tapdisk_driver_free(drv);
+                       return rc;
+               }
+
+               td_flag_set(drv->state, TD_DRIVER_OPEN);
+               DPRINTF("opened image %s (%d users, state: 0x%08x, type: %d)\n",
+                       drv->name, drv->refcnt + 1,
+                       drv->state, drv->type);
+       }
+
+       image->driver = drv;
+       image->info   = drv->info;
+       drv->refcnt++;
+
+       return 0;
+}
+
+/*
+ * Open @image without pre-seeded disk info; see __td_open() for the
+ * return values.
+ */
+int
+td_open(td_image_t *image)
+{
+       return __td_open(image, NULL);
+}
+
+/*
+ * Drop this image's reference on its driver. When the last reference
+ * goes away and the driver is still flagged open, the driver's td_close
+ * hook is invoked and TD_DRIVER_OPEN is cleared.
+ *
+ * Returns 0, or -ENODEV if the image has no driver.
+ */
+int
+td_close(td_image_t *image)
+{
+       td_driver_t *drv = image->driver;
+
+       if (drv == NULL)
+               return -ENODEV;
+
+       drv->refcnt -= 1;
+
+       if (drv->refcnt == 0 && td_flag_test(drv->state, TD_DRIVER_OPEN)) {
+               drv->ops->td_close(drv);
+               td_flag_clear(drv->state, TD_DRIVER_OPEN);
+       }
+
+       DPRINTF("closed image %s (%d users, state: 0x%08x, type: %d)\n",
+               drv->name, drv->refcnt, drv->state, drv->type);
+
+       return 0;
+}
+
+/*
+ * Ask the driver behind @image for the id of its parent image.
+ *
+ * Returns the result of the driver's td_get_parent_id hook, -ENODEV if
+ * the image has no driver, or -EBADF if the driver is not open.
+ */
+int
+td_get_parent_id(td_image_t *image, td_disk_id_t *id)
+{
+       td_driver_t *drv = image->driver;
+
+       if (drv == NULL)
+               return -ENODEV;
+
+       if (td_flag_test(drv->state, TD_DRIVER_OPEN))
+               return drv->ops->td_get_parent_id(drv, id);
+
+       return -EBADF;
+}
+
+/*
+ * Check that both @image and @parent have a driver and that both drivers
+ * are open.
+ *
+ * Returns 0 when both are open, -ENODEV if either image has no driver,
+ * and -EBADF if either driver is not open.
+ *
+ * NOTE(review): the unconditional "return 0" below makes the call to the
+ * driver's td_validate_parent hook unreachable, so no driver-specific
+ * validation is performed. Presumably it was disabled on purpose;
+ * confirm, then either remove the dead call or re-enable it.
+ */
+int
+td_validate_parent(td_image_t *image, td_image_t *parent)
+{
+       td_driver_t *driver, *pdriver;
+
+       driver  = image->driver;
+       pdriver = parent->driver;
+       if (!driver || !pdriver)
+               return -ENODEV;
+
+       if (!td_flag_test(driver->state, TD_DRIVER_OPEN) ||
+           !td_flag_test(pdriver->state, TD_DRIVER_OPEN))
+               return -EBADF;
+
+       return 0;
+       return driver->ops->td_validate_parent(driver, pdriver, 0);
+}
+
+/*
+ * Hand a write request to the driver behind @image.
+ *
+ * The request is completed immediately with an error instead of being
+ * queued if the image has no driver (-ENODEV), the driver is not open
+ * (-EBADF), or tapdisk_image_check_td_request() rejects it.
+ */
+void
+td_queue_write(td_image_t *image, td_request_t treq)
+{
+       td_driver_t *drv = image->driver;
+       int rc;
+
+       if (drv == NULL)
+               rc = -ENODEV;
+       else if (!td_flag_test(drv->state, TD_DRIVER_OPEN))
+               rc = -EBADF;
+       else
+               rc = tapdisk_image_check_td_request(image, treq);
+
+       if (rc != 0) {
+               td_complete_request(treq, rc);
+               return;
+       }
+
+       drv->ops->td_queue_write(drv, treq);
+}
+
+/*
+ * Hand a read request to the driver behind @image.
+ *
+ * The request is completed immediately with an error instead of being
+ * queued if the image has no driver (-ENODEV), the driver is not open
+ * (-EBADF), or tapdisk_image_check_td_request() rejects it.
+ */
+void
+td_queue_read(td_image_t *image, td_request_t treq)
+{
+       td_driver_t *drv = image->driver;
+       int rc;
+
+       if (drv == NULL)
+               rc = -ENODEV;
+       else if (!td_flag_test(drv->state, TD_DRIVER_OPEN))
+               rc = -EBADF;
+       else
+               rc = tapdisk_image_check_td_request(image, treq);
+
+       if (rc != 0) {
+               td_complete_request(treq, rc);
+               return;
+       }
+
+       drv->ops->td_queue_read(drv, treq);
+}
+
+/* Pass @treq on to tapdisk_vbd_forward_request(). */
+void
+td_forward_request(td_request_t treq)
+{
+       tapdisk_vbd_forward_request(treq);
+}
+
+/* Finish @treq by invoking its callback (treq.cb) with result @res. */
+void
+td_complete_request(td_request_t treq, int res)
+{
+       ((td_callback_t)treq.cb)(treq, res);
+}
+
+/* Queue @tiocb on behalf of @driver via tapdisk_driver_queue_tiocb(). */
+void
+td_queue_tiocb(td_driver_t *driver, struct tiocb *tiocb)
+{
+       tapdisk_driver_queue_tiocb(driver, tiocb);
+}
+
+/*
+ * Prepare @tiocb for a read of @bytes bytes at @offset on @fd into @buf,
+ * with @cb/@arg as completion callback. The literal 0 passed to
+ * tapdisk_prep_tiocb() is presumably its read/write selector (the write
+ * variant passes 1) -- confirm against tapdisk_prep_tiocb().
+ */
+void
+td_prep_read(struct tiocb *tiocb, int fd, char *buf, size_t bytes,
+            long long offset, td_queue_callback_t cb, void *arg)
+{
+       tapdisk_prep_tiocb(tiocb, fd, 0, buf, bytes, offset, cb, arg);
+}
+
+/*
+ * Prepare @tiocb for a write of @bytes bytes from @buf at @offset on @fd,
+ * with @cb/@arg as completion callback. The literal 1 passed to
+ * tapdisk_prep_tiocb() is presumably its read/write selector (the read
+ * variant passes 0) -- confirm against tapdisk_prep_tiocb().
+ */
+void
+td_prep_write(struct tiocb *tiocb, int fd, char *buf, size_t bytes,
+             long long offset, td_queue_callback_t cb, void *arg)
+{
+       tapdisk_prep_tiocb(tiocb, fd, 1, buf, bytes, offset, cb, arg);
+}
+
+/*
+ * Emit driver debug output for @image via tapdisk_driver_debug().
+ * Does nothing if the image has no driver or the driver is not open.
+ */
+void
+td_debug(td_image_t *image)
+{
+       td_driver_t *drv = image->driver;
+
+       if (drv && td_flag_test(drv->state, TD_DRIVER_OPEN))
+               tapdisk_driver_debug(drv);
+}
diff --git a/tools/blktap2/drivers/tapdisk-interface.h b/tools/blktap2/drivers/tapdisk-interface.h
new file mode 100644 (file)
index 0000000..adc4376
--- /dev/null
@@ -0,0 +1,54 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_INTERFACE_H_
+#define _TAPDISK_INTERFACE_H_
+
+#include "tapdisk.h"
+#include "tapdisk-queue.h"
+
+int td_open(td_image_t *);
+int __td_open(td_image_t *, td_disk_info_t *);
+int td_load(td_image_t *);
+int td_close(td_image_t *);
+int td_get_parent_id(td_image_t *, td_disk_id_t *);
+int td_validate_parent(td_image_t *, td_image_t *);
+
+void td_queue_write(td_image_t *, td_request_t);
+void td_queue_read(td_image_t *, td_request_t);
+void td_forward_request(td_request_t);
+void td_complete_request(td_request_t, int);
+
+void td_debug(td_image_t *);
+
+void td_queue_tiocb(td_driver_t *, struct tiocb *);
+void td_prep_read(struct tiocb *, int, char *, size_t,
+                 long long, td_queue_callback_t, void *);
+void td_prep_write(struct tiocb *, int, char *, size_t,
+                  long long, td_queue_callback_t, void *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-log.c b/tools/blktap2/drivers/tapdisk-log.c
new file mode 100644 (file)
index 0000000..d14da32
--- /dev/null
@@ -0,0 +1,257 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <syslog.h>
+#include <inttypes.h>
+#include <sys/time.h>
+
+#include "tapdisk-log.h"
+#include "tapdisk-utils.h"
+
+#define MAX_ENTRY_LEN      512
+#define MAX_ERROR_MESSAGES 16
+
+/* One recorded error; repeats of the same (err, func) pair share a slot. */
+struct error {
+       int            cnt;                 /* how many times this error was reported */
+       int            err;                 /* error number, stored as a non-negative value */
+       char          *func;                /* reporting function name; matched by pointer, not by content */
+       char           msg[MAX_ENTRY_LEN];  /* message formatted on the first report */
+};
+
+/* Fixed-size table of distinct errors seen so far. */
+struct ehandle {
+       int            cnt;                         /* number of slots of errors[] in use */
+       int            dropped;                     /* new errors discarded because the table was full */
+       struct error   errors[MAX_ERROR_MESSAGES];
+};
+
+/* State of the in-memory log buffer; buf == NULL means logging is off. */
+struct tlog {
+       char          *p;       /* write cursor inside buf */
+       int            size;    /* size of buf in bytes (requested size rounded up to 512) */
+       uint64_t       cnt;     /* number of entries written; printed as each entry's prefix */
+       char          *buf;     /* 512-byte aligned buffer from posix_memalign() */
+       int            level;   /* entries with a level above this are not recorded */
+       char          *file;    /* log file path, "<file>.<pid>", from asprintf() */
+       int            append;  /* non-zero: flush when the buffer fills and on close, appending to the file */
+};
+
+static struct ehandle tapdisk_err;
+static struct tlog tapdisk_log;
+
+/*
+ * Set up the in-memory log.
+ *
+ * @file:   base name of the log file; the pid is appended ("<file>.<pid>").
+ * @bytes:  requested buffer size; rounded up to a multiple of 512 and to
+ *          at least MAX_ENTRY_LEN.
+ * @level:  highest TLOG_* level that will be recorded.
+ * @append: non-zero to flush the buffer to the file whenever it fills up
+ *          (and on close) instead of overwriting old entries in memory.
+ *
+ * On allocation failure logging stays disabled (tapdisk_log.buf is NULL)
+ * and nothing is left allocated.
+ */
+void
+open_tlog(char *file, size_t bytes, int level, int append)
+{
+       tapdisk_log.size = ((bytes + 511) & (~511));
+
+       /*
+        * __tlog_write() formats up to MAX_ENTRY_LEN bytes at the start of
+        * the buffer after wrapping, so a smaller buffer (e.g. bytes == 0)
+        * would be overrun.
+        */
+       if (tapdisk_log.size < MAX_ENTRY_LEN)
+               tapdisk_log.size = MAX_ENTRY_LEN;
+
+       if (asprintf(&tapdisk_log.file, "%s.%d", file, getpid()) == -1) {
+               /* the pointer is indeterminate after a failed asprintf */
+               tapdisk_log.file = NULL;
+               return;
+       }
+
+       if (posix_memalign((void **)&tapdisk_log.buf, 512, tapdisk_log.size)) {
+               free(tapdisk_log.file);
+               tapdisk_log.file = NULL;
+               tapdisk_log.buf = NULL;
+               return;
+       }
+
+       memset(tapdisk_log.buf, 0, tapdisk_log.size);
+
+       tapdisk_log.p      = tapdisk_log.buf;
+       tapdisk_log.level  = level;
+       tapdisk_log.append = append;
+}
+
+/*
+ * Tear down the in-memory log: in append mode flush what is buffered,
+ * then release the buffer and file name and reset the log state.
+ * Does nothing if the log was never opened.
+ */
+void
+close_tlog(void)
+{
+       if (tapdisk_log.buf != NULL) {
+               if (tapdisk_log.append)
+                       tlog_flush();
+
+               free(tapdisk_log.buf);
+               free(tapdisk_log.file);
+
+               memset(&tapdisk_log, 0, sizeof(tapdisk_log));
+       }
+}
+
+/*
+ * Append one formatted entry to the in-memory log.
+ *
+ * @level: TLOG_* level of the entry; dropped if above the configured level.
+ * @func:  name of the calling function, included in the entry prefix.
+ * @fmt:   printf-style format for the message, followed by its arguments.
+ *
+ * Each entry is "<seq>:<sec>.<usec>:<func> <message>", truncated so that
+ * it never exceeds MAX_ENTRY_LEN bytes including the terminator. When
+ * fewer than MAX_ENTRY_LEN bytes remain, the buffer wraps to its start
+ * (after a flush in append mode). No-op if the log is not open.
+ */
+void
+__tlog_write(int level, const char *func, const char *fmt, ...)
+{
+       char *buf;
+       va_list ap;
+       struct timeval t;
+       int ret, len, avail, room;
+
+       if (!tapdisk_log.buf)
+               return;
+
+       if (level > tapdisk_log.level)
+               return;
+
+       avail = tapdisk_log.size - (tapdisk_log.p - tapdisk_log.buf);
+       if (avail < MAX_ENTRY_LEN) {
+               if (tapdisk_log.append)
+                       tlog_flush();
+               tapdisk_log.p = tapdisk_log.buf;
+       }
+
+       buf = tapdisk_log.p;
+       gettimeofday(&t, NULL);
+       len = snprintf(buf, MAX_ENTRY_LEN - 1, "%08"PRIu64":%010ld.%06lld:"
+                      "%s ", tapdisk_log.cnt,
+                      (long)t.tv_sec, (long long)t.tv_usec, func);
+
+       /*
+        * snprintf returns the length the output would have had (or a
+        * negative value on error), not what was stored. Clamp it so the
+        * remaining-space computation below can never go negative.
+        */
+       if (len < 0)
+               len = 0;
+       else if (len > MAX_ENTRY_LEN - 2)
+               len = MAX_ENTRY_LEN - 2;
+
+       room = MAX_ENTRY_LEN - (len + 1);
+
+       va_start(ap, fmt);
+       ret = vsnprintf(buf + len, room, fmt, ap);
+       va_end(ap);
+
+       if (ret < 0)
+               ret = 0;
+
+       /* on truncation only room - 1 message characters were stored */
+       len += (ret < room ? ret : room - 1);
+       buf[len] = '\0';
+
+       tapdisk_log.cnt++;
+       tapdisk_log.p += len;
+}
+
+/*
+ * Record an error in the fixed-size error table.
+ *
+ * @err:  error number; its sign is ignored.
+ * @func: name of the reporting function. Entries are matched by pointer
+ *        identity, so repeats from the same call site share one slot.
+ * @fmt:  printf-style format for the message, followed by its arguments.
+ *
+ * A repeat of a known (err, func) pair only bumps that entry's counter;
+ * the message of the first occurrence is kept. Once all
+ * MAX_ERROR_MESSAGES slots are used, new errors are merely counted as
+ * dropped.
+ */
+void
+__tlog_error(int err, const char *func, const char *fmt, ...)
+{
+       va_list ap;
+       int i, len, ret, room;
+       struct error *e;
+       struct timeval t;
+
+       err = (err > 0 ? err : -err);
+
+       for (i = 0; i < tapdisk_err.cnt; i++) {
+               e = &tapdisk_err.errors[i];
+               if (e->err == err && e->func == func) {
+                       e->cnt++;
+                       return;
+               }
+       }
+
+       if (tapdisk_err.cnt >= MAX_ERROR_MESSAGES) {
+               tapdisk_err.dropped++;
+               return;
+       }
+
+       gettimeofday(&t, NULL);
+       e = &tapdisk_err.errors[tapdisk_err.cnt];
+
+       len = snprintf(e->msg, MAX_ENTRY_LEN - 1, "%010ld.%06lld:%s ",
+                      (long)t.tv_sec, (long long)t.tv_usec, func);
+
+       /*
+        * snprintf returns the length the output would have had (or a
+        * negative value on error), not what was stored. Clamp it so the
+        * remaining-space computation below can never go negative.
+        */
+       if (len < 0)
+               len = 0;
+       else if (len > MAX_ENTRY_LEN - 2)
+               len = MAX_ENTRY_LEN - 2;
+
+       room = MAX_ENTRY_LEN - (len + 1);
+
+       va_start(ap, fmt);
+       ret = vsnprintf(e->msg + len, room, fmt, ap);
+       va_end(ap);
+
+       if (ret < 0)
+               ret = 0;
+
+       /* on truncation only room - 1 message characters were stored */
+       len += (ret < room ? ret : room - 1);
+       e->msg[len] = '\0';
+
+       e->cnt++;
+       e->err  = err;
+       e->func = (char *)func;
+       tapdisk_err.cnt++;
+}
+
+/*
+ * Report every recorded error, plus the number of dropped ones if any,
+ * to syslog at LOG_INFO.
+ */
+void
+tlog_print_errors(void)
+{
+       struct error *cur = tapdisk_err.errors;
+       struct error *end = cur + tapdisk_err.cnt;
+
+       while (cur < end) {
+               syslog(LOG_INFO, "TAPDISK ERROR: errno %d at %s (cnt = %d): "
+                      "%s\n", cur->err, cur->func, cur->cnt, cur->msg);
+               cur++;
+       }
+
+       if (tapdisk_err.dropped != 0)
+               syslog(LOG_INFO, "TAPDISK ERROR: %d other error messages "
+                      "dropped\n", tapdisk_err.dropped);
+}
+
+/*
+ * Copy every recorded error, plus the number of dropped ones if any,
+ * into the in-memory log at TLOG_WARN.
+ */
+void
+tlog_flush_errors(void)
+{
+       struct error *cur = tapdisk_err.errors;
+       struct error *end = cur + tapdisk_err.cnt;
+
+       while (cur < end) {
+               tlog_write(TLOG_WARN, "TAPDISK ERROR: errno %d at %s "
+                          "(cnt = %d): %s\n", cur->err, cur->func, cur->cnt,
+                          cur->msg);
+               cur++;
+       }
+
+       if (tapdisk_err.dropped != 0)
+               tlog_write(TLOG_WARN, "TAPDISK ERROR: %d other error messages "
+                          "dropped\n", tapdisk_err.dropped);
+}
+
+/*
+ * Write the in-memory log to the log file and rewind the buffer.
+ *
+ * The recorded errors are first copied into the buffer, then the used
+ * part of the buffer, padded with newlines to a multiple of 512 bytes, is
+ * written out. In append mode the data goes to the end of the file;
+ * otherwise the file is truncated first. Failures to open or seek are
+ * silent and leave the buffer untouched. No-op if the log is not open.
+ *
+ * NOTE(review): tlog_flush_errors() goes through __tlog_write(), which in
+ * append mode calls tlog_flush() again when fewer than MAX_ENTRY_LEN
+ * bytes are free -- this looks re-entrant when the buffer is nearly full;
+ * confirm. The result of write_exact() is also not checked.
+ */
+void
+tlog_flush(void)
+{
+       int fd, flags;
+       size_t size, wsize;
+
+       if (!tapdisk_log.buf)
+               return;
+
+       /* presumably O_DIRECT is why buffer and write size are 512-aligned */
+       flags = O_CREAT | O_WRONLY | O_DIRECT | O_NONBLOCK;
+       if (!tapdisk_log.append)
+               flags |= O_TRUNC;
+
+       fd = open(tapdisk_log.file, flags, 0644);
+       if (fd == -1)
+               return;
+
+       if (tapdisk_log.append)
+               if (lseek(fd, 0, SEEK_END) == (off_t)-1)
+                       goto out;
+
+       tlog_flush_errors();
+
+       /* pad the used part of the buffer with newlines up to 512 bytes */
+       size  = tapdisk_log.p - tapdisk_log.buf;
+       wsize = ((size + 511) & (~511));
+
+       memset(tapdisk_log.buf + size, '\n', wsize - size);
+       write_exact(fd, tapdisk_log.buf, wsize);
+
+       tapdisk_log.p = tapdisk_log.buf;
+
+out:
+       close(fd);
+}
diff --git a/tools/blktap2/drivers/tapdisk-log.h b/tools/blktap2/drivers/tapdisk-log.h
new file mode 100644 (file)
index 0000000..ae2a408
--- /dev/null
@@ -0,0 +1,51 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_LOG_H_
+#define _TAPDISK_LOG_H_
+
+#define TLOG_WARN       0
+#define TLOG_INFO       1
+#define TLOG_DBG        2
+
+void open_tlog(char *file, size_t bytes, int level, int append);
+void close_tlog(void);
+void tlog_flush(void);
+void tlog_print_errors(void);
+
+void __tlog_write(int level, const char *func, const char *fmt, ...)
+  __attribute__((format(printf, 3, 4)));
+void __tlog_error(int err, const char *func, const char *fmt, ...)
+  __attribute__((format(printf, 3, 4)));
+
+#define tlog_write(_level, _f, _a...)                  \
+       __tlog_write(_level, __func__, _f, ##_a)
+
+#define tlog_error(_err, _f, _a...)                    \
+       __tlog_error(_err, __func__, _f, ##_a)
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-queue.c b/tools/blktap2/drivers/tapdisk-queue.c
new file mode 100644 (file)
index 0000000..1a94038
--- /dev/null
@@ -0,0 +1,743 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libaio.h>
+#ifdef __linux__
+#include <linux/version.h>
+#endif
+
+#include "tapdisk.h"
+#include "tapdisk-log.h"
+#include "tapdisk-queue.h"
+#include "tapdisk-filter.h"
+#include "tapdisk-server.h"
+#include "tapdisk-utils.h"
+
+#include "libaio-compat.h"
+#include "atomicio.h"
+
+#define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a)
+#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/*
+ * We used a kernel patch to return an fd associated with the AIO context
+ * so that we can concurrently poll on synchronous and async descriptors.
+ * This is signalled by passing 1 as the io context to io_setup.
+ */
+#define REQUEST_ASYNC_FD ((io_context_t)1)
+
+/*
+ * Append @tiocb to the queue's array of pending iocbs and chain it
+ * behind the previously queued tiocb through ->next.
+ */
+static inline void
+queue_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+       int slot = queue->queued;
+
+       if (slot != 0) {
+               struct tiocb *last = (struct tiocb *)queue->iocbs[slot - 1]->data;
+
+               last->next = tiocb;
+       }
+
+       queue->iocbs[slot] = &tiocb->iocb;
+       queue->queued = slot + 1;
+}
+
+/* Return non-zero if the queue's deferred list is not empty. */
+static inline int
+deferred_tiocbs(struct tqueue *queue)
+{
+       return queue->deferred.head ? 1 : 0;
+}
+
+/*
+ * Put @tiocb at the tail of the queue's deferred list and update the
+ * deferral counters.
+ */
+static inline void
+defer_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+       struct tlist *pending = &queue->deferred;
+
+       if (pending->head)
+               pending->tail->next = tiocb;
+       else
+               pending->head = tiocb;
+
+       pending->tail = tiocb;
+
+       queue->tiocbs_deferred++;
+       queue->deferrals++;
+}
+
+/*
+ * Move the oldest deferred tiocb, if there is one, from the deferred
+ * list onto the queue.
+ */
+static inline void
+queue_deferred_tiocb(struct tqueue *queue)
+{
+       struct tlist *pending = &queue->deferred;
+       struct tiocb *first = pending->head;
+
+       if (!first)
+               return;
+
+       pending->head = first->next;
+       if (!pending->head)
+               pending->tail = NULL;
+
+       queue_tiocb(queue, first);
+       queue->tiocbs_deferred--;
+}
+
+/*
+ * Move deferred tiocbs onto the queue until the queue is full or the
+ * deferred list is empty.
+ */
+static inline void
+queue_deferred_tiocbs(struct tqueue *queue)
+{
+       for (;;) {
+               if (tapdisk_queue_full(queue))
+                       break;
+               if (!deferred_tiocbs(queue))
+                       break;
+
+               queue_deferred_tiocb(queue);
+       }
+}
+
+/*
+ * Translate the raw result of one iocb into an error code and invoke the
+ * tiocb's callback: 0 if exactly the requested number of bytes was
+ * transferred, the result itself if it is negative when viewed as an
+ * int, and -EIO for any other (short) transfer.
+ *
+ * td_complete may queue more tiocbs
+ */
+static void
+complete_tiocb(struct tqueue *queue, struct tiocb *tiocb, unsigned long res)
+{
+       struct iocb *io = &tiocb->iocb;
+       int status = -EIO;
+
+       if (res == io->u.c.nbytes)
+               status = 0;
+       else if ((int)res < 0)
+               status = (int)res;
+
+       tiocb->cb(tiocb->arg, tiocb, status);
+}
+
+/*
+ * Complete every currently queued tiocb with result @err and empty the
+ * queue. Returns the number of iocbs that were queued.
+ */
+static int
+cancel_tiocbs(struct tqueue *queue, int err)
+{
+       int pending = queue->queued;
+       struct tiocb *cur;
+
+       if (pending == 0)
+               return 0;
+
+       /*
+        * td_complete may queue more tiocbs, which
+        * will overwrite the contents of queue->iocbs.
+        * walk the chain the tiocbs form through ->next
+        * rather than the array while cancelling.
+        */
+       cur = queue->iocbs[0]->data;
+       queue->queued = 0;
+
+       while (cur != NULL) {
+               complete_tiocb(queue, cur, err);
+               cur = cur->next;
+       }
+
+       return pending;
+}
+
+/*
+ * Handle a partially failed submission: log it, put the iocbs that were
+ * not submitted back on the queue in split form, and cancel them with
+ * @err. Returns what cancel_tiocbs() returns.
+ */
+static int
+fail_tiocbs(struct tqueue *queue, int succeeded, int total, int err)
+{
+       int unsubmitted;
+
+       ERR(err, "io_submit error: %d of %d failed",
+           total - succeeded, total);
+
+       /* take any non-submitted, merged iocbs
+        * off of the queue, split them, and fail them */
+       unsubmitted = io_expand_iocbs(&queue->opioctx, queue->iocbs,
+                                     succeeded, total);
+       queue->queued = unsubmitted;
+
+       return cancel_tiocbs(queue, err);
+}
+
+/*
+ * rwio
+ */
+
+/* Private state of the synchronous read/write backend. */
+struct rwio {
+       struct io_event *aio_events;    /* result slots filled by tapdisk_rwio_submit() */
+};
+
+/* Release the event array of the rwio backend, if one was allocated. */
+static void
+tapdisk_rwio_destroy(struct tqueue *queue)
+{
+       struct rwio *state = queue->tio_data;
+
+       /* free(NULL) is a no-op, so no guard is needed */
+       free(state->aio_events);
+       state->aio_events = NULL;
+}
+
+/*
+ * Allocate a zeroed array of @size io_event slots for the rwio backend.
+ *
+ * Returns 0 on success or -errno if the allocation fails.
+ */
+static int
+tapdisk_rwio_setup(struct tqueue *queue, int size)
+{
+       struct rwio *rwio = queue->tio_data;
+
+       rwio->aio_events = calloc(size, sizeof(struct io_event));
+       if (!rwio->aio_events)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Carry out one iocb synchronously: seek to its offset, then transfer
+ * all of its bytes with read, or with vwrite for IO_CMD_PWRITE.
+ *
+ * Returns the number of bytes requested on success, or -errno if the
+ * seek fails or the transfer comes up short.
+ */
+static inline ssize_t
+tapdisk_rwio_rw(const struct iocb *iocb)
+{
+       ssize_t (*xfer)(int, void *, size_t);
+       int fd         = iocb->aio_fildes;
+       char *data     = iocb->u.c.buf;
+       long long pos  = iocb->u.c.offset;
+       size_t nbytes  = iocb->u.c.nbytes;
+
+       if (iocb->aio_lio_opcode == IO_CMD_PWRITE)
+               xfer = vwrite;
+       else
+               xfer = read;
+
+       if (lseek(fd, pos, SEEK_SET) == (off_t)-1)
+               return -errno;
+
+       if (atomicio(xfer, fd, data, nbytes) != nbytes)
+               return -errno;
+
+       return nbytes;
+}
+
+/*
+ * Submit everything on the queue through the synchronous backend.
+ *
+ * The queued iocbs are filtered and merged, each merged iocb is executed
+ * in turn with tapdisk_rwio_rw(), the results are split back into
+ * per-request events, and every tiocb is completed. Deferred tiocbs are
+ * then moved onto the queue.
+ *
+ * Returns the number of events after splitting, or 0 if nothing was
+ * queued.
+ */
+static int
+tapdisk_rwio_submit(struct tqueue *queue)
+{
+       struct rwio *rwio = queue->tio_data;
+       int i, merged, split;
+       struct iocb *iocb;
+       struct tiocb *tiocb;
+       struct io_event *ep;
+
+       if (!queue->queued)
+               return 0;
+
+       tapdisk_filter_iocbs(queue->filter, queue->iocbs, queue->queued);
+       merged = io_merge(&queue->opioctx, queue->iocbs, queue->queued);
+
+       /* the queue is emptied before completions run; they may queue more */
+       queue->queued = 0;
+
+       /* perform each merged request and record its result as an event */
+       for (i = 0; i < merged; i++) {
+               ep      = rwio->aio_events + i;
+               iocb    = queue->iocbs[i];
+               ep->obj = iocb;
+               ep->res = tapdisk_rwio_rw(iocb);
+       }
+
+       split = io_split(&queue->opioctx, rwio->aio_events, merged);
+       tapdisk_filter_events(queue->filter, rwio->aio_events, split);
+
+       /* complete the tiocb behind every event */
+       for (i = split, ep = rwio->aio_events; i-- > 0; ep++) {
+               iocb  = ep->obj;
+               tiocb = iocb->data;
+               complete_tiocb(queue, tiocb, ep->res);
+       }
+
+       queue_deferred_tiocbs(queue);
+
+       return split;
+}
+
+/*
+ * Operations table of the synchronous "rwio" backend.
+ *
+ * NOTE(review): data_size is 0 and neither tapdisk_rwio_setup() nor
+ * tapdisk_rwio_destroy() is referenced here, although
+ * tapdisk_rwio_submit() dereferences queue->tio_data and its aio_events
+ * array. Confirm how tio_data gets set up for this backend before
+ * relying on it.
+ */
+static const struct tio td_tio_rwio = {
+       .name        = "rwio",
+       .data_size   = 0,
+       .tio_setup   = NULL,
+       .tio_destroy = NULL,
+       .tio_submit  = tapdisk_rwio_submit
+};
+
+/*
+ * libaio
+ */
+
+/* Private state of the libaio backend. */
+struct lio {
+       io_context_t     aio_ctx;       /* AIO context from io_setup(); 0 when none */
+       struct io_event *aio_events;
+
+       int              event_fd;      /* descriptor signalling completions; -1 when none */
+       int              event_id;      /* server event registration; negative when none */
+
+       int              flags;         /* LIO_FLAG_* bits */
+};
+
+#define LIO_FLAG_EVENTFD        (1<<0)
+
+/*
+ * Return non-zero if the eventfd-based setup should be attempted: on
+ * Linux only when the running kernel is 2.6.22 or newer, on any other
+ * platform always.
+ */
+static int
+tapdisk_lio_check_resfd(void)
+{
+#if defined(__linux__)
+       return tapdisk_linux_version() >= KERNEL_VERSION(2, 6, 22);
+#else
+       return 1;
+#endif
+}
+
+/*
+ * Close the completion descriptor and destroy the AIO context of the
+ * libaio backend, whichever of the two exist.
+ */
+static void
+tapdisk_lio_destroy_aio(struct tqueue *queue)
+{
+       struct lio *state = queue->tio_data;
+
+       if (!(state->event_fd < 0)) {
+               close(state->event_fd);
+               state->event_fd = -1;
+       }
+
+       if (!state->aio_ctx)
+               return;
+
+       io_destroy(state->aio_ctx);
+       state->aio_ctx = 0;
+}
+
+/*
+ * Set up the AIO context through the patched-kernel interface: io_setup()
+ * is called with the context preset to REQUEST_ASYNC_FD and its return
+ * value is used as the pollable completion descriptor.
+ *
+ * Returns 0 on success or -errno on failure; -EINVAL additionally logs a
+ * hint that the kernel lacks the aio-poll patch.
+ *
+ * NOTE(review): the error is taken from errno rather than from
+ * io_setup()'s return value -- confirm which one carries the error with
+ * the libaio in use.
+ */
+static int
+__lio_setup_aio_poll(struct tqueue *queue, int qlen)
+{
+       struct lio *lio = queue->tio_data;
+       int err, fd;
+
+       lio->aio_ctx = REQUEST_ASYNC_FD;
+
+       fd = io_setup(qlen, &lio->aio_ctx);
+       if (fd < 0) {
+               lio->aio_ctx = 0;
+               err = -errno;
+
+               if (err == -EINVAL)
+                       goto fail_fd;
+
+               goto fail;
+       }
+
+       lio->event_fd = fd;
+
+       return 0;
+
+fail_fd:
+       DPRINTF("Couldn't get fd for AIO poll support. This is probably "
+               "because your kernel does not have the aio-poll patch "
+               "applied.\n");
+fail:
+       return err;
+}
+
+/*
+ * Set up the AIO context for eventfd-based completion notification.
+ *
+ * Creates a context for qlen requests, then obtains an event descriptor
+ * through tapdisk_sys_eventfd() and stores it in lio->event_fd. On
+ * success LIO_FLAG_EVENTFD is set in lio->flags.
+ *
+ * Returns 0 on success or a negative error value. No context is left
+ * behind on failure: tapdisk_lio_setup_aio() falls back to
+ * __lio_setup_aio_poll(), which overwrites lio->aio_ctx and would
+ * otherwise leak the context created here.
+ */
+static int
+__lio_setup_aio_eventfd(struct tqueue *queue, int qlen)
+{
+       struct lio *lio = queue->tio_data;
+       int err;
+
+       err = io_setup(qlen, &lio->aio_ctx);
+       if (err < 0) {
+               lio->aio_ctx = 0;
+               return err;
+       }
+
+       lio->event_fd = tapdisk_sys_eventfd(0);
+       if (lio->event_fd < 0) {
+               /* capture errno first: io_destroy() may overwrite it */
+               err = -errno;
+
+               io_destroy(lio->aio_ctx);
+               lio->aio_ctx = 0;
+
+               return err;
+       }
+
+       lio->flags |= LIO_FLAG_EVENTFD;
+
+       return 0;
+}
+
+/*
+ * Create the AIO context and its completion descriptor.
+ *
+ * The mainline eventfd(2) interface is tried first when
+ * tapdisk_lio_check_resfd() allows it; if it is unavailable or fails,
+ * the poll-fd variant is used instead. Returns 0 on success, otherwise
+ * the error of the last attempt. -EAGAIN additionally logs a hint about
+ * the system-wide aio request limit.
+ */
+static int
+tapdisk_lio_setup_aio(struct tqueue *queue, int qlen)
+{
+       struct lio *lio = queue->tio_data;
+       int err;
+
+       lio->event_fd = -1;
+       lio->aio_ctx  =  0;
+
+       if (tapdisk_lio_check_resfd())
+               err = __lio_setup_aio_eventfd(queue, qlen);
+       else
+               err = 1;
+
+       /* eventfd not usable: fall back to the poll fd patch */
+       if (err)
+               err = __lio_setup_aio_poll(queue, qlen);
+
+       if (err == -EAGAIN) {
+               DPRINTF("Couldn't setup AIO context. If you are trying to "
+                       "concurrently use a large number of blktap-based disks, you may "
+                       "need to increase the system-wide aio request limit. "
+                       "(e.g. 'echo 1048576 > /proc/sys/fs/aio-max-nr')\n");
+       }
+
+       return err;
+}
+
+
+/*
+ * tio_destroy hook of the lio backend: unregister the completion event,
+ * tear down the AIO context and release the event buffer. Does nothing
+ * when the queue has no backend data.
+ */
+static void
+tapdisk_lio_destroy(struct tqueue *queue)
+{
+       struct lio *state = queue->tio_data;
+
+       if (state == NULL)
+               return;
+
+       /* -1 marks "never registered" */
+       if (!(state->event_id < 0)) {
+               tapdisk_server_unregister_event(state->event_id);
+               state->event_id = -1;
+       }
+
+       tapdisk_lio_destroy_aio(queue);
+
+       /* free(NULL) is a no-op, so no guard is needed */
+       free(state->aio_events);
+       state->aio_events = NULL;
+}
+
+/*
+ * Attach the backend's event descriptor to the first n entries of iocbs.
+ * Nothing happens unless the backend runs in eventfd mode.
+ */
+static void
+tapdisk_lio_set_eventfd(struct tqueue *queue, int n, struct iocb **iocbs)
+{
+       struct lio *lio = queue->tio_data;
+       int idx = 0;
+
+       if (!(lio->flags & LIO_FLAG_EVENTFD))
+               return;
+
+       while (idx < n) {
+               __io_set_eventfd(iocbs[idx], lio->event_fd);
+               idx++;
+       }
+}
+
+/*
+ * Consume the pending 64-bit value from the event descriptor. Only done
+ * in eventfd mode; the result of the read is not inspected.
+ */
+static void
+tapdisk_lio_ack_event(struct tqueue *queue)
+{
+       struct lio *lio = queue->tio_data;
+       uint64_t counter;
+
+       if (!(lio->flags & LIO_FLAG_EVENTFD))
+               return;
+
+       read_exact(lio->event_fd, &counter, sizeof(counter));
+}
+
+/*
+ * Server callback invoked when the lio event descriptor becomes readable.
+ *
+ * Acknowledges the notification, collects up to queue->size completed
+ * events without blocking, splits merged requests back into individual
+ * tiocbs, runs the event filter, updates the pending counters, completes
+ * every tiocb and finally requeues deferred ones.
+ *
+ * id and mode are unused; private is the struct tqueue that was passed
+ * when the event was registered.
+ */
+static void
+tapdisk_lio_event(event_id_t id, char mode, void *private)
+{
+       struct tqueue *queue = private;
+       struct lio *lio;
+       int i, ret, split;
+       struct iocb *iocb;
+       struct tiocb *tiocb;
+       struct io_event *ep;
+
+       tapdisk_lio_ack_event(queue);
+
+       lio   = queue->tio_data;
+       ret   = io_getevents(lio->aio_ctx, 0,
+                            queue->size, lio->aio_events, NULL);
+       if (ret < 0) {
+               /*
+                * A negative value is an error code, not an event count.
+                * Treat it as "no events" so that it is neither handed to
+                * io_split() nor subtracted from (i.e. added to)
+                * iocbs_pending below.
+                */
+               DPRINTF("io_getevents failed: %d\n", ret);
+               ret = 0;
+       }
+
+       split = io_split(&queue->opioctx, lio->aio_events, ret);
+       tapdisk_filter_events(queue->filter, lio->aio_events, split);
+
+       DBG("events: %d, tiocbs: %d\n", ret, split);
+
+       queue->iocbs_pending  -= ret;
+       queue->tiocbs_pending -= split;
+
+       for (i = split, ep = lio->aio_events; i-- > 0; ep++) {
+               iocb  = ep->obj;
+               tiocb = iocb->data;
+               complete_tiocb(queue, tiocb, ep->res);
+       }
+
+       queue_deferred_tiocbs(queue);
+}
+
+/*
+ * tio_setup hook of the lio backend.
+ *
+ * Creates the AIO context, registers lio->event_fd with the tapdisk
+ * server for read polling (tapdisk_lio_event is the callback, the queue
+ * its private argument) and allocates room for qlen completion events.
+ *
+ * Returns 0 on success. On any failure whatever was set up so far is
+ * torn down through tapdisk_lio_destroy() and the error is returned.
+ */
+static int
+tapdisk_lio_setup(struct tqueue *queue, int qlen)
+{
+       struct lio *lio = queue->tio_data;
+       size_t sz;
+       int err;
+
+       /* lets tapdisk_lio_destroy() skip the unregister on early failure */
+       lio->event_id = -1;
+
+       err = tapdisk_lio_setup_aio(queue, qlen);
+       if (err)
+               goto fail;
+
+       lio->event_id =
+               tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                             lio->event_fd, 0,
+                                             tapdisk_lio_event,
+                                             queue);
+       /* a negative id doubles as the error code */
+       err = lio->event_id;
+       if (err < 0)
+               goto fail;
+
+       lio->aio_events = calloc(qlen, sizeof(struct io_event));
+       if (!lio->aio_events) {
+               err = -errno;
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       tapdisk_lio_destroy(queue);
+       return err;
+}
+
+/*
+ * tio_submit hook of the lio backend.
+ *
+ * Passes the queued iocbs through the filter, merges them, tags them
+ * with the event descriptor when eventfd mode is active and hands them
+ * to io_submit(). When io_submit() fails or accepts fewer than the
+ * merged count, fail_tiocbs() is called with the submitted/merged counts
+ * and the error, and its return value is taken off tiocbs_pending.
+ *
+ * Returns the number of iocbs io_submit() accepted; 0 when nothing was
+ * queued or io_submit() reported an error.
+ */
+static int
+tapdisk_lio_submit(struct tqueue *queue)
+{
+       struct lio *lio = queue->tio_data;
+       int merged, submitted, err = 0;
+
+       if (!queue->queued)
+               return 0;
+
+       tapdisk_filter_iocbs(queue->filter, queue->iocbs, queue->queued);
+       merged    = io_merge(&queue->opioctx, queue->iocbs, queue->queued);
+       tapdisk_lio_set_eventfd(queue, merged, queue->iocbs);
+       submitted = io_submit(lio->aio_ctx, merged, queue->iocbs);
+
+       DBG("queued: %d, merged: %d, submitted: %d\n",
+           queue->queued, merged, submitted);
+
+       if (submitted < 0) {
+               /* negative return is the error code; nothing was accepted */
+               err = submitted;
+               submitted = 0;
+       } else if (submitted < merged)
+               err = -EIO;
+
+       /* account for everything first; failed requests are removed below */
+       queue->iocbs_pending  += submitted;
+       queue->tiocbs_pending += queue->queued;
+       queue->queued          = 0;
+
+       if (err)
+               queue->tiocbs_pending -= 
+                       fail_tiocbs(queue, submitted, merged, err);
+
+       return submitted;
+}
+
+/*
+ * Driver table entry for the "lio" backend: a struct lio worth of
+ * per-queue data plus the setup, destroy and submit hooks defined above.
+ */
+static const struct tio td_tio_lio = {
+       .name        = "lio",
+       .data_size   = sizeof(struct lio),
+       .tio_setup   = tapdisk_lio_setup,
+       .tio_destroy = tapdisk_lio_destroy,
+       .tio_submit  = tapdisk_lio_submit,
+};
+
+/*
+ * Detach the I/O backend from the queue: run its destroy hook (if any),
+ * forget the driver table and release the backend-private data.
+ * Safe to call on a queue that has no backend attached.
+ */
+static void
+tapdisk_queue_free_io(struct tqueue *queue)
+{
+       const struct tio *drv = queue->tio;
+
+       if (drv != NULL) {
+               if (drv->tio_destroy != NULL)
+                       drv->tio_destroy(queue);
+               queue->tio = NULL;
+       }
+
+       /* free(NULL) is a no-op, so no guard is needed */
+       free(queue->tio_data);
+       queue->tio_data = NULL;
+}
+
+/*
+ * Attach the I/O backend selected by drv (TIO_DRV_LIO or TIO_DRV_RWIO)
+ * to the queue.
+ *
+ * Allocates tio->data_size zeroed bytes of backend-private data, records
+ * the driver table and runs its setup hook, if any, with queue->size.
+ *
+ * Returns 0 on success, -EINVAL for an unknown driver, or the negative
+ * error of the failing step. On failure the queue is left without a
+ * backend (tapdisk_queue_free_io() is applied).
+ */
+static int
+tapdisk_queue_init_io(struct tqueue *queue, int drv)
+{
+       const struct tio *tio;
+       int err;
+
+       switch (drv) {
+       case TIO_DRV_LIO:
+               tio = &td_tio_lio;
+               break;
+       case TIO_DRV_RWIO:
+               tio = &td_tio_rwio;
+               break;
+       default:
+               err = -EINVAL;
+               goto fail;
+       }
+
+       queue->tio_data = calloc(1, tio->data_size);
+       if (!queue->tio_data) {
+               /*
+                * Read errno before logging, which may overwrite it, and
+                * never let an allocation failure be reported as success.
+                */
+               err = -errno;
+               if (!err)
+                       err = -ENOMEM;
+               PERROR("malloc(%zu)", tio->data_size);
+               goto fail;
+       }
+
+       queue->tio = tio;
+
+       if (tio->tio_setup) {
+               err = tio->tio_setup(queue, queue->size);
+               if (err)
+                       goto fail;
+       }
+
+       DPRINTF("I/O queue driver: %s\n", tio->name);
+
+       return 0;
+
+fail:
+       tapdisk_queue_free_io(queue);
+       return err;
+}
+
+/*
+ * Initialise a request queue.
+ *
+ * The structure is zeroed, then size and filter are recorded. A size of
+ * 0 stops here and returns 0 without attaching any backend. Otherwise
+ * the backend selected by drv is attached, an array of size iocb
+ * pointers is allocated and the request-merging context is initialised.
+ *
+ * Returns 0 on success; on failure the queue is released again with
+ * tapdisk_free_queue() and the error is returned.
+ */
+int
+tapdisk_init_queue(struct tqueue *queue, int size,
+                  int drv, struct tfilter *filter)
+{
+       /* NOTE(review): i is never used in this function */
+       int i, err;
+
+       memset(queue, 0, sizeof(struct tqueue));
+
+       queue->size   = size;
+       queue->filter = filter;
+
+       /* zero-sized queue: leaves queue->tio NULL */
+       if (!size)
+               return 0;
+
+       err = tapdisk_queue_init_io(queue, drv);
+       if (err)
+               goto fail;
+
+       queue->iocbs = calloc(size, sizeof(struct iocb *));
+       if (!queue->iocbs) {
+               err = -errno;
+               goto fail;
+       }
+
+       err = opio_init(&queue->opioctx, size);
+       if (err)
+               goto fail;
+
+       return 0;
+
+ fail:
+       tapdisk_free_queue(queue);
+       return err;
+}
+
+/*
+ * Release everything tapdisk_init_queue() set up: the I/O backend, the
+ * iocb pointer array and the request-merging context.
+ */
+void
+tapdisk_free_queue(struct tqueue *queue)
+{
+       tapdisk_queue_free_io(queue);
+
+       free(queue->iocbs);
+       queue->iocbs = NULL;
+
+       opio_free(&queue->opioctx);
+}
+
+/*
+ * Log the queue's counters and every deferred request at WARN level.
+ *
+ * A queue without a backend (initialised with size 0, or already freed)
+ * has queue->tio == NULL; its driver name is printed as "(none)"
+ * instead of dereferencing the NULL pointer.
+ */
+void
+tapdisk_debug_queue(struct tqueue *queue)
+{
+       struct tiocb *tiocb = queue->deferred.head;
+       const char *drv     = queue->tio ? queue->tio->name : "(none)";
+
+       WARN("TAPDISK QUEUE:\n");
+       WARN("size: %d, tio: %s, queued: %d, iocbs_pending: %d, "
+            "tiocbs_pending: %d, tiocbs_deferred: %d, deferrals: %"PRIx64"\n",
+            queue->size, drv, queue->queued, queue->iocbs_pending,
+            queue->tiocbs_pending, queue->tiocbs_deferred, queue->deferrals);
+
+       if (tiocb) {
+               WARN("deferred:\n");
+               for (; tiocb != NULL; tiocb = tiocb->next) {
+                       struct iocb *io = &tiocb->iocb;
+                       WARN("%s of %lu bytes at %lld\n",
+                            (io->aio_lio_opcode == IO_CMD_PWRITE ?
+                             "write" : "read"),
+                            io->u.c.nbytes, io->u.c.offset);
+               }
+       }
+}
+
+/*
+ * Fill in a tiocb for a single read or write.
+ *
+ * tiocb:  request to prepare
+ * fd:     descriptor to operate on
+ * rw:     non-zero prepares a pwrite, zero a pread
+ * buf:    data buffer of size bytes
+ * offset: file offset of the transfer
+ * cb/arg: completion callback and its argument
+ *
+ * The embedded iocb's data pointer is set back to the tiocb and the
+ * list link is cleared.
+ */
+void
+tapdisk_prep_tiocb(struct tiocb *tiocb, int fd, int rw, char *buf, size_t size,
+                  long long offset, td_queue_callback_t cb, void *arg)
+{
+       struct iocb *iocb = &tiocb->iocb;
+
+       if (rw)
+               io_prep_pwrite(iocb, fd, buf, size, offset);
+       else
+               io_prep_pread(iocb, fd, buf, size, offset);
+
+       /* set after io_prep_*, which initialises the iocb */
+       iocb->data  = tiocb;
+       tiocb->cb   = cb;
+       tiocb->arg  = arg;
+       tiocb->next = NULL;
+}
+
+/*
+ * Hand a prepared tiocb to the queue: it is deferred while the queue is
+ * full and queued for the next submit otherwise.
+ */
+void
+tapdisk_queue_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+       if (tapdisk_queue_full(queue)) {
+               defer_tiocb(queue, tiocb);
+               return;
+       }
+
+       queue_tiocb(queue, tiocb);
+}
+
+
+/*
+ * Submit the currently queued requests through the backend's tio_submit
+ * hook and return that hook's result.
+ *
+ * fail_tiocbs may queue more tiocbs
+ *
+ * NOTE(review): queue->tio is dereferenced unconditionally; a queue
+ * initialised with size 0 has no backend -- confirm callers never submit
+ * on such a queue.
+ */
+int
+tapdisk_submit_tiocbs(struct tqueue *queue)
+{
+       return queue->tio->tio_submit(queue);
+}
+
+/*
+ * Keep submitting until no request remains queued. At least one submit
+ * is always attempted. Returns the sum of all submit results.
+ */
+int
+tapdisk_submit_all_tiocbs(struct tqueue *queue)
+{
+       int total = 0;
+
+       for (;;) {
+               total += tapdisk_submit_tiocbs(queue);
+               if (tapdisk_queue_empty(queue))
+                       break;
+       }
+
+       return total;
+}
+
+/*
+ * Cancel the queued requests with -EIO and return the result of
+ * cancel_tiocbs().
+ *
+ * cancel_tiocbs may queue more tiocbs
+ */
+int
+tapdisk_cancel_tiocbs(struct tqueue *queue)
+{
+       return cancel_tiocbs(queue, -EIO);
+}
+
+/*
+ * Keep cancelling until no request remains queued. At least one cancel
+ * pass is always made. Returns the sum of all cancel results.
+ */
+int
+tapdisk_cancel_all_tiocbs(struct tqueue *queue)
+{
+       int total = 0;
+
+       for (;;) {
+               total += tapdisk_cancel_tiocbs(queue);
+               if (tapdisk_queue_empty(queue))
+                       break;
+       }
+
+       return total;
+}
diff --git a/tools/blktap2/drivers/tapdisk-queue.h b/tools/blktap2/drivers/tapdisk-queue.h
new file mode 100644 (file)
index 0000000..d40e6b3
--- /dev/null
@@ -0,0 +1,125 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef TAPDISK_QUEUE_H
+#define TAPDISK_QUEUE_H
+
+#include <libaio.h>
+
+#include "io-optimize.h"
+#include "scheduler.h"
+
+struct tiocb;
+struct tfilter;
+
+/*
+ * Completion callback of a tiocb: receives the argument registered with
+ * the request, the request itself and its result/error value.
+ */
+typedef void (*td_queue_callback_t)(void *arg, struct tiocb *, int err);
+
+
+/* One tapdisk I/O request: a libaio iocb plus its completion callback. */
+struct tiocb {
+       /* completion callback and the argument passed to it */
+       td_queue_callback_t   cb;
+       void                 *arg;
+
+       /* the embedded libaio request; its data field points back here */
+       struct iocb           iocb;
+       /* link to the next request in a tlist */
+       struct tiocb         *next;
+};
+
+/* Singly linked list of tiocbs (chained through tiocb.next). */
+struct tlist {
+       struct tiocb         *head;
+       struct tiocb         *tail;
+};
+
+/* A tapdisk request queue bound to one I/O backend (struct tio). */
+struct tqueue {
+       /* capacity passed to tapdisk_init_queue() */
+       int                   size;
+
+       /* backend driver table, and its private data (tio->data_size bytes) */
+       const struct tio     *tio;
+       void                 *tio_data;
+
+       /* context used when merging and splitting requests */
+       struct opioctx        opioctx;
+
+       /* number of iocbs staged in iocbs[] for the next submit;
+        * iocbs[] holds size entries */
+       int                   queued;
+       struct iocb         **iocbs;
+
+       /* number of iocbs pending in the aio layer */
+       int                   iocbs_pending;
+
+       /* number of tiocbs pending in the queue --
+        * this is likely to be larger than iocbs_pending
+        * due to request coalescing */
+       int                   tiocbs_pending;
+
+       /* iocbs may be deferred if the aio ring is full.
+        * tapdisk_queue_complete will ensure deferred
+        * iocbs are queued as slots become available. */
+       struct tlist          deferred;
+       int                   tiocbs_deferred;
+
+       /* optional tapdisk filter */
+       struct tfilter       *filter;
+
+       /* presumably a running count of deferrals -- TODO confirm */
+       uint64_t              deferrals;
+};
+
+/* Operations table of an I/O backend. */
+struct tio {
+       /* name used in log output */
+       const char           *name;
+       /* bytes of private data allocated into tqueue.tio_data */
+       size_t                data_size;
+
+       /* optional: called once with the queue length; non-zero is an error */
+       int  (*tio_setup)    (struct tqueue *queue, int qlen);
+       /* optional: releases whatever tio_setup acquired */
+       void (*tio_destroy)  (struct tqueue *queue);
+       /* submits the queued requests and returns a count */
+       int  (*tio_submit)   (struct tqueue *queue);
+};
+
+/* Backend selectors accepted by tapdisk_init_queue(). */
+enum {
+       TIO_DRV_LIO     = 1,
+       TIO_DRV_RWIO    = 2,
+};
+
+/*
+ * Interface for request producer (i.e., tapdisk)
+ * NB: the following functions may cause additional tiocbs to be queued:
+ *        - tapdisk_submit_tiocbs
+ *        - tapdisk_cancel_tiocbs
+ *        - tapdisk_complete_tiocbs
+ * The *_all_tiocbs variants will handle the first two cases;
+ * be sure to call submit after calling complete in the third case.
+ */
+/* number of requests staged for the next submit */
+#define tapdisk_queue_count(q) ((q)->queued)
+/* true when nothing is staged for submit */
+#define tapdisk_queue_empty(q) ((q)->queued == 0)
+/* true when pending plus staged requests have reached the queue size */
+#define tapdisk_queue_full(q)  \
+       (((q)->tiocbs_pending + (q)->queued) >= (q)->size)
+/* set up / tear down a queue; drv is one of TIO_DRV_* */
+int tapdisk_init_queue(struct tqueue *, int size, int drv, struct tfilter *);
+void tapdisk_free_queue(struct tqueue *);
+/* log the queue state */
+void tapdisk_debug_queue(struct tqueue *);
+/* queue a prepared request, deferring it if the queue is full */
+void tapdisk_queue_tiocb(struct tqueue *, struct tiocb *);
+/* submit / cancel staged requests; *_all repeats until none are staged */
+int tapdisk_submit_tiocbs(struct tqueue *);
+int tapdisk_submit_all_tiocbs(struct tqueue *);
+int tapdisk_cancel_tiocbs(struct tqueue *);
+int tapdisk_cancel_all_tiocbs(struct tqueue *);
+/* arguments: tiocb, fd, rw (non-zero = write), buffer, size, offset,
+ * completion callback, callback argument */
+void tapdisk_prep_tiocb(struct tiocb *, int, int, char *, size_t,
+                       long long, td_queue_callback_t, void *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-ring.c b/tools/blktap2/drivers/tapdisk-ring.c
new file mode 100644 (file)
index 0000000..a5d40cb
--- /dev/null
@@ -0,0 +1,439 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/select.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/un.h>
+
+#include "tapdisk-ring.h"
+
+/*
+ * Create the listening control socket of a ring.
+ *
+ * A UNIX stream socket is bound to ring->ctlfd_path (any stale file at
+ * that path is unlinked first) and put into listening state with a
+ * backlog of 1. On success the descriptor is stored in ring->ctlfd.
+ *
+ * Returns 0 on success, -ENAMETOOLONG when the path does not fit into
+ * sockaddr_un.sun_path, or -errno of the failing call.
+ */
+static int
+tapdisk_uring_create_ctlfd(td_uring_t *ring)
+{
+       int fd, err;
+       struct sockaddr_un saddr;
+
+       /* the limit is the size of sun_path, not of sun_family */
+       if (strnlen(ring->ctlfd_path, sizeof(saddr.sun_path)) >=
+           sizeof(saddr.sun_path))
+               return -ENAMETOOLONG;
+
+       fd = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (fd == -1)
+               return -errno;
+
+       memset(&saddr, 0, sizeof(struct sockaddr_un));
+       saddr.sun_family = AF_UNIX;
+       memcpy(saddr.sun_path, ring->ctlfd_path, strlen(ring->ctlfd_path));
+
+       err = unlink(ring->ctlfd_path);
+       if (err == -1 && errno != ENOENT) {
+               err = -errno;
+               goto fail;
+       }
+
+       err = bind(fd, (struct sockaddr *)&saddr, sizeof(struct sockaddr_un));
+       if (err == -1) {
+               err = -errno;
+               goto fail;
+       }
+
+       err = listen(fd, 1);
+       if (err == -1) {
+               err = -errno;
+               goto fail;
+       }
+
+       ring->ctlfd = fd;
+       return 0;
+
+fail:
+       close(fd);
+       return err;
+}
+
+/*
+ * Creator-side teardown of the control socket: close the descriptor,
+ * remove the socket file and release the path string. A ctlfd of 0 and
+ * a NULL path mean "not set" and are skipped.
+ */
+static void
+tapdisk_uring_destroy_ctlfd(td_uring_t *ring)
+{
+       char *path = ring->ctlfd_path;
+
+       if (ring->ctlfd != 0) {
+               close(ring->ctlfd);
+               ring->ctlfd = 0;
+       }
+
+       if (path == NULL)
+               return;
+
+       unlink(path);
+       free(path);
+       ring->ctlfd_path = NULL;
+}
+
+/*
+ * Connect to the control socket of an existing ring.
+ *
+ * Opens a UNIX stream socket and connects it to ring->ctlfd_path; on
+ * success the descriptor is stored in ring->ctlfd.
+ *
+ * Returns 0 on success, -ENAMETOOLONG when the path does not fit into
+ * sockaddr_un.sun_path, or -errno of the failing call.
+ */
+static int
+tapdisk_uring_connect_ctlfd(td_uring_t *ring)
+{
+       int fd, err;
+       struct sockaddr_un saddr;
+
+       if (strnlen(ring->ctlfd_path, sizeof(saddr.sun_path)) >=
+           sizeof(saddr.sun_path))
+               return -ENAMETOOLONG;
+
+       fd = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (fd == -1)
+               return -errno;
+
+       memset(&saddr, 0, sizeof(struct sockaddr_un));
+       saddr.sun_family = AF_UNIX;
+       memcpy(saddr.sun_path, ring->ctlfd_path, strlen(ring->ctlfd_path));
+
+       /* connect() takes the generic sockaddr type */
+       err = connect(fd, (struct sockaddr *)&saddr, sizeof(saddr));
+       if (err == -1) {
+               err = -errno;
+               goto fail;
+       }
+
+       ring->ctlfd = fd;
+       return 0;
+
+fail:
+       close(fd);
+       return err;
+}
+
+/*
+ * Client-side teardown of the control socket: close the descriptor and
+ * release the path string. The socket file itself is left in place.
+ * ctlfd is reset after closing so that a repeated call cannot close the
+ * same descriptor number twice.
+ */
+static void
+tapdisk_uring_disconnect_ctlfd(td_uring_t *ring)
+{
+       if (ring->ctlfd) {
+               close(ring->ctlfd);
+               ring->ctlfd = 0;
+       }
+       free(ring->ctlfd_path);
+       ring->ctlfd_path = NULL;
+}
+
+/*
+ * Create and map the shared-memory area of a ring.
+ *
+ * Opens (creating if necessary) the POSIX shared-memory object named
+ * ring->shmem_path with mode 0750, sizes it to ring->shmem_size bytes
+ * and maps it shared read/write into ring->shmem.
+ *
+ * Returns 0 on success or -errno of the failing call; ring->shmem is
+ * NULL when the mapping failed. The descriptor is closed in every case.
+ */
+static int
+tapdisk_uring_create_shmem(td_uring_t *ring)
+{
+       int fd, err;
+
+       fd = shm_open(ring->shmem_path, O_CREAT | O_RDWR, 0750);
+       if (fd == -1)
+               return -errno;
+
+       err = ftruncate(fd, ring->shmem_size);
+       if (err == -1) {
+               err = -errno;
+               goto out;
+       }
+
+       ring->shmem = mmap(NULL, ring->shmem_size,
+                          PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+       if (ring->shmem == MAP_FAILED) {
+               ring->shmem = NULL;
+               err = -errno;
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       /* reached on success too: the descriptor is not kept */
+       close(fd);
+       return err;
+}
+
+/*
+ * Creator-side teardown of the shared memory: unmap the area, remove the
+ * shared-memory object and release the path string. NULL members mean
+ * "not set" and are skipped.
+ */
+static void
+tapdisk_uring_destroy_shmem(td_uring_t *ring)
+{
+       char *path = ring->shmem_path;
+
+       if (ring->shmem != NULL) {
+               munmap(ring->shmem, ring->shmem_size);
+               ring->shmem = NULL;
+       }
+
+       if (path == NULL)
+               return;
+
+       shm_unlink(path);
+       free(path);
+       ring->shmem_path = NULL;
+}
+
+/*
+ * Map the shared-memory area of an existing ring.
+ *
+ * The header at the start of the object is mapped and copied first; its
+ * cookie must equal TAPDISK_URING_COOKIE and its version
+ * TD_URING_CURRENT_VERSION. The sizes recorded in the header are then
+ * stored in the ring and the whole area (shmem_size bytes) is mapped
+ * shared read/write into ring->shmem.
+ *
+ * Returns 0 on success, -EINVAL for a cookie or version mismatch, or
+ * -errno of the failing call; ring->shmem is NULL when the final mapping
+ * failed. The descriptor is closed in every case.
+ */
+static int
+tapdisk_uring_connect_shmem(td_uring_t *ring)
+{
+       int fd, err;
+       td_uring_header_t header, *p;
+
+       /* shm_open() always takes a mode; it is unused without O_CREAT */
+       fd = shm_open(ring->shmem_path, O_RDWR, 0);
+       if (fd == -1)
+               return -errno;
+
+       p = mmap(NULL, sizeof(td_uring_header_t),
+                PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+       if (p == MAP_FAILED) {
+               err = -errno;
+               goto out;
+       }
+
+       /* work on a private copy so the temporary mapping can go away */
+       memcpy(&header, p, sizeof(td_uring_header_t));
+       munmap(p, sizeof(td_uring_header_t));
+
+       if (memcmp(header.cookie,
+                  TAPDISK_URING_COOKIE, sizeof(header.cookie))) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       if (header.version != TD_URING_CURRENT_VERSION) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       ring->ring_size  = header.ring_size;
+       ring->data_size  = header.data_size;
+       ring->shmem_size = header.shmem_size;
+
+       ring->shmem = mmap(NULL, ring->shmem_size,
+                          PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+       if (ring->shmem == MAP_FAILED) {
+               ring->shmem = NULL;
+               err = -errno;
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       close(fd);
+       return err;
+}
+
+/*
+ * Client-side teardown of the shared memory: unmap the area and release
+ * the path string. The shared-memory object itself is left in place.
+ * The mapping pointer is cleared so that a repeated call, or a later
+ * destroy, does not unmap a stale address.
+ */
+static void
+tapdisk_uring_disconnect_shmem(td_uring_t *ring)
+{
+       if (ring->shmem) {
+               munmap(ring->shmem, ring->shmem_size);
+               ring->shmem = NULL;
+       }
+       free(ring->shmem_path);
+       ring->shmem_path = NULL;
+}
+
+/*
+ * Create a ring at the given location.
+ *
+ * ring:      handle to initialise (zeroed first)
+ * location:  base name; "<location>.shm" names the shared memory and
+ *            "<location>.cfd" the control socket
+ * ring_size: bytes reserved for the ring area
+ * data_size: bytes reserved for the data area
+ *
+ * The shared memory holds a td_uring_header_t followed by the ring area
+ * and then the data area.
+ *
+ * Returns 0 on success. On failure everything created so far is undone
+ * through tapdisk_uring_destroy() and a negative error is returned.
+ */
+int
+tapdisk_uring_create(td_uring_t *ring, const char *location,
+                   uint32_t ring_size, uint32_t data_size)
+{
+       int err;
+
+       memset(ring, 0, sizeof(td_uring_t));
+
+       ring->ring_size  = ring_size;
+       ring->data_size  = data_size;
+       ring->shmem_size = ring_size + data_size + sizeof(td_uring_header_t);
+
+       err = asprintf(&ring->shmem_path, "%s.shm", location);
+       if (err == -1) {
+               /* the pointer is undefined after a failed asprintf() */
+               ring->shmem_path = NULL;
+               err = -errno;
+               goto fail;
+       }
+
+       err = asprintf(&ring->ctlfd_path, "%s.cfd", location);
+       if (err == -1) {
+               ring->ctlfd_path = NULL;
+               err = -errno;
+               goto fail;
+       }
+
+       err = tapdisk_uring_create_ctlfd(ring);
+       if (err)
+               goto fail;
+
+       err = tapdisk_uring_create_shmem(ring);
+       if (err)
+               goto fail;
+
+       /* byte-wise pointer arithmetic; the members are void pointers */
+       ring->ring_area = (char *)ring->shmem + sizeof(td_uring_header_t);
+       ring->data_area = (char *)ring->ring_area + ring->ring_size;
+
+       return 0;
+
+fail:
+       tapdisk_uring_destroy(ring);
+       return err;
+}
+
+/*
+ * Tear down a ring from the creator's side: the shared memory and the
+ * control socket are released and their backing names removed.
+ * Always returns 0.
+ */
+int
+tapdisk_uring_destroy(td_uring_t *ring)
+{
+       tapdisk_uring_destroy_shmem(ring);
+       tapdisk_uring_destroy_ctlfd(ring);
+       return 0;
+}
+
+/*
+ * Connect to an existing ring at the given location.
+ *
+ * ring:     handle to initialise (zeroed first)
+ * location: base name; "<location>.shm" names the shared memory and
+ *           "<location>.cfd" the control socket
+ *
+ * Returns 0 on success. On failure whatever was acquired so far is
+ * released through tapdisk_uring_disconnect() and a negative error is
+ * returned.
+ */
+int
+tapdisk_uring_connect(td_uring_t *ring, const char *location)
+{
+       int err;
+
+       memset(ring, 0, sizeof(td_uring_t));
+
+       err = asprintf(&ring->shmem_path, "%s.shm", location);
+       if (err == -1) {
+               /* the pointer is undefined after a failed asprintf() */
+               ring->shmem_path = NULL;
+               err = -errno;
+               goto fail;
+       }
+
+       err = asprintf(&ring->ctlfd_path, "%s.cfd", location);
+       if (err == -1) {
+               ring->ctlfd_path = NULL;
+               err = -errno;
+               goto fail;
+       }
+
+       err = tapdisk_uring_connect_ctlfd(ring);
+       if (err)
+               goto fail;
+
+       err = tapdisk_uring_connect_shmem(ring);
+       if (err)
+               goto fail;
+
+       return 0;
+
+fail:
+       /* client-side cleanup: never unlink names owned by the creator */
+       tapdisk_uring_disconnect(ring);
+       return err;
+}
+
+/*
+ * Detach from a ring on the client side: the mapping and the control
+ * socket are released, but nothing is unlinked. Always returns 0.
+ */
+int
+tapdisk_uring_disconnect(td_uring_t *ring)
+{
+       tapdisk_uring_disconnect_shmem(ring);
+       tapdisk_uring_disconnect_ctlfd(ring);
+       return 0;
+}
+
+/*
+ * Read exactly one td_uring_message_t from fd.
+ *
+ * fd:      descriptor to read from
+ * message: destination buffer
+ * timeout: seconds select() may wait; 0 waits without limit
+ *
+ * Returns 0 once the whole message has arrived, -EIO on a select or
+ * read error, on end of file, or when select() returns without the
+ * descriptor being readable.
+ */
+static int
+tapdisk_ring_read_message(int fd, td_uring_message_t *message, int timeout)
+{
+       fd_set readfds;
+       int ret, len, offset;
+       struct timeval tv, *t;
+       /* byte view of the message: offset counts bytes, not messages */
+       char *buf = (char *)message;
+
+       t      = NULL;
+       offset = 0;
+       len    = sizeof(td_uring_message_t);
+
+       if (timeout) {
+               tv.tv_sec  = timeout;
+               tv.tv_usec = 0;
+               t = &tv;
+       }
+
+       while (offset < len) {
+               FD_ZERO(&readfds);
+               FD_SET(fd, &readfds);
+
+               /* we don't bother reinitializing tv. at worst, it will wait a
+                * bit more time than expected. */
+
+               ret = select(fd + 1, &readfds, NULL, NULL, t);
+               if (ret == -1)
+                       break;
+               else if (FD_ISSET(fd, &readfds)) {
+                       ret = read(fd, buf + offset, len - offset);
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break;
+       }
+
+       if (offset != len)
+               return -EIO;
+
+       return 0;
+}
+
+/*
+ * Write exactly one td_uring_message_t to fd.
+ *
+ * fd:      descriptor to write to
+ * message: message to send
+ * timeout: seconds select() may wait; 0 waits without limit
+ *
+ * Returns 0 once the whole message has been written, -EIO on a select
+ * or write error, or when select() returns without the descriptor being
+ * writable.
+ */
+static int
+tapdisk_ring_write_message(int fd, td_uring_message_t *message, int timeout)
+{
+       fd_set writefds;
+       int ret, len, offset;
+       struct timeval tv, *t;
+       /* byte view of the message: offset counts bytes, not messages */
+       const char *buf = (const char *)message;
+
+       t      = NULL;
+       offset = 0;
+       len    = sizeof(td_uring_message_t);
+
+       if (timeout) {
+               tv.tv_sec  = timeout;
+               tv.tv_usec = 0;
+               t = &tv;
+       }
+
+       while (offset < len) {
+               FD_ZERO(&writefds);
+               FD_SET(fd, &writefds);
+
+               /* we don't bother reinitializing tv. at worst, it will wait a
+                * bit more time than expected. */
+
+               ret = select(fd + 1, NULL, &writefds, NULL, t);
+               if (ret == -1)
+                       break;
+               else if (FD_ISSET(fd, &writefds)) {
+                       ret = write(fd, buf + offset, len - offset);
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break;
+       }
+
+       if (offset != len)
+               return -EIO;
+
+       return 0;
+}
+
+/*
+ * Wait for a kick on the ring's control socket.
+ *
+ * One message is read with a timeout of 1 second. Returns 0 when a
+ * TAPDISK_URING_MESSAGE_KICK arrived, -EINVAL for any other message
+ * type, or the error of the read helper.
+ */
+int
+tapdisk_uring_poll(td_uring_t *ring)
+{
+       int err;
+       td_uring_message_t message;
+
+       /* the helper in this file is named tapdisk_ring_read_message */
+       err = tapdisk_ring_read_message(ring->ctlfd, &message, 1);
+       if (err)
+               return err;
+
+       if (message.type != TAPDISK_URING_MESSAGE_KICK)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Send a kick over the ring's control socket.
+ *
+ * A zeroed message of type TAPDISK_URING_MESSAGE_KICK is written with a
+ * timeout of 1 second. Returns the result of the write helper.
+ */
+int
+tapdisk_uring_kick(td_uring_t *ring)
+{
+       td_uring_message_t message;
+
+       memset(&message, 0, sizeof(td_uring_message_t));
+       message.type = TAPDISK_URING_MESSAGE_KICK;
+
+       /* the helper in this file is named tapdisk_ring_write_message */
+       return tapdisk_ring_write_message(ring->ctlfd, &message, 1);
+}
diff --git a/tools/blktap2/drivers/tapdisk-ring.h b/tools/blktap2/drivers/tapdisk-ring.h
new file mode 100644 (file)
index 0000000..a70ee10
--- /dev/null
@@ -0,0 +1,87 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_RING_H_
+#define _TAPDISK_RING_H_
+
+#include <inttypes.h>
+
+#include <xenctrl.h>
+#include <xen/io/ring.h>
+
+/*
+ * Short names for the ring types declared below.
+ *
+ * NOTE(review): tapdisk-ring.c also uses td_uring_message_t,
+ * TAPDISK_URING_MESSAGE_KICK, TAPDISK_URING_COOKIE and
+ * TD_URING_CURRENT_VERSION, none of which is declared in this header --
+ * confirm where they are meant to come from.
+ */
+typedef struct td_uring             td_uring_t;
+typedef struct td_uring_header      td_uring_header_t;
+typedef struct td_uring_request     td_uring_request_t;
+typedef struct td_uring_response    td_uring_response_t;
+
+/*
+ * Handle of one ring endpoint: a control socket plus a shared-memory
+ * area that holds a td_uring_header_t, the ring area and the data area.
+ */
+struct td_uring {
+       /* control socket descriptor; 0 means "not open" */
+       int                         ctlfd;
+
+       /* heap-allocated names of the shared memory and the control socket */
+       char                       *shmem_path;
+       char                       *ctlfd_path;
+
+       /* start of the mapping, of the ring area and of the data area */
+       void                       *shmem;
+       void                       *ring_area;
+       void                       *data_area;
+
+       /* sizes in bytes, as used by tapdisk-ring.c; appended at the end
+        * so the offsets of the members above do not change */
+       uint32_t                    ring_size;
+       uint32_t                    data_size;
+       uint32_t                    shmem_size;
+};
+
+/* Header stored at the very start of the shared-memory area. */
+struct td_uring_header {
+       /* compared against TAPDISK_URING_COOKIE on connect */
+       char                        cookie[8];
+       /* compared against TD_URING_CURRENT_VERSION on connect */
+       uint32_t                    version;
+       /* sizes in bytes of the whole area, the ring area and the data area */
+       uint32_t                    shmem_size;
+       uint32_t                    ring_size;
+       uint32_t                    data_size;
+       /* padding; not referenced by the code in tapdisk-ring.c */
+       char                        reserved[4064];
+};
+
+/*
+ * Request entry of the ring.
+ * NOTE(review): field meanings below are inferred from the names only --
+ * confirm against the producer/consumer code.
+ */
+struct td_uring_request {
+       /* operation code */
+       uint8_t                     op;
+       /* request identifier */
+       uint64_t                    id;
+       /* presumably the starting sector and the sector count */
+       uint64_t                    sec;
+       uint32_t                    secs;
+       /* presumably an offset into the data area */
+       uint32_t                    offset;
+};
+
+/*
+ * Response entry of the ring.
+ * NOTE(review): field meanings below are inferred from the names only --
+ * confirm against the producer/consumer code.
+ */
+struct td_uring_response {
+       /* operation code */
+       uint8_t                     op;
+       /* presumably the id of the request being answered */
+       uint64_t                    id;
+       /* completion status */
+       uint8_t                     status;
+};
+
+/* ring types for td_uring requests/responses, via xen/io/ring.h */
+DEFINE_RING_TYPES(td_uring, td_uring_request_t, td_uring_response_t);
+
+/* creator side: set up / remove the control socket and shared memory */
+int tapdisk_uring_create(td_uring_t *, const char *location,
+                       uint32_t ring_size, uint32_t data_size);
+int tapdisk_uring_destroy(td_uring_t *);
+
+/* client side: attach to / detach from a ring created elsewhere */
+int tapdisk_uring_connect(td_uring_t *, const char *location);
+int tapdisk_uring_disconnect(td_uring_t *);
+
+/* wait for, respectively send, a kick over the control socket */
+int tapdisk_uring_poll(td_uring_t *);
+int tapdisk_uring_kick(td_uring_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-server.c b/tools/blktap2/drivers/tapdisk-server.c
new file mode 100644 (file)
index 0000000..eecde3d
--- /dev/null
@@ -0,0 +1,345 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <sys/signal.h>
+
+#include "tapdisk-utils.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/*
+ * Logging shorthands: DBG forwards to tlog_write(), ERR to tlog_error().
+ * The "_a..." form is the GNU named-variadic macro extension.
+ */
+#define DBG(_level, _f, _a...)       tlog_write(_level, _f, ##_a)
+#define ERR(_err, _f, _a...)         tlog_error(_err, _f, ##_a)
+
+/*
+ * Process-wide server state.  It is not exported by tapdisk-server.h, so
+ * keep it private to this file; everything else goes through the
+ * tapdisk_server_*() accessors below.
+ */
+static tapdisk_server_t server;
+
+/*
+ * Iterate over every VBD on server.vbds.  Uses the _safe list iterator,
+ * so @tmp must be a second td_vbd_t pointer used as the lookahead cursor.
+ */
+#define tapdisk_server_for_each_vbd(vbd, tmp)                          \
+       list_for_each_entry_safe(vbd, tmp, &server.vbds, next)
+
+/*
+ * Look through the images of every registered VBD for one whose type and
+ * name equal those of @image.
+ *
+ * Returns the first match, or NULL when @image does not have
+ * TD_OPEN_SHAREABLE set or nothing matches.
+ */
+td_image_t *
+tapdisk_server_get_shared_image(td_image_t *image)
+{
+       td_vbd_t *vbd, *next_vbd;
+       td_image_t *cand, *next_cand;
+
+       if (!td_flag_test(image->flags, TD_OPEN_SHAREABLE))
+               return NULL;
+
+       tapdisk_server_for_each_vbd(vbd, next_vbd) {
+               tapdisk_vbd_for_each_image(vbd, cand, next_cand) {
+                       if (cand->type == image->type &&
+                           strcmp(cand->name, image->name) == 0)
+                               return cand;
+               }
+       }
+
+       return NULL;
+}
+
+/* Return the head of the server's VBD list (the list itself, not a copy). */
+struct list_head *
+tapdisk_server_get_all_vbds(void)
+{
+       return &server.vbds;
+}
+
+/*
+ * Find the registered VBD whose uuid equals @uuid.
+ *
+ * The parameter uses td_uuid_t so the definition stays in step with the
+ * prototype in tapdisk-server.h.
+ *
+ * Returns the VBD, or NULL if none is registered under that uuid.
+ */
+td_vbd_t *
+tapdisk_server_get_vbd(td_uuid_t uuid)
+{
+       td_vbd_t *vbd, *tmp;
+
+       tapdisk_server_for_each_vbd(vbd, tmp)
+               if (vbd->uuid == uuid)
+                       return vbd;
+
+       return NULL;
+}
+
+/* Append @vbd to the tail of the server's VBD list. */
+void
+tapdisk_server_add_vbd(td_vbd_t *vbd)
+{
+       list_add_tail(&vbd->next, &server.vbds);
+}
+
+/*
+ * Unlink @vbd from the server's list, reset its list node to an empty
+ * head, then re-evaluate whether the server should keep running (it stops
+ * once the list is empty, see tapdisk_server_check_state()).
+ */
+void
+tapdisk_server_remove_vbd(td_vbd_t *vbd)
+{
+       list_del(&vbd->next);
+       INIT_LIST_HEAD(&vbd->next);
+       tapdisk_server_check_state();
+}
+
+/* Queue @tiocb on the server-wide AIO queue. */
+void
+tapdisk_server_queue_tiocb(struct tiocb *tiocb)
+{
+       tapdisk_queue_tiocb(&server.aio_queue, tiocb);
+}
+
+/*
+ * Dump debugging state: first the AIO queue, then each registered VBD,
+ * and finally flush the log.
+ */
+void
+tapdisk_server_debug(void)
+{
+       td_vbd_t *cur, *nxt;
+
+       tapdisk_debug_queue(&server.aio_queue);
+
+       tapdisk_server_for_each_vbd(cur, nxt) {
+               tapdisk_vbd_debug(cur);
+       }
+
+       tlog_flush();
+}
+
+/* Clear server.run, ending the main loop, once no VBD is registered. */
+void
+tapdisk_server_check_state(void)
+{
+       if (list_empty(&server.vbds))
+               server.run = 0;
+}
+
+/*
+ * Register an event with the server's scheduler.  All arguments are
+ * passed through unchanged; the return value is whatever
+ * scheduler_register_event() returns.
+ */
+event_id_t
+tapdisk_server_register_event(char mode, int fd,
+                             int timeout, event_cb_t cb, void *data)
+{
+       return scheduler_register_event(&server.scheduler,
+                                       mode, fd, timeout, cb, data);
+}
+
+/*
+ * Remove @event from the server's scheduler.
+ *
+ * This function returns void, so the scheduler call is a plain statement:
+ * ISO C does not allow "return expr;" in a void function.
+ */
+void
+tapdisk_server_unregister_event(event_id_t event)
+{
+       scheduler_unregister_event(&server.scheduler, event);
+}
+
+/* Forward @seconds to scheduler_set_max_timeout() for the server scheduler. */
+void
+tapdisk_server_set_max_timeout(int seconds)
+{
+       scheduler_set_max_timeout(&server.scheduler, seconds);
+}
+
+/* Intentionally empty placeholder; called at the top of every iteration. */
+static void
+tapdisk_server_assert_locks(void)
+{
+
+}
+
+/*
+ * If at least one VBD reports that a retry is needed, cap the scheduler
+ * timeout at TD_VBD_RETRY_INTERVAL.  The first such VBD is enough.
+ */
+static void
+tapdisk_server_set_retry_timeout(void)
+{
+       td_vbd_t *cur, *nxt;
+
+       tapdisk_server_for_each_vbd(cur, nxt) {
+               if (!tapdisk_vbd_retry_needed(cur))
+                       continue;
+
+               tapdisk_server_set_max_timeout(TD_VBD_RETRY_INTERVAL);
+               break;
+       }
+}
+
+/*
+ * Run the progress check on every registered VBD.
+ *
+ * No timestamp is taken here: the previous gettimeofday() result was
+ * never used.
+ */
+static void
+tapdisk_server_check_progress(void)
+{
+       td_vbd_t *vbd, *tmp;
+
+       tapdisk_server_for_each_vbd(vbd, tmp)
+               tapdisk_vbd_check_progress(vbd);
+}
+
+/* Submit everything currently queued on the server-wide AIO queue. */
+static void
+tapdisk_server_submit_tiocbs(void)
+{
+       tapdisk_submit_all_tiocbs(&server.aio_queue);
+}
+
+/* Call tapdisk_vbd_kick() on every registered VBD. */
+static void
+tapdisk_server_kick_responses(void)
+{
+       td_vbd_t *vbd, *tmp;
+
+       tapdisk_server_for_each_vbd(vbd, tmp)
+               tapdisk_vbd_kick(vbd);
+}
+
+/* Call tapdisk_vbd_check_state() on every registered VBD. */
+static void
+tapdisk_server_check_vbds(void)
+{
+       td_vbd_t *vbd, *tmp;
+
+       tapdisk_server_for_each_vbd(vbd, tmp)
+               tapdisk_vbd_check_state(vbd);
+}
+
+/* Call tapdisk_vbd_kill_queue() on every registered VBD. */
+static void
+tapdisk_server_stop_vbds(void)
+{
+       td_vbd_t *vbd, *tmp;
+
+       tapdisk_server_for_each_vbd(vbd, tmp)
+               tapdisk_vbd_kill_queue(vbd);
+}
+
+/*
+ * Initialise the server-wide AIO queue with TAPDISK_TIOCBS entries and
+ * the TIO_DRV_LIO driver.  Returns tapdisk_init_queue()'s result.
+ */
+static int
+tapdisk_server_init_aio(void)
+{
+       return tapdisk_init_queue(&server.aio_queue, TAPDISK_TIOCBS,
+                                 TIO_DRV_LIO, NULL);
+}
+
+/* Release the server-wide AIO queue. */
+static void
+tapdisk_server_close_aio(void)
+{
+       tapdisk_free_queue(&server.aio_queue);
+}
+
+/* Server teardown; currently this only releases the AIO queue. */
+static void
+tapdisk_server_close(void)
+{
+       tapdisk_server_close_aio();
+}
+
+/*
+ * One pass of the server main loop:
+ *   1. pre-wait housekeeping (retry timeout, progress checks),
+ *   2. block in the scheduler until events fire,
+ *   3. post-wait work (VBD state checks, AIO submission, response kicks).
+ * A negative scheduler result is only logged; the pass still completes.
+ */
+void
+tapdisk_server_iterate(void)
+{
+       int ret;
+
+       tapdisk_server_assert_locks();
+       tapdisk_server_set_retry_timeout();
+       tapdisk_server_check_progress();
+
+       ret = scheduler_wait_for_events(&server.scheduler);
+       if (ret < 0)
+               DBG(TLOG_WARN, "server wait returned %d\n", ret);
+
+       tapdisk_server_check_vbds();
+       tapdisk_server_submit_tiocbs();
+       tapdisk_server_kick_responses();
+}
+
+/* Iterate until server.run is cleared (see tapdisk_server_check_state()). */
+static void
+__tapdisk_server_run(void)
+{
+       while (server.run)
+               tapdisk_server_iterate();
+}
+
+/*
+ * Handler installed by tapdisk_server_run() for SIGBUS, SIGINT, SIGUSR1
+ * and SIGXFSZ:
+ *   SIGBUS / SIGINT  - close every VBD
+ *   SIGXFSZ          - log EFBIG and kill every VBD queue
+ *   SIGUSR1          - dump debug state
+ *
+ * NOTE(review): the calls made from here (logging, list traversal, VBD
+ * close) are presumably not async-signal-safe -- confirm this is
+ * acceptable for the deployment.
+ */
+static void
+tapdisk_server_signal_handler(int signal)
+{
+       td_vbd_t *vbd, *tmp;
+       static int xfsz_error_sent = 0;
+
+       switch (signal) {
+       case SIGBUS:
+       case SIGINT:
+               tapdisk_server_for_each_vbd(vbd, tmp)
+                       tapdisk_vbd_close(vbd);
+               break;
+
+       case SIGXFSZ:
+               ERR(EFBIG, "received SIGXFSZ");
+               tapdisk_server_stop_vbds();
+               /* the flag is only recorded; nothing visible here acts on it */
+               if (xfsz_error_sent)
+                       break;
+
+               xfsz_error_sent = 1;
+               break;
+
+       case SIGUSR1:
+               tapdisk_server_debug();
+               break;
+       }
+}
+
+/*
+ * Zero the server state, initialise the (empty) VBD list and the
+ * scheduler.  Always returns 0.
+ */
+int
+tapdisk_server_init(void)
+{
+       memset(&server, 0, sizeof(server));
+       INIT_LIST_HEAD(&server.vbds);
+
+       scheduler_initialize(&server.scheduler);
+
+       return 0;
+}
+
+/*
+ * Second initialisation stage: set up the AIO queue and mark the server
+ * as running.
+ *
+ * Returns 0 on success.  On failure the AIO queue is released again and
+ * the error from tapdisk_server_init_aio() is returned.
+ */
+int
+tapdisk_server_complete(void)
+{
+       int err;
+
+       err = tapdisk_server_init_aio();
+       if (err)
+               goto fail;
+
+       server.run = 1;
+
+       return 0;
+
+fail:
+       tapdisk_server_close_aio();
+       return err;
+}
+
+/*
+ * Full server initialisation: tapdisk_server_init() followed by
+ * tapdisk_server_complete().
+ *
+ * Returns 0 on success or the error from tapdisk_server_complete().
+ * No extra cleanup is done here on failure: tapdisk_server_complete()
+ * already releases the AIO queue on its own error path, and releasing it
+ * a second time would free the same queue twice.
+ */
+int
+tapdisk_server_initialize(void)
+{
+       tapdisk_server_init();
+
+       return tapdisk_server_complete();
+}
+
+/*
+ * Run the server until no VBD is left.
+ *
+ * Applies the resource limits, installs the signal handler for SIGBUS,
+ * SIGINT, SIGUSR1 and SIGXFSZ, runs the main loop and finally tears the
+ * server down.
+ *
+ * Returns 0 after a normal run, or the error from
+ * tapdisk_set_resource_limits() if that fails (the loop is not entered).
+ */
+int
+tapdisk_server_run(void)
+{
+       int err;
+
+       err = tapdisk_set_resource_limits();
+       if (err)
+               return err;
+
+       signal(SIGBUS, tapdisk_server_signal_handler);
+       signal(SIGINT, tapdisk_server_signal_handler);
+       signal(SIGUSR1, tapdisk_server_signal_handler);
+       signal(SIGXFSZ, tapdisk_server_signal_handler);
+
+       __tapdisk_server_run();
+       tapdisk_server_close();
+
+       return 0;
+}
diff --git a/tools/blktap2/drivers/tapdisk-server.h b/tools/blktap2/drivers/tapdisk-server.h
new file mode 100644 (file)
index 0000000..d9c1a03
--- /dev/null
@@ -0,0 +1,67 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_SERVER_H_
+#define _TAPDISK_SERVER_H_
+
+#include "list.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-queue.h"
+
+/*
+ * NOTE(review): no definition of this function appears in
+ * tapdisk-server.c -- confirm it is still implemented somewhere.
+ */
+struct tap_disk *tapdisk_server_find_driver_interface(int);
+
+/* Look up an already-open shareable image with the same type and name. */
+td_image_t *tapdisk_server_get_shared_image(td_image_t *);
+
+/* VBD registry. */
+struct list_head *tapdisk_server_get_all_vbds(void);
+td_vbd_t *tapdisk_server_get_vbd(td_uuid_t);
+void tapdisk_server_add_vbd(td_vbd_t *);
+void tapdisk_server_remove_vbd(td_vbd_t *);
+
+/* Queue an I/O control block on the server-wide AIO queue. */
+void tapdisk_server_queue_tiocb(struct tiocb *);
+
+/* Stop the main loop when no VBD remains. */
+void tapdisk_server_check_state(void);
+
+/* Thin wrappers around the server's scheduler. */
+event_id_t tapdisk_server_register_event(char, int, int, event_cb_t, void *);
+void tapdisk_server_unregister_event(event_id_t);
+void tapdisk_server_set_max_timeout(int);
+
+/* Lifecycle: init + complete (or initialize, which does both), then run. */
+int tapdisk_server_init(void);
+int tapdisk_server_initialize(void);
+int tapdisk_server_complete(void);
+int tapdisk_server_run(void);
+void tapdisk_server_iterate(void);
+
+/* Number of tiocbs the server AIO queue is created with. */
+#define TAPDISK_TIOCBS              (TAPDISK_DATA_REQUESTS + 50)
+
+/*
+ * Server state:
+ *   run        - non-zero while the main loop should keep iterating
+ *   vbds       - list of registered VBDs (linked through td_vbd_t.next)
+ *   scheduler  - event scheduler driving the main loop
+ *   aio_queue  - server-wide AIO queue
+ */
+typedef struct tapdisk_server {
+       int                          run;
+       struct list_head             vbds;
+       scheduler_t                  scheduler;
+       struct tqueue                aio_queue;
+} tapdisk_server_t;
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-stream.c b/tools/blktap2/drivers/tapdisk-stream.c
new file mode 100644 (file)
index 0000000..b5b0fa7
--- /dev/null
@@ -0,0 +1,605 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+
+#include "list.h"
+#include "scheduler.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-server.h"
+#include "tapdisk-disktype.h"
+#include "tapdisk-utils.h"
+
+/* Indices into tapdisk_stream_poll.pipe[], matching pipe(2) ordering. */
+#define POLL_READ                        0
+#define POLL_WRITE                       1
+
+/* Note: evaluates each argument twice; do not pass side effects. */
+#define MIN(a, b)                        ((a) < (b) ? (a) : (b))
+
+/*
+ * Self-pipe used to wake the scheduler: 'pipe' holds both ends (-1 when
+ * closed), 'set' is non-zero while a wake-up token is pending in it.
+ */
+struct tapdisk_stream_poll {
+       int                              pipe[2];
+       int                              set;
+};
+
+/*
+ * One in-flight read: starting sector, sector count, the sequence number
+ * that fixes its output order, the blkif request that is built for it,
+ * and the list node linking it into the free/pending/completed lists.
+ */
+struct tapdisk_stream_request {
+       uint64_t                         sec;
+       uint32_t                         secs;
+       uint64_t                         seqno;
+       blkif_request_t                  blkif_req;
+       struct list_head                 next;
+};
+
+/*
+ * State of one streaming session.
+ *
+ *   cur/start/end      - sector cursor and the [start, end) range to read
+ *   started/completed  - sequence numbers: next to assign / next to write
+ *   free/pending/completed lists - where each entry of requests[] lives
+ */
+struct tapdisk_stream {
+       td_vbd_t                        *vbd;
+
+       unsigned int                     id;
+       int                              in_fd;
+       int                              out_fd;
+
+       int                              err;
+
+       uint64_t                         cur;
+       uint64_t                         start;
+       uint64_t                         end;
+
+       uint64_t                         started;
+       uint64_t                         completed;
+
+       struct tapdisk_stream_poll       poll;
+       event_id_t                       enqueue_event_id;
+
+       struct list_head                 free_list;
+       struct list_head                 pending_list;
+       struct list_head                 completed_list;
+
+       struct tapdisk_stream_request    requests[MAX_REQUESTS];
+};
+
+/* Source of stream ids; incremented by tapdisk_stream_open_image(). */
+static unsigned int tapdisk_stream_count;
+
+/* Forward declaration: used by tapdisk_stream_enqueue() before its definition. */
+static void tapdisk_stream_close_image(struct tapdisk_stream *);
+
+/* Print the command-line synopsis to stdout and exit with status @err. */
+static void
+usage(const char *app, int err)
+{
+       printf("usage: %s <-n type:/path/to/image> "
+              "[-c sector count] [-s skip sectors]\n", app);
+       exit(err);
+}
+
+/* Reset @p to the "closed" state: no token pending, both fds set to -1. */
+static inline void
+tapdisk_stream_poll_initialize(struct tapdisk_stream_poll *p)
+{
+       p->set = 0;
+       p->pipe[POLL_READ] = p->pipe[POLL_WRITE] = -1;
+}
+
+/*
+ * Create the wake-up pipe for @p and switch both ends to non-blocking.
+ *
+ * Returns 0 on success or a negative errno value.  On failure both ends
+ * are closed and @p is reset to the "closed" state.
+ */
+static int
+tapdisk_stream_poll_open(struct tapdisk_stream_poll *p)
+{
+       int err;
+
+       tapdisk_stream_poll_initialize(p);
+
+       err = pipe(p->pipe);
+       if (err)
+               return -errno;
+
+       err = fcntl(p->pipe[POLL_READ], F_SETFL, O_NONBLOCK);
+       if (err)
+               goto out;
+
+       err = fcntl(p->pipe[POLL_WRITE], F_SETFL, O_NONBLOCK);
+       if (err)
+               goto out;
+
+       return 0;
+
+out:
+       /* capture errno now: the close() calls below may overwrite it */
+       err = errno;
+       close(p->pipe[POLL_READ]);
+       close(p->pipe[POLL_WRITE]);
+       tapdisk_stream_poll_initialize(p);
+       return -err;
+}
+
+/* Close whichever pipe ends are open (fd != -1) and reset @p. */
+static void
+tapdisk_stream_poll_close(struct tapdisk_stream_poll *p)
+{
+       if (p->pipe[POLL_READ] != -1)
+               close(p->pipe[POLL_READ]);
+       if (p->pipe[POLL_WRITE] != -1)
+               close(p->pipe[POLL_WRITE]);
+       tapdisk_stream_poll_initialize(p);
+}
+
+/*
+ * Consume one wake-up token from the pipe and mark @p as not set.
+ * The result of read_exact() is ignored.
+ */
+static inline void
+tapdisk_stream_poll_clear(struct tapdisk_stream_poll *p)
+{
+       int dummy;
+
+       read_exact(p->pipe[POLL_READ], &dummy, sizeof(dummy));
+       p->set = 0;
+}
+
+/*
+ * Post a wake-up token on the pipe unless one is already pending, so at
+ * most one token is outstanding at a time.
+ */
+static inline void
+tapdisk_stream_poll_set(struct tapdisk_stream_poll *p)
+{
+       int token = 0;
+
+       if (p->set)
+               return;
+
+       write_exact(p->pipe[POLL_WRITE], &token, sizeof(token));
+       p->set = 1;
+}
+
+/*
+ * Tell whether the stream is finished: nothing is pending and either the
+ * cursor has reached the end or an error was recorded.
+ * Returns 1 when finished, 0 otherwise.
+ */
+static inline int
+tapdisk_stream_stop(struct tapdisk_stream *s)
+{
+       if (!list_empty(&s->pending_list))
+               return 0;
+
+       return (s->cur == s->end || s->err);
+}
+
+/* Zero @req and make its list node an empty, self-linked head. */
+static inline void
+tapdisk_stream_initialize_request(struct tapdisk_stream_request *req)
+{
+       memset(req, 0, sizeof(*req));
+       INIT_LIST_HEAD(&req->next);
+}
+
+/* Index of @req within s->requests[] (@req must point into that array). */
+static inline int
+tapdisk_stream_request_idx(struct tapdisk_stream *s,
+                          struct tapdisk_stream_request *req)
+{
+       return (req - s->requests);
+}
+
+/*
+ * Take the first request off the free list and reinitialise it.
+ * Returns NULL when the free list is empty.
+ */
+static inline struct tapdisk_stream_request *
+tapdisk_stream_get_request(struct tapdisk_stream *s)
+{
+       struct tapdisk_stream_request *req;
+
+       if (list_empty(&s->free_list))
+               return NULL;
+
+       req = list_entry(s->free_list.next,
+                        struct tapdisk_stream_request, next);
+
+       list_del_init(&req->next);
+       tapdisk_stream_initialize_request(req);
+
+       return req;
+}
+
+/*
+ * Write the data of a completed request to the output descriptor.
+ * The buffer is located through MMAP_VADDR() using the request's index;
+ * the length is the request's sector count converted to bytes.
+ * NOTE(review): the result of write_exact() is ignored, so an output
+ * error is not recorded in s->err -- confirm that is intended.
+ */
+static void
+tapdisk_stream_print_request(struct tapdisk_stream *s,
+                            struct tapdisk_stream_request *sreq)
+{
+       unsigned long idx = (unsigned long)tapdisk_stream_request_idx(s, sreq);
+       char *buf = (char *)MMAP_VADDR(s->vbd->ring.vstart, idx, 0);
+       write_exact(s->out_fd, buf, sreq->secs << SECTOR_SHIFT);
+}
+
+/*
+ * Emit completed requests in sequence order.  The completed list is kept
+ * sorted by seqno, so the loop stops at the first gap: later data must
+ * wait until the missing request completes.  Each emitted request goes
+ * back onto the free list.
+ */
+static void
+tapdisk_stream_write_data(struct tapdisk_stream *s)
+{
+       struct tapdisk_stream_request *sreq, *tmp;
+
+       list_for_each_entry_safe(sreq, tmp, &s->completed_list, next) {
+               if (sreq->seqno != s->completed)
+                       break;
+
+               s->completed++;
+               tapdisk_stream_print_request(s, sreq);
+
+               list_del_init(&sreq->next);
+               list_add_tail(&sreq->next, &s->free_list);
+       }
+}
+
+/*
+ * Insert @sreq into the completed list, keeping the list ordered by
+ * ascending seqno: it goes in front of the first entry with a larger
+ * seqno, or at the tail if there is none.
+ */
+static inline void
+tapdisk_stream_queue_completed(struct tapdisk_stream *s,
+                              struct tapdisk_stream_request *sreq)
+{
+       struct list_head *before = &s->completed_list;
+       struct tapdisk_stream_request *it;
+
+       list_for_each_entry(it, &s->completed_list, next) {
+               if (sreq->seqno < it->seqno) {
+                       before = &it->next;
+                       break;
+               }
+       }
+
+       /* list_add_tail() links the new node immediately before 'before' */
+       list_add_tail(&sreq->next, before);
+}
+
+/*
+ * Completion callback registered with the VBD (see
+ * tapdisk_stream_open_image()).
+ *
+ * @arg is the owning stream; @rsp->id is the index into s->requests[]
+ * that tapdisk_stream_enqueue() stored in the request.  A successful
+ * request is queued for ordered output; a failed one records EIO and is
+ * returned to the free list.  In both cases any output that is now in
+ * order is written and the enqueue event is woken.
+ */
+static void
+tapdisk_stream_dequeue(void *arg, blkif_response_t *rsp)
+{
+       struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+       struct tapdisk_stream_request *sreq = s->requests + rsp->id;
+
+       list_del_init(&sreq->next);
+
+       if (rsp->status == BLKIF_RSP_OKAY)
+               tapdisk_stream_queue_completed(s, sreq);
+       else {
+               s->err = EIO;
+               list_add_tail(&sreq->next, &s->free_list);
+               /* hex conversion to match the "0x" prefix */
+               fprintf(stderr, "error reading sector 0x%"PRIx64"\n", sreq->sec);
+       }
+
+       tapdisk_stream_write_data(s);
+       tapdisk_stream_poll_set(&s->poll);
+}
+
+/*
+ * Scheduler callback for the wake-up pipe; also called once directly from
+ * tapdisk_stream_run().  @id and @mode are unused, @arg is the stream.
+ *
+ * Consumes the wake-up token, closes the image if the stream is finished,
+ * and otherwise turns as much of the remaining sector range as there are
+ * free requests into blkif READ requests and hands them to the VBD.
+ */
+static void
+tapdisk_stream_enqueue(event_id_t id, char mode, void *arg)
+{
+       td_vbd_t *vbd;
+       int i, idx, psize;
+       struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+
+       vbd = s->vbd;
+       tapdisk_stream_poll_clear(&s->poll);
+
+       if (tapdisk_stream_stop(s)) {
+               tapdisk_stream_close_image(s);
+               return;
+       }
+
+       psize = getpagesize();
+
+       while (s->cur < s->end && !s->err) {
+               blkif_request_t *breq;
+               td_vbd_request_t *vreq;
+               struct tapdisk_stream_request *sreq;
+
+               /* stop when every request slot is in flight */
+               sreq = tapdisk_stream_get_request(s);
+               if (!sreq)
+                       break;
+
+               idx                 = tapdisk_stream_request_idx(s, sreq);
+
+               sreq->sec           = s->cur;
+               sreq->secs          = 0;
+               sreq->seqno         = s->started++;
+
+               /* the slot index doubles as the blkif request id */
+               breq                = &sreq->blkif_req;
+               breq->id            = idx;
+               breq->nr_segments   = 0;
+               breq->sector_number = sreq->sec;
+               breq->operation     = BLKIF_OP_READ;
+
+               /* one segment per page, until the range or segment limit ends */
+               for (i = 0; i < BLKIF_MAX_SEGMENTS_PER_REQUEST; i++) {
+                       uint32_t secs = MIN(s->end - s->cur, psize >> SECTOR_SHIFT);
+                       struct blkif_request_segment *seg = breq->seg + i;
+
+                       if (!secs)
+                               break;
+
+                       sreq->secs += secs;
+                       s->cur     += secs;
+
+                       seg->first_sect = 0;
+                       seg->last_sect  = secs - 1;
+                       breq->nr_segments++;
+               }
+
+               /* the VBD request with the same index must be idle */
+               vreq = vbd->request_list + idx;
+
+               assert(list_empty(&vreq->next));
+               assert(vreq->secs_pending == 0);
+
+               memcpy(&vreq->req, breq, sizeof(*breq));
+               vbd->received++;
+               vreq->vbd = vbd;
+
+               tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+               list_add_tail(&sreq->next, &s->pending_list);
+       }
+
+       tapdisk_vbd_issue_requests(vbd);
+}
+
+/*
+ * Bring up the server, create a VBD for this stream, install the
+ * completion callback and open the image read-only.
+ *
+ * Returns 0 on success; on failure prints a message and returns the
+ * error.  NOTE(review): the ENODEV case is a positive value while
+ * callees may return negative codes -- confirm callers only test for
+ * non-zero.
+ */
+static int
+tapdisk_stream_open_image(struct tapdisk_stream *s, const char *path, int type)
+{
+       int err;
+
+       s->id = tapdisk_stream_count++;
+
+       err = tapdisk_server_initialize();
+       if (err)
+               goto out;
+
+       err = tapdisk_vbd_initialize(s->id);
+       if (err)
+               goto out;
+
+       s->vbd = tapdisk_server_get_vbd(s->id);
+       if (!s->vbd) {
+               err = ENODEV;
+               goto out;
+       }
+
+       tapdisk_vbd_set_callback(s->vbd, tapdisk_stream_dequeue, s);
+
+       err = tapdisk_vbd_open_vdi(s->vbd, path, type,
+                                  TAPDISK_STORAGE_TYPE_DEFAULT,
+                                  TD_OPEN_RDONLY);
+       if (err)
+               goto out;
+
+       s->vbd->reopened = 1;
+       err = 0;
+
+out:
+       if (err)
+               fprintf(stderr, "failed to open %s: %d\n", path, err);
+       return err;
+}
+
+/*
+ * Close and free the stream's VBD, if it is still registered under s->id.
+ * Removing the last VBD also stops the server main loop (see
+ * tapdisk_server_remove_vbd()).  The data buffer behind ring.vstart is
+ * the one allocated in tapdisk_stream_initialize_requests().
+ */
+static void
+tapdisk_stream_close_image(struct tapdisk_stream *s)
+{
+       td_vbd_t *vbd;
+
+       vbd = tapdisk_server_get_vbd(s->id);
+       if (vbd) {
+               tapdisk_vbd_close_vdi(vbd);
+               tapdisk_server_remove_vbd(vbd);
+               free((void *)vbd->ring.vstart);
+               free(vbd->name);
+               free(vbd);
+               s->vbd = NULL;
+       }
+}
+
+/*
+ * Set the sector range [skip, skip + count) the stream will read.
+ *
+ * @count: number of sectors, or (uint64_t)-1 for "up to the end of the
+ *         image".
+ * @skip:  first sector to read.
+ *
+ * Returns 0 on success, the error from tapdisk_vbd_get_image_info(), or
+ * -EINVAL when the range does not fit inside the image.  The checks avoid
+ * unsigned wrap-around: @skip is validated before "size - skip" is
+ * formed, and @count is compared against the remaining space rather than
+ * adding it to @skip.
+ */
+static int
+tapdisk_stream_set_position(struct tapdisk_stream *s,
+                           uint64_t count, uint64_t skip)
+{
+       int err;
+       image_t image;
+       uint64_t size, last;
+
+       err = tapdisk_vbd_get_image_info(s->vbd, &image);
+       if (err) {
+               fprintf(stderr, "failed getting image size: %d\n", err);
+               return err;
+       }
+
+       size = (uint64_t)image.size;
+
+       /* validate skip by itself so "size - skip" below cannot wrap */
+       if (skip > size) {
+               fprintf(stderr, "0x%"PRIx64" past end of image 0x%"PRIx64"\n",
+                       skip, size);
+               return -EINVAL;
+       }
+
+       if (count == (uint64_t)-1)
+               count = size - skip;
+
+       if (count > size - skip) {
+               /* saturate the reported end instead of printing a wrapped sum */
+               last = (count > (uint64_t)-1 - skip) ?
+                       (uint64_t)-1 : count + skip;
+               fprintf(stderr, "0x%"PRIx64" past end of image 0x%"PRIx64"\n",
+                       last, size);
+               return -EINVAL;
+       }
+
+       s->start = skip;
+       s->cur   = s->start;
+       s->end   = s->start + count;
+
+       return 0;
+}
+
+/*
+ * Allocate the page-aligned data area (psize * BLKTAP_MMAP_REGION_SIZE
+ * bytes), publish it through the VBD's ring.vstart, and put every entry
+ * of s->requests[] on the free list.
+ *
+ * Returns 0 on success or the posix_memalign() error code.
+ */
+static int
+tapdisk_stream_initialize_requests(struct tapdisk_stream *s)
+{
+       size_t size;
+       td_ring_t *ring;
+       int err, i, psize;
+
+       ring  = &s->vbd->ring;
+       psize = getpagesize();
+       size  = psize * BLKTAP_MMAP_REGION_SIZE;
+
+       /* sneaky -- set up ring->vstart so tapdisk_vbd will use our buffers */
+       err = posix_memalign((void **)&ring->vstart, psize, size);
+       if (err) {
+               fprintf(stderr, "failed to allocate buffers: %d\n", err);
+               ring->vstart = 0;
+               return err;
+       }
+
+       for (i = 0; i < MAX_REQUESTS; i++) {
+               struct tapdisk_stream_request *req = s->requests + i;
+               tapdisk_stream_initialize_request(req);
+               list_add_tail(&req->next, &s->free_list);
+       }
+
+       return 0;
+}
+
+/*
+ * Open the wake-up pipe and register its read end with the server
+ * scheduler so that tapdisk_stream_enqueue() runs when a token arrives.
+ *
+ * A negative registration result is treated as an error; otherwise the
+ * value is stored as the event id.  Returns 0 on success.  If the
+ * registration fails the pipe is left open here; it is closed later by
+ * tapdisk_stream_unregister_enqueue_event().
+ */
+static int
+tapdisk_stream_register_enqueue_event(struct tapdisk_stream *s)
+{
+       int err;
+       struct tapdisk_stream_poll *p = &s->poll;
+
+       err = tapdisk_stream_poll_open(p);
+       if (err)
+               goto out;
+
+       err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                           p->pipe[POLL_READ], 0,
+                                           tapdisk_stream_enqueue, s);
+       if (err < 0)
+               goto out;
+
+       s->enqueue_event_id = err;
+       err = 0;
+
+out:
+       if (err)
+               fprintf(stderr, "failed to register event: %d\n", err);
+       return err;
+}
+
+/*
+ * Unregister the enqueue event (an id of 0 means "none registered") and
+ * close the wake-up pipe.
+ */
+static void
+tapdisk_stream_unregister_enqueue_event(struct tapdisk_stream *s)
+{
+       if (s->enqueue_event_id) {
+               tapdisk_server_unregister_event(s->enqueue_event_id);
+               s->enqueue_event_id = 0;
+       }
+       tapdisk_stream_poll_close(&s->poll);
+}
+
+/*
+ * Put @s into a well-defined "nothing opened yet" state.
+ *
+ * The memset alone would leave the wake-up pipe descriptors at 0, which
+ * tapdisk_stream_poll_close() treats as open; resetting them to -1 here
+ * prevents an early failure path from closing descriptor 0.
+ */
+static inline void
+tapdisk_stream_initialize(struct tapdisk_stream *s)
+{
+       memset(s, 0, sizeof(*s));
+       s->in_fd = s->out_fd = -1;
+       tapdisk_stream_poll_initialize(&s->poll);
+       INIT_LIST_HEAD(&s->free_list);
+       INIT_LIST_HEAD(&s->pending_list);
+       INIT_LIST_HEAD(&s->completed_list);
+}
+
+/*
+ * Duplicate stdout as the stream's output descriptor.
+ *
+ * Returns 0 on success or the (positive) errno value from dup().  errno
+ * is captured before fprintf() runs, because that call may change it.
+ */
+static int
+tapdisk_stream_open_fds(struct tapdisk_stream *s)
+{
+       s->out_fd = dup(STDOUT_FILENO);
+       if (s->out_fd == -1) {
+               int err = errno;
+
+               fprintf(stderr, "failed to open output: %d\n", err);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Prepare @s for streaming: reset it, then open the output descriptor,
+ * the image, the sector range, the request pool and the enqueue event, in
+ * that order.  The first failing step ends the sequence.
+ *
+ * Returns 0 on success or the error of the step that failed.  Nothing is
+ * rolled back here; the caller is expected to call
+ * tapdisk_stream_release().
+ */
+static int
+tapdisk_stream_open(struct tapdisk_stream *s, const char *path,
+                   int type, uint64_t count, uint64_t skip)
+{
+       int rc;
+
+       tapdisk_stream_initialize(s);
+
+       rc = tapdisk_stream_open_fds(s);
+       if (!rc)
+               rc = tapdisk_stream_open_image(s, path, type);
+       if (!rc)
+               rc = tapdisk_stream_set_position(s, count, skip);
+       if (!rc)
+               rc = tapdisk_stream_initialize_requests(s);
+       if (!rc)
+               rc = tapdisk_stream_register_enqueue_event(s);
+
+       return rc;
+}
+
+/*
+ * Undo tapdisk_stream_open(): close the output descriptor, close the
+ * image and drop the enqueue event with its pipe.  Called from main()
+ * after a failed open as well.
+ */
+static void
+tapdisk_stream_release(struct tapdisk_stream *s)
+{
+       close(s->out_fd);
+       tapdisk_stream_close_image(s);
+       tapdisk_stream_unregister_enqueue_event(s);
+}
+
+/*
+ * Prime the first batch of requests by calling the enqueue callback
+ * directly, then run the server loop until it stops.  Returns the
+ * stream's recorded error (0 if none); the result of
+ * tapdisk_server_run() is ignored.
+ */
+static int
+tapdisk_stream_run(struct tapdisk_stream *s)
+{
+       tapdisk_stream_enqueue(s->enqueue_event_id, SCHEDULER_POLL_READ_FD, s);
+       tapdisk_server_run();
+       return s->err;
+}
+
+/*
+ * tapdisk-stream entry point.
+ *
+ * Options: -n type:/path (required), -c sector count, -s sectors to skip,
+ * -h help.  Opens the image, streams the selected range to stdout and
+ * returns the resulting error code as the exit status.
+ * The -c / -s values go through strtoull() without validation.
+ */
+int
+main(int argc, char *argv[])
+{
+       int c, err, type;
+       const char *params;
+       const disk_info_t *info;
+       const char *path;
+       uint64_t count, skip;
+       struct tapdisk_stream stream;
+
+       err    = 0;
+       skip   = 0;
+       count  = (uint64_t)-1;
+       params = NULL;
+
+       while ((c = getopt(argc, argv, "n:c:s:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       params = optarg;
+                       break;
+               case 'c':
+                       count = strtoull(optarg, NULL, 10);
+                       break;
+               case 's':
+                       skip = strtoull(optarg, NULL, 10);
+                       break;
+               default:
+                       err = EINVAL;
+                       /* fallthrough: unknown option prints usage with EINVAL */
+               case 'h':
+                       /* usage() calls exit() and does not return */
+                       usage(argv[0], err);
+               }
+       }
+
+       if (!params)
+               usage(argv[0], EINVAL);
+
+       type = tapdisk_disktype_parse_params(params, &path);
+       if (type < 0) {
+               err = type;
+               fprintf(stderr, "invalid argument %s: %d\n", params, err);
+               return err;
+       }
+
+       tapdisk_start_logging("tapdisk-stream");
+
+       err = tapdisk_stream_open(&stream, path, type, count, skip);
+       if (err)
+               goto out;
+
+       err = tapdisk_stream_run(&stream);
+       if (err)
+               goto out;
+
+       err = 0;
+
+out:
+       /* release runs on every path, including a partially failed open */
+       tapdisk_stream_release(&stream);
+       tapdisk_stop_logging();
+       return err;
+}
diff --git a/tools/blktap2/drivers/tapdisk-utils.c b/tools/blktap2/drivers/tapdisk-utils.c
new file mode 100644 (file)
index 0000000..44840ef
--- /dev/null
@@ -0,0 +1,214 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/utsname.h>
+#ifdef __linux__
+#include <linux/version.h>
+#endif
+
+#include "blk.h"
+#include "tapdisk.h"
+#include "blktaplib.h"
+#include "tapdisk-log.h"
+#include "tapdisk-utils.h"
+
+/*
+ * Open syslog with the identity "<name>[<pid>]" and open the tlog
+ * backend on /tmp/tapdisk.log.
+ */
+void
+tapdisk_start_logging(const char *name)
+{
+       /* static, so the identity string stays valid after we return */
+       static char ident[128];
+
+       snprintf(ident, sizeof(ident), "%s[%d]", name, getpid());
+
+       openlog(ident, LOG_CONS | LOG_ODELAY, LOG_DAEMON);
+       open_tlog("/tmp/tapdisk.log", (64 << 10), TLOG_WARN, 0);
+}
+
+/*
+ * Counterpart of tapdisk_start_logging(): closes syslog, then the tlog
+ * backend.
+ */
+void
+tapdisk_stop_logging(void)
+{
+       closelog();
+       close_tlog();
+}
+
+/*
+ * Raise RLIMIT_MEMLOCK to infinity, lock all current and future pages
+ * with mlockall(), and (while CORE_DUMP is defined) raise RLIMIT_CORE
+ * to infinity as well.
+ *
+ * Returns 0 on success or a negative errno value if the memlock limit
+ * or mlockall() fails. A failure to raise RLIMIT_CORE is only logged.
+ */
+int
+tapdisk_set_resource_limits(void)
+{
+       int err;
+       struct rlimit rlim;
+
+       rlim.rlim_cur = RLIM_INFINITY;
+       rlim.rlim_max = RLIM_INFINITY;
+
+       if (setrlimit(RLIMIT_MEMLOCK, &rlim) == -1) {
+               /* capture errno before logging can overwrite it */
+               err = errno;
+               EPRINTF("RLIMIT_MEMLOCK failed: %d\n", err);
+               return -err;
+       }
+
+       if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1) {
+               err = errno;
+               EPRINTF("mlockall failed: %d\n", err);
+               return -err;
+       }
+
+#define CORE_DUMP
+#if defined(CORE_DUMP)
+       /* best effort: a failure here does not fail the call */
+       if (setrlimit(RLIMIT_CORE, &rlim) == -1)
+               EPRINTF("RLIMIT_CORE failed: %d\n", errno);
+#endif
+
+       return 0;
+}
+
+/*
+ * Duplicate @name into freshly allocated storage returned via @dup.
+ *
+ * *dup is set to NULL on every failure path. Returns 0 on success,
+ * -ENAMETOOLONG when @name has MAX_NAME_LEN or more characters, or
+ * -ENOMEM when strdup() fails.
+ */
+int
+tapdisk_namedup(char **dup, const char *name)
+{
+       char *copy;
+
+       *dup = NULL;
+
+       if (strnlen(name, MAX_NAME_LEN) >= MAX_NAME_LEN)
+               return -ENAMETOOLONG;
+
+       copy = strdup(name);
+       if (!copy)
+               return -ENOMEM;
+
+       *dup = copy;
+       return 0;
+}
+
+/*
+ * Determine the size of the image behind @fd.
+ *
+ * For a block device the sector count comes from blk_getimagesize() and
+ * the sector size from blk_getsectorsize() (DEFAULT_SECTOR_SIZE if that
+ * fails). For anything else the sector count is st_size >> SECTOR_SHIFT
+ * with DEFAULT_SECTOR_SIZE sectors. If the resulting count is zero, a
+ * built-in fallback of 16836057 sectors is reported instead.
+ *
+ * On success returns 0 and stores the results in *_sectors and
+ * *_sector_size. On failure returns -EINVAL and leaves both outputs 0.
+ */
+int
+tapdisk_get_image_size(int fd, uint64_t *_sectors, uint32_t *_sector_size)
+{
+       struct stat stat;
+       uint64_t sectors;
+       uint64_t sector_size;
+
+       sectors       = 0;
+       sector_size   = 0;
+       *_sectors     = 0;
+       *_sector_size = 0;
+
+       if (fstat(fd, &stat)) {
+               DPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (S_ISBLK(stat.st_mode)) {
+               /*Accessing block device directly*/
+               if (blk_getimagesize(fd, &sectors) != 0)
+                       return -EINVAL;
+
+               /*Get the sector size*/
+               if (blk_getsectorsize(fd, &sector_size) != 0)
+                       sector_size = DEFAULT_SECTOR_SIZE;
+       } else {
+               /*Local file? try fstat instead*/
+               sectors     = (stat.st_size >> SECTOR_SHIFT);
+               sector_size = DEFAULT_SECTOR_SIZE;
+       }
+
+       if (sectors == 0) {
+               /* NOTE(review): hard-coded fallback size; origin of the
+                * constant is not documented in this file */
+               sectors     = 16836057ULL;
+               sector_size = DEFAULT_SECTOR_SIZE;
+       }
+
+       /* hand the results back; previously they were computed and then
+        * discarded, so callers always saw 0/0 */
+       *_sectors     = sectors;
+       *_sector_size = (uint32_t)sector_size;
+
+       return 0;
+}
+
+#ifdef __linux__
+
+/*
+ * Return the running kernel's version encoded with KERNEL_VERSION(),
+ * parsed from the "a.b.c" prefix of uname()'s release string.
+ *
+ * Returns -errno if uname() fails and -ENOSYS if the release string
+ * does not start with three dot-separated unsigned numbers.
+ */
+int tapdisk_linux_version(void)
+{
+       struct utsname info;
+       unsigned int major, minor, patch;
+       int fields;
+
+       if (uname(&info))
+               return -errno;
+
+       fields = sscanf(info.release, "%u.%u.%u", &major, &minor, &patch);
+       if (fields != 3)
+               return -ENOSYS;
+
+       return KERNEL_VERSION(major, minor, patch);
+}
+
+#else
+
+/* Non-Linux builds have no kernel version to report. */
+int tapdisk_linux_version(void)
+{
+       return -ENOSYS;
+}
+
+#endif
+/*
+ * Read exactly @size bytes from @fd into @data, restarting reads that
+ * are interrupted by a signal (EINTR).
+ *
+ * Returns 0 once all bytes have arrived. Returns -1 on a read error
+ * (errno as set by read()) or on end-of-file before @size bytes were
+ * read, in which case errno is set to 0.
+ */
+int read_exact(int fd, void *data, size_t size)
+{
+    char *cursor = data;
+    size_t remaining = size;
+
+    while ( remaining > 0 )
+    {
+        ssize_t got = read(fd, cursor, remaining);
+
+        if ( got > 0 )
+        {
+            cursor    += got;
+            remaining -= got;
+            continue;
+        }
+
+        if ( got == 0 )
+        {
+            /* premature EOF: flag it with errno == 0 */
+            errno = 0;
+            return -1;
+        }
+
+        if ( (got == -1) && (errno == EINTR) )
+            continue;
+
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Write exactly @size bytes from @data to @fd, restarting writes that
+ * are interrupted by a signal (EINTR).
+ *
+ * Returns 0 once everything has been written, or -1 if write() fails
+ * or reports that zero bytes were written.
+ */
+int write_exact(int fd, const void *data, size_t size)
+{
+    const char *cursor = data;
+    size_t remaining = size;
+
+    while ( remaining > 0 )
+    {
+        ssize_t put = write(fd, cursor, remaining);
+
+        if ( put > 0 )
+        {
+            cursor    += put;
+            remaining -= put;
+            continue;
+        }
+
+        if ( (put == -1) && (errno == EINTR) )
+            continue;
+
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/tools/blktap2/drivers/tapdisk-utils.h b/tools/blktap2/drivers/tapdisk-utils.h
new file mode 100644 (file)
index 0000000..aced8ef
--- /dev/null
@@ -0,0 +1,45 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_UTILS_H_
+#define _TAPDISK_UTILS_H_
+
+#include <inttypes.h>
+
+#define MAX_NAME_LEN                 1000
+
+void tapdisk_start_logging(const char *);
+void tapdisk_stop_logging(void);
+int tapdisk_set_resource_limits(void);
+int tapdisk_namedup(char **, const char *);
+int tapdisk_get_image_size(int, uint64_t *, uint32_t *);
+int tapdisk_linux_version(void);
+
+int read_exact(int fd, void *data, size_t size); /* EOF => -1, errno=0 */
+int write_exact(int fd, const void *data, size_t size);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-vbd.c b/tools/blktap2/drivers/tapdisk-vbd.c
new file mode 100644 (file)
index 0000000..fd4999a
--- /dev/null
@@ -0,0 +1,1723 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <regex.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <libgen.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#ifdef MEMSHR
+#include <memshr.h>
+#endif
+
+#include "tapdisk-image.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-disktype.h"
+#include "tapdisk-vbd.h"
+#include "blktap2.h"
+
+#define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a)
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/*
+ * ASSERT(p): when enabled (the "#if 1" arm), log the failed expression
+ * together with line and file through DPRINTF and then abort().
+ * Switching the condition to 0 compiles every ASSERT down to a no-op.
+ */
+#if 1
+#define ASSERT(p)                                                      \
+       do {                                                            \
+               if (!(p)) {                                             \
+                       DPRINTF("Assertion '%s' failed, line %d, "      \
+                               "file %s", #p, __LINE__, __FILE__);     \
+                       abort();                                        \
+               }                                                       \
+       } while (0)
+#else
+#define ASSERT(p) ((void)0)
+#endif
+
+
+#define TD_VBD_EIO_RETRIES          10
+#define TD_VBD_EIO_SLEEP            1
+#define TD_VBD_WATCHDOG_TIMEOUT     10
+
+static void tapdisk_vbd_ring_event(event_id_t, char, void *);
+static void tapdisk_vbd_callback(void *, blkif_response_t *);
+
+/* 
+ * initialization
+ */
+
+/* Zero a request slot and make its list link an empty, self-linked node. */
+static inline void
+tapdisk_vbd_initialize_vreq(td_vbd_request_t *vreq)
+{
+       memset(vreq, 0, sizeof(*vreq));
+       INIT_LIST_HEAD(&vreq->next);
+}
+
+/*
+ * Release a vbd: free its driver stack, unlink it via its 'next' node,
+ * then free the name string and the structure itself.
+ * A NULL @vbd is ignored.
+ */
+void
+tapdisk_vbd_free(td_vbd_t *vbd)
+{
+       if (!vbd)
+               return;
+
+       tapdisk_vbd_free_stack(vbd);
+       list_del_init(&vbd->next);
+       free(vbd->name);
+       free(vbd);
+}
+
+/*
+ * Allocate and initialise a vbd for @uuid.
+ *
+ * The new vbd starts with minor and ring fd set to -1, the default
+ * blktap ring completion callback (with the vbd itself as argument),
+ * empty image/driver/request lists, the current time in 'ts', and every
+ * entry of request_list initialised.
+ *
+ * Returns the vbd, or NULL if the allocation fails.
+ */
+td_vbd_t*
+tapdisk_vbd_create(uint16_t uuid)
+{
+       int slot;
+       td_vbd_t *vbd = calloc(1, sizeof(*vbd));
+
+       if (vbd == NULL) {
+               EPRINTF("failed to allocate tapdisk state\n");
+               return NULL;
+       }
+
+       vbd->uuid     = uuid;
+       vbd->minor    = -1;
+       vbd->ring.fd  = -1;
+
+       /* default blktap ring completion */
+       vbd->callback = tapdisk_vbd_callback;
+       vbd->argument = vbd;
+
+#ifdef MEMSHR
+       memshr_vbd_initialize();
+#endif
+
+       /* image and driver bookkeeping */
+       INIT_LIST_HEAD(&vbd->driver_stack);
+       INIT_LIST_HEAD(&vbd->images);
+
+       /* request queues */
+       INIT_LIST_HEAD(&vbd->new_requests);
+       INIT_LIST_HEAD(&vbd->pending_requests);
+       INIT_LIST_HEAD(&vbd->failed_requests);
+       INIT_LIST_HEAD(&vbd->completed_requests);
+
+       INIT_LIST_HEAD(&vbd->next);
+       gettimeofday(&vbd->ts, NULL);
+
+       for (slot = 0; slot < MAX_REQUESTS; slot++)
+               tapdisk_vbd_initialize_vreq(&vbd->request_list[slot]);
+
+       return vbd;
+}
+
+/*
+ * Create a vbd for @uuid and register it with the tapdisk server.
+ *
+ * Returns 0 on success, -EEXIST if the server already has a vbd with
+ * this uuid, or -ENOMEM if the vbd cannot be allocated.
+ */
+int
+tapdisk_vbd_initialize(uint16_t uuid)
+{
+       td_vbd_t *vbd;
+
+       vbd = tapdisk_server_get_vbd(uuid);
+       if (vbd) {
+               EPRINTF("duplicate vbds! %u\n", uuid);
+               return -EEXIST;
+       }
+
+       vbd = tapdisk_vbd_create(uuid);
+       if (!vbd)
+               /* tapdisk_vbd_create() has already logged the failure;
+                * never hand a NULL vbd to the server */
+               return -ENOMEM;
+
+       tapdisk_server_add_vbd(vbd);
+
+       return 0;
+}
+
+/*
+ * Replace the vbd's completion callback and the opaque argument stored
+ * alongside it (tapdisk_vbd_create() installs tapdisk_vbd_callback with
+ * the vbd itself as the default).
+ */
+void
+tapdisk_vbd_set_callback(td_vbd_t *vbd, td_vbd_cb_t callback, void *argument)
+{
+       vbd->callback = callback;
+       vbd->argument = argument;
+}
+
+/*
+ * Walk the vbd's image list, logging each image's name and type, and
+ * run td_validate_parent() on every image against the one that follows
+ * it. The walk stops at the last image.
+ *
+ * Returns 0 if every pair validates, otherwise the first error reported
+ * by td_validate_parent().
+ */
+static int
+tapdisk_vbd_validate_chain(td_vbd_t *vbd)
+{
+       int err;
+       td_image_t *image, *next, *tmp;
+
+       DPRINTF("VBD CHAIN:\n");
+
+       tapdisk_vbd_for_each_image(vbd, image, tmp) {
+               DPRINTF("%s: %d\n", image->name, image->type);
+
+               /* the last image has nothing after it to validate against */
+               if (tapdisk_vbd_is_last_image(vbd, image))
+                       break;
+
+               next = tapdisk_vbd_next_image(image);
+               err  = td_validate_parent(image, next);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Close and free every image on the vbd, reset the image list to
+ * empty, set TD_VBD_CLOSED in the vbd state, and free the driver stack.
+ */
+void
+tapdisk_vbd_close_vdi(td_vbd_t *vbd)
+{
+       td_image_t *image, *tmp;
+
+       tapdisk_vbd_for_each_image(vbd, image, tmp) {
+               td_close(image);
+               tapdisk_image_free(image);
+       }
+
+       INIT_LIST_HEAD(&vbd->images);
+       td_flag_set(vbd->state, TD_VBD_CLOSED);
+
+       /* NOTE(review): this empties vbd->driver_stack, so a subsequent
+        * __tapdisk_vbd_open_vdi() on the same vbd returns -ENOENT unless
+        * the stack is parsed again */
+       tapdisk_vbd_free_stack(vbd);
+}
+
+/*
+ * Put a DISK_TYPE_BLOCK_CACHE image in front of the first image in the
+ * chain that is flagged both TD_OPEN_RDONLY and TD_OPEN_SHAREABLE.
+ *
+ * An existing cache is loaded with td_load() if possible; otherwise a
+ * new cache driver is allocated, given the target driver's info, and
+ * opened with td_open().
+ *
+ * Returns 0 if the cache was inserted or if no suitable target exists,
+ * -ENOMEM on allocation failure, -ENODEV if the target has no driver,
+ * or the td_open() error. On failure the image chain is left untouched.
+ */
+static int
+tapdisk_vbd_add_block_cache(td_vbd_t *vbd)
+{
+       int err;
+       td_image_t *cache, *image, *target, *tmp;
+
+       target = NULL;
+
+       tapdisk_vbd_for_each_image(vbd, image, tmp)
+               if (td_flag_test(image->flags, TD_OPEN_RDONLY) &&
+                   td_flag_test(image->flags, TD_OPEN_SHAREABLE)) {
+                       target = image;
+                       break;
+               }
+
+       if (!target)
+               return 0;
+
+       cache = tapdisk_image_allocate(target->name,
+                                      DISK_TYPE_BLOCK_CACHE,
+                                      target->storage,
+                                      target->flags,
+                                      target->private);
+       if (!cache)
+               return -ENOMEM;
+
+       /* try to load existing cache */
+       err = td_load(cache);
+       if (!err)
+               goto done;
+
+       /* hack driver to send open() correct image size */
+       if (!target->driver) {
+               err = -ENODEV;
+               goto fail;
+       }
+
+       cache->driver = tapdisk_driver_allocate(cache->type,
+                                               cache->name,
+                                               cache->flags,
+                                               cache->storage);
+       if (!cache->driver) {
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       cache->driver->info = target->driver->info;
+
+       /* try to open new cache */
+       err = td_open(cache);
+       if (!err)
+               goto done;
+
+fail:
+       /* give up: release the cache image built here. The target must
+        * not be freed -- it is still linked into vbd->images. */
+       tapdisk_image_free(cache);
+       return err;
+
+done:
+       /* insert cache before image */
+       list_add(&cache->next, target->next.prev);
+       return 0;
+}
+
+/*
+ * Put a DISK_TYPE_LOG image at the head of the vbd's image list.
+ *
+ * The log image reuses the name, storage and flags of the current
+ * first image and inherits that image's driver info before td_open().
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error
+ * from td_open(); the log image is freed on every failure path.
+ */
+static int
+tapdisk_vbd_add_dirty_log(td_vbd_t *vbd)
+{
+       int err;
+       td_driver_t *driver;
+       td_image_t *log, *top;
+
+       top = tapdisk_vbd_first_image(vbd);
+
+       log = tapdisk_image_allocate(top->name, DISK_TYPE_LOG,
+                                    top->storage, top->flags, vbd);
+       if (!log)
+               return -ENOMEM;
+
+       driver = tapdisk_driver_allocate(log->type, log->name,
+                                        log->flags, log->storage);
+       if (!driver) {
+               tapdisk_image_free(log);
+               return -ENOMEM;
+       }
+
+       driver->info = top->driver->info;
+       log->driver  = driver;
+
+       err = td_open(log);
+       if (err) {
+               tapdisk_image_free(log);
+               return err;
+       }
+
+       list_add(&log->next, &vbd->images);
+       return 0;
+}
+
+/*
+ * Open one level of the driver stack.
+ *
+ * Starting from (@params, @driver_type), open the image and then keep
+ * following td_get_parent_id() to open each parent in turn, appending
+ * every opened image to @head. Parents are opened with TD_OPEN_RDONLY
+ * and TD_OPEN_SHAREABLE added to @flags. The walk ends when
+ * td_get_parent_id() returns TD_NO_PARENT or -EINVAL.
+ *
+ * Returns 0 with @head holding the opened images, or a negative error
+ * with every image opened so far closed and freed and @head empty.
+ */
+static int
+tapdisk_vbd_open_level(td_vbd_t *vbd, struct list_head *head,
+                      const char *params, int driver_type,
+                      td_disk_info_t *driver_info, td_flag_t flags)
+{
+       const char *name;
+       int type, err;
+       td_image_t *image;
+       td_disk_id_t id;
+
+       name    = params;
+       id.name = NULL;
+       type    = driver_type;
+       INIT_LIST_HEAD(head);
+
+       for (;;) {
+               err   = -ENOMEM;
+               image = tapdisk_image_allocate(name, type,
+                                              vbd->storage, flags, vbd);
+
+               /* release the parent name from the previous pass
+                * (presumably copied by tapdisk_image_allocate();
+                * free(NULL) is a no-op on the first pass) */
+               free(id.name);
+               id.name = NULL;
+
+               if (!image)
+                       goto out;
+
+
+               /* this breaks if a driver modifies its info within a layer */
+               err = __td_open(image, driver_info);
+               if (err)
+                       goto out;
+
+               /* TODO: non-sink drivers that don't care about their child
+                * currently return EINVAL. Could return TD_PARENT_OK or
+                * TD_ANY_PARENT */
+
+               err = td_get_parent_id(image, &id);
+               if (err && (err != TD_NO_PARENT && err != -EINVAL))
+                       /* the cleanup below closes and frees the image
+                        * exactly once */
+                       goto out;
+
+               /* add this image to the end of the list */
+               list_add_tail(&image->next, head);
+               image = NULL;
+
+               /* if the image does not have a parent we return the
+                * list of images generated by this level of the stack */
+               if (err == TD_NO_PARENT || err == -EINVAL) {
+                       err = 0;
+                       goto out;
+               }
+
+               name   = id.name;
+               type   = id.drivertype;
+
+               flags |= (TD_OPEN_RDONLY | TD_OPEN_SHAREABLE);
+       }
+
+out:
+       if (err) {
+               /* NOTE(review): when __td_open() itself failed, td_close()
+                * is still called on the image, as before -- confirm that
+                * td_close() tolerates a never-opened image */
+               if (image) {
+                       td_close(image);
+                       tapdisk_image_free(image);
+               }
+               while (!list_empty(head)) {
+                       /* head->next is the first node; unlink it so the
+                        * loop makes progress whether or not
+                        * tapdisk_image_free() unlinks too */
+                       image = list_entry(head->next, td_image_t, next);
+                       td_close(image);
+                       list_del_init(&image->next);
+                       tapdisk_image_free(image);
+               }
+       }
+
+       return err;
+}
+
+/*
+ * Open every level of the vbd's driver stack and assemble the image
+ * chain, then optionally add the dirty log (TD_OPEN_LOG_DIRTY) and the
+ * block cache (TD_OPEN_ADD_CACHE), validate the chain and clear
+ * TD_VBD_CLOSED.
+ *
+ * The top-level open uses vbd->flags with TD_OPEN_SHAREABLE removed and
+ * @extra_flags added.
+ *
+ * Returns 0 on success, -ENOENT if the driver stack is empty, or the
+ * first error hit; on error tapdisk_vbd_close_vdi() is called.
+ */
+static int
+__tapdisk_vbd_open_vdi(td_vbd_t *vbd, td_flag_t extra_flags)
+{
+       int err;
+       td_flag_t flags;
+       td_image_t *top;
+       td_vbd_driver_info_t *level;
+       td_disk_info_t *parent_info = NULL;
+
+       if (list_empty(&vbd->driver_stack))
+               return -ENOENT;
+
+       flags = (vbd->flags & ~TD_OPEN_SHAREABLE) | extra_flags;
+
+       /* loop on each user specified driver.
+        * NOTE: the stack is in reverse order. That is, the first
+        * item is the 'parent' or 'sink' driver */
+       list_for_each_entry(level, &vbd->driver_stack, next) {
+               LIST_HEAD(opened);
+
+               err = tapdisk_vbd_open_level(vbd, &opened, level->params,
+                                            level->type, parent_info, flags);
+               if (err)
+                       goto fail;
+
+               /* splice this level's images onto the result list */
+               list_splice(&opened, &vbd->images);
+
+               /* the next level is opened against the disk info of the
+                * image now first on the list */
+               top         = tapdisk_vbd_first_image(vbd);
+               parent_info = &top->info;
+       }
+
+       if (td_flag_test(vbd->flags, TD_OPEN_LOG_DIRTY)) {
+               err = tapdisk_vbd_add_dirty_log(vbd);
+               if (err)
+                       goto fail;
+       }
+
+       if (td_flag_test(vbd->flags, TD_OPEN_ADD_CACHE)) {
+               err = tapdisk_vbd_add_block_cache(vbd);
+               if (err)
+                       goto fail;
+       }
+
+       err = tapdisk_vbd_validate_chain(vbd);
+       if (!err) {
+               td_flag_clear(vbd->state, TD_VBD_CLOSED);
+               return 0;
+       }
+
+fail:
+       tapdisk_vbd_close_vdi(vbd);
+       return err;
+}
+
+/*
+ * Populate vbd->driver_stack from @path.
+ *
+ * @path is a '|'-separated list of driver strings. Each one is parsed
+ * with tapdisk_parse_disk_type() into a type and a parameter string and
+ * pushed onto the front of the driver stack, so the last driver named
+ * in @path ends up first in the list.
+ *
+ * Returns 0 on success. On failure returns a negative error (from
+ * tapdisk_namedup(), tapdisk_parse_disk_type(), or -ENOMEM) and frees
+ * any stack entries added so far.
+ */
+int
+tapdisk_vbd_parse_stack(td_vbd_t *vbd, const char *path)
+{
+       int err;
+       char *params, *driver_str;
+       td_vbd_driver_info_t *driver;
+
+       err = tapdisk_namedup(&params, path);
+       if (err)
+               return err;
+
+       /* tokenize params based on pipe '|' */
+       driver_str = strtok(params, "|");
+       while (driver_str != NULL) {
+               const char *driver_path;
+               int type;
+
+               /* parse driver info and add to vbd */
+               driver = calloc(1, sizeof(td_vbd_driver_info_t));
+               if (!driver) {
+                       PERROR("malloc");
+                       /* do not trust errno after logging */
+                       err = -ENOMEM;
+                       goto out;
+               }
+               INIT_LIST_HEAD(&driver->next);
+
+               err = tapdisk_parse_disk_type(driver_str, &driver_path, &type);
+               if (err) {
+                       free(driver);
+                       goto out;
+               }
+
+               driver->type   = type;
+               driver->params = strdup(driver_path);
+               if (!driver->params) {
+                       err = -ENOMEM;
+                       free(driver);
+                       goto out;
+               }
+
+               /* build the list backwards as the last driver will be the
+                * first driver to open in the stack */
+               list_add(&driver->next, &vbd->driver_stack);
+
+               /* get next driver string */
+               driver_str = strtok(NULL, "|");
+       }
+
+out:
+       free(params);
+       if (err)
+               tapdisk_vbd_free_stack(vbd);
+
+       return err;
+}
+
+/*
+ * Empty vbd->driver_stack, freeing each entry together with its
+ * parameter string.
+ */
+void
+tapdisk_vbd_free_stack(td_vbd_t *vbd)
+{
+       struct list_head *stack = &vbd->driver_stack;
+
+       while (!list_empty(stack)) {
+               td_vbd_driver_info_t *entry;
+
+               /* always take the current first element */
+               entry = list_entry(stack->next, td_vbd_driver_info_t, next);
+
+               list_del(&entry->next);
+               free(entry->params);
+               free(entry);
+       }
+}
+
+/*
+ * Record @flags and @storage on the vbd and open its (already parsed)
+ * driver stack, retrying up to TD_VBD_EIO_RETRIES times with a
+ * TD_VBD_EIO_SLEEP second pause while the open fails with -EIO.
+ *
+ * NOTE: driver type, etc. must be set
+ *
+ * Returns the result of the last __tapdisk_vbd_open_vdi() attempt.
+ */
+int
+tapdisk_vbd_open_stack(td_vbd_t *vbd, uint16_t storage, td_flag_t flags)
+{
+       int attempt, err = 0;
+
+       vbd->flags   = flags;
+       vbd->storage = storage;
+
+       for (attempt = 0; attempt < TD_VBD_EIO_RETRIES; attempt++) {
+               err = __tapdisk_vbd_open_vdi(vbd, 0);
+               if (err != -EIO)
+                       break;
+
+               sleep(TD_VBD_EIO_SLEEP);
+       }
+
+       return err;
+}
+
+/*
+ * Open a VDI on the vbd: look up the driver for @drivertype, store a
+ * copy of @path as the vbd name, record @flags and @storage, and open
+ * the driver stack with the same -EIO retry policy as
+ * tapdisk_vbd_open_stack().
+ *
+ * Returns 0 on success, -EINVAL if no driver is registered for
+ * @drivertype, the tapdisk_namedup() error, or the open error (in which
+ * case the name is freed and reset to NULL).
+ */
+int
+tapdisk_vbd_open_vdi(td_vbd_t *vbd, const char *path,
+                    uint16_t drivertype, uint16_t storage, td_flag_t flags)
+{
+       int attempt, err;
+       const struct tap_disk *ops = tapdisk_disk_drivers[drivertype];
+
+       if (!ops)
+               return -EINVAL;
+       DPRINTF("Loaded %s driver for vbd %u %s 0x%08x\n",
+               ops->disk_type, vbd->uuid, path, flags);
+
+       err = tapdisk_namedup(&vbd->name, path);
+       if (err)
+               return err;
+
+       vbd->flags   = flags;
+       vbd->storage = storage;
+
+       for (attempt = 0; attempt < TD_VBD_EIO_RETRIES; attempt++) {
+               err = __tapdisk_vbd_open_vdi(vbd, 0);
+               if (err != -EIO)
+                       break;
+
+               sleep(TD_VBD_EIO_SLEEP);
+       }
+
+       if (!err)
+               return 0;
+
+       /* open failed: drop the name copied above */
+       free(vbd->name);
+       vbd->name = NULL;
+       return err;
+}
+
+/*
+ * Register tapdisk_vbd_ring_event() as the read-poll handler for the
+ * vbd's ring fd and remember the event id.
+ *
+ * Returns 0 on success or the negative id returned by the server.
+ */
+static int
+tapdisk_vbd_register_event_watches(td_vbd_t *vbd)
+{
+       event_id_t id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                                     vbd->ring.fd, 0,
+                                                     tapdisk_vbd_ring_event,
+                                                     vbd);
+
+       if (id < 0)
+               return id;
+
+       vbd->ring_event_id = id;
+       return 0;
+}
+
+/* Unregister the ring event, if one was registered (non-zero id). */
+static void
+tapdisk_vbd_unregister_events(td_vbd_t *vbd)
+{
+       if (!vbd->ring_event_id)
+               return;
+
+       tapdisk_server_unregister_event(vbd->ring_event_id);
+}
+
+/*
+ * Open the blktap device @devname, map BLKTAP_MMAP_REGION_SIZE pages of
+ * it, initialise the back ring over the start of the mapping, set
+ * 'vstart' to the address BLKTAP_RING_PAGES pages into the mapping, and
+ * switch the device to BLKTAP_MODE_INTERPOSE.
+ *
+ * Returns 0 on success or -errno from open()/mmap(); on failure the
+ * ring's fd is reset to -1 and its mapping pointer to NULL.
+ */
+static int
+tapdisk_vbd_map_device(td_vbd_t *vbd, const char *devname)
+{
+       int err;
+       int psize       = getpagesize();
+       td_ring_t *ring = &vbd->ring;
+
+       ring->fd = open(devname, O_RDWR);
+       if (ring->fd == -1) {
+               err = -errno;
+               EPRINTF("failed to open %s: %d\n", devname, err);
+               goto fail;
+       }
+
+       ring->mem = mmap(0, psize * BLKTAP_MMAP_REGION_SIZE,
+                        PROT_READ | PROT_WRITE, MAP_SHARED, ring->fd, 0);
+       if (ring->mem == MAP_FAILED) {
+               err = -errno;
+               EPRINTF("failed to mmap %s: %d\n", devname, err);
+               goto fail;
+       }
+
+       /* the shared ring sits at the very start of the mapping */
+       ring->sring = (blkif_sring_t *)((unsigned long)ring->mem);
+       BACK_RING_INIT(&ring->fe_ring, ring->sring, psize);
+
+       ring->vstart =
+               (unsigned long)ring->mem + (BLKTAP_RING_PAGES * psize);
+
+       /* NOTE(review): the ioctl result is not checked */
+       ioctl(ring->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE);
+
+       return 0;
+
+fail:
+       if (ring->mem && ring->mem != MAP_FAILED)
+               munmap(ring->mem, psize * BLKTAP_MMAP_REGION_SIZE);
+       if (ring->fd != -1)
+               close(ring->fd);
+
+       ring->fd  = -1;
+       ring->mem = NULL;
+       return err;
+}
+
+/*
+ * Undo tapdisk_vbd_map_device(): close the ring fd and unmap the ring
+ * region, then reset both fields so that calling this again (for
+ * example via a second tapdisk_vbd_detach()) is harmless.
+ *
+ * Always returns 0.
+ */
+static int
+tapdisk_vbd_unmap_device(td_vbd_t *vbd)
+{
+       int psize;
+
+       psize = getpagesize();
+
+       if (vbd->ring.fd != -1) {
+               close(vbd->ring.fd);
+               /* avoid closing a recycled descriptor on a repeat call */
+               vbd->ring.fd = -1;
+       }
+
+       if (vbd->ring.mem && vbd->ring.mem != MAP_FAILED) {
+               munmap(vbd->ring.mem, psize * BLKTAP_MMAP_REGION_SIZE);
+               vbd->ring.mem = NULL;
+       }
+
+       return 0;
+}
+
+/*
+ * Detach the vbd from its blktap device: unregister the ring event,
+ * unmap and close the device, and reset the minor number to -1.
+ */
+void
+tapdisk_vbd_detach(td_vbd_t *vbd)
+{
+       tapdisk_vbd_unregister_events(vbd);
+
+       tapdisk_vbd_unmap_device(vbd);
+       vbd->minor = -1;
+}
+
+
+/*
+ * Attach the vbd to the blktap device @devname: map the device,
+ * register the ring event watch and record @minor.
+ *
+ * Returns 0 on success. On failure the vbd is detached again and the
+ * error from mapping or event registration is returned.
+ */
+int
+tapdisk_vbd_attach(td_vbd_t *vbd, const char *devname, int minor)
+{
+       int err;
+
+       err = tapdisk_vbd_map_device(vbd, devname);
+       if (!err)
+               err = tapdisk_vbd_register_event_watches(vbd);
+
+       if (err) {
+               tapdisk_vbd_detach(vbd);
+               return err;
+       }
+
+       vbd->minor = minor;
+       return 0;
+}
+
+/*
+ * Open the vbd's driver stack and attach it to the ring device @ring
+ * with minor number @minor.
+ *
+ * @name and @type are accepted but not used by this function.
+ *
+ * Returns 0 on success. On failure the vbd is detached, its VDI is
+ * closed, its name is freed and reset to NULL, and the error is
+ * returned.
+ */
+int
+tapdisk_vbd_open(td_vbd_t *vbd, const char *name, uint16_t type,
+                uint16_t storage, int minor, const char *ring, td_flag_t flags)
+{
+       int err;
+
+       err = tapdisk_vbd_open_stack(vbd, storage, flags);
+       if (!err)
+               err = tapdisk_vbd_attach(vbd, ring, minor);
+
+       if (!err)
+               return 0;
+
+       /* unwind everything, whichever step failed */
+       tapdisk_vbd_detach(vbd);
+       tapdisk_vbd_close_vdi(vbd);
+       free(vbd->name);
+       vbd->name = NULL;
+       return err;
+}
+
+/*
+ * Count the entries on each of the vbd's four request lists and store
+ * the totals in *new, *pending, *failed and *completed.
+ */
+static void
+tapdisk_vbd_queue_count(td_vbd_t *vbd, int *new,
+                       int *pending, int *failed, int *completed)
+{
+       int n, p, f, c;
+       td_vbd_request_t *vreq, *tvreq;
+
+       n = 0;
+       p = 0;
+       f = 0;
+       c = 0;
+
+       /* each loop body is just the counter increment */
+       tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->new_requests)
+               n++;
+
+       tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->pending_requests)
+               p++;
+
+       tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->failed_requests)
+               f++;
+
+       tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->completed_requests)
+               c++;
+
+       *new       = n;
+       *pending   = p;
+       *failed    = f;
+       *completed = c;
+}
+
+/*
+ * Tear the vbd down completely: log its final queue counts and
+ * statistics, close the VDI, detach from the device, remove the vbd
+ * from the server and free it.
+ *
+ * Returns -EAGAIN (and does nothing) while requests are still pending,
+ * 0 once the vbd has been freed. @vbd must not be used after a 0
+ * return.
+ */
+static int
+tapdisk_vbd_shutdown(td_vbd_t *vbd)
+{
+       int new, pending, failed, completed;
+
+       if (!list_empty(&vbd->pending_requests))
+               return -EAGAIN;
+
+       tapdisk_vbd_kick(vbd);
+       tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed);
+
+       DPRINTF("%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, "
+               "failed: 0x%02x, completed: 0x%02x\n", 
+               vbd->name, vbd->state, new, pending, failed, completed);
+       /* NOTE(review): tv_sec is printed with %ld and tv_usec (cast to
+        * unsigned long long) with %lld -- confirm these match the
+        * platform's types */
+       DPRINTF("last activity: %010ld.%06lld, errors: 0x%04"PRIx64", "
+               "retries: 0x%04"PRIx64", received: 0x%08"PRIx64", "
+               "returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n",
+               vbd->ts.tv_sec, (unsigned long long)vbd->ts.tv_usec,
+               vbd->errors, vbd->retries, vbd->received, vbd->returned,
+               vbd->kicked);
+
+       /* vbd is freed by the last call below */
+       tapdisk_vbd_close_vdi(vbd);
+       tapdisk_vbd_detach(vbd);
+       tapdisk_server_remove_vbd(vbd);
+       tapdisk_vbd_free(vbd);
+
+       tlog_print_errors();
+
+       return 0;
+}
+
+/*
+ * Close the vbd if it is idle.
+ *
+ * The vbd is considered busy when its pending list is non-empty, or
+ * when the queue is still ready and any of the new, failed or completed
+ * lists is non-empty. A busy vbd gets TD_VBD_SHUTDOWN_REQUESTED set and
+ * -EAGAIN is returned; otherwise the result of tapdisk_vbd_shutdown()
+ * is returned.
+ */
+int
+tapdisk_vbd_close(td_vbd_t *vbd)
+{
+       int busy;
+
+       /*
+        * don't close if any requests are pending in the aio layer
+        */
+       busy = !list_empty(&vbd->pending_requests);
+
+       /*
+        * if the queue is still active and we have more
+        * requests, try to complete them before closing.
+        */
+       if (!busy && tapdisk_vbd_queue_ready(vbd))
+               busy = (!list_empty(&vbd->new_requests) ||
+                       !list_empty(&vbd->failed_requests) ||
+                       !list_empty(&vbd->completed_requests));
+
+       if (!busy)
+               return tapdisk_vbd_shutdown(vbd);
+
+       td_flag_set(vbd->state, TD_VBD_SHUTDOWN_REQUESTED);
+       DBG(TLOG_WARN, "%s: requests pending\n", vbd->name);
+       return -EAGAIN;
+}
+
+/*
+ * control operations
+ */
+
+/*
+ * Write a TLOG_WARN record with the vbd's state, queue counts, last
+ * activity timestamp and counters, then call td_debug() on every image
+ * in the chain.
+ */
+void
+tapdisk_vbd_debug(td_vbd_t *vbd)
+{
+       td_image_t *image, *tmp;
+       int new, pending, failed, completed;
+
+       tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed);
+
+       DBG(TLOG_WARN, "%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, "
+           "failed: 0x%02x, completed: 0x%02x, last activity: %010ld.%06lld, "
+           "errors: 0x%04"PRIx64", retries: 0x%04"PRIx64", received: 0x%08"PRIx64", "
+           "returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n",
+           vbd->name, vbd->state, new, pending, failed, completed,
+           vbd->ts.tv_sec, (unsigned long long)vbd->ts.tv_usec,
+           vbd->errors, vbd->retries,
+           vbd->received, vbd->returned, vbd->kicked);
+
+       tapdisk_vbd_for_each_image(vbd, image, tmp)
+               td_debug(image);
+}
+
+/*
+ * Dump the vbd's debug state and flush the tlog, once: the
+ * TD_VBD_LOG_DROPPED state flag makes later calls no-ops.
+ */
+static void
+tapdisk_vbd_drop_log(td_vbd_t *vbd)
+{
+       if (!td_flag_test(vbd->state, TD_VBD_LOG_DROPPED)) {
+               tapdisk_vbd_debug(vbd);
+               tlog_flush();
+               td_flag_set(vbd->state, TD_VBD_LOG_DROPPED);
+       }
+}
+
+/*
+ * Fill *img with size, sector size and info of the vbd's first image.
+ *
+ * *img is always zeroed first.  Returns 0 on success or -EINVAL when
+ * the vbd has no images.
+ */
+int
+tapdisk_vbd_get_image_info(td_vbd_t *vbd, image_t *img)
+{
+       td_image_t *first;
+
+       memset(img, 0, sizeof(*img));
+
+       if (!list_empty(&vbd->images)) {
+               first        = tapdisk_vbd_first_image(vbd);
+               img->size    = first->info.size;
+               img->secsize = first->info.sector_size;
+               img->info    = first->info.info;
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+/*
+ * Returns 1 when the queue may process requests, i.e. when the vbd is
+ * not dead, closed, quiesced or about to be quiesced; 0 otherwise.
+ */
+int
+tapdisk_vbd_queue_ready(td_vbd_t *vbd)
+{
+       if (td_flag_test(vbd->state, TD_VBD_DEAD))
+               return 0;
+
+       if (td_flag_test(vbd->state, TD_VBD_CLOSED))
+               return 0;
+
+       if (td_flag_test(vbd->state, TD_VBD_QUIESCED))
+               return 0;
+
+       if (td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED))
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Returns the result of testing TD_VBD_RETRY_NEEDED in the vbd state.
+ */
+int
+tapdisk_vbd_retry_needed(td_vbd_t *vbd)
+{
+       int needed;
+
+       needed = td_flag_test(vbd->state, TD_VBD_RETRY_NEEDED);
+       return needed;
+}
+
+/*
+ * Locking stub: does nothing and always reports success (0).
+ */
+int
+tapdisk_vbd_lock(td_vbd_t *vbd)
+{
+       (void)vbd;
+
+       return 0;
+}
+
+/*
+ * Quiesce the request queue.
+ *
+ * With no pending requests the queue is marked TD_VBD_QUIESCED and 0 is
+ * returned.  Otherwise TD_VBD_QUIESCE_REQUESTED is set and -EAGAIN is
+ * returned.
+ */
+int
+tapdisk_vbd_quiesce_queue(td_vbd_t *vbd)
+{
+       if (list_empty(&vbd->pending_requests)) {
+               td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED);
+               td_flag_set(vbd->state, TD_VBD_QUIESCED);
+               return 0;
+       }
+
+       td_flag_set(vbd->state, TD_VBD_QUIESCE_REQUESTED);
+       return -EAGAIN;
+}
+
+/*
+ * Re-enable the request queue by dropping both the quiesce request and
+ * the quiesced state.  Always returns 0.
+ */
+int
+tapdisk_vbd_start_queue(td_vbd_t *vbd)
+{
+       td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED);
+       td_flag_clear(vbd->state, TD_VBD_QUIESCED);
+
+       return 0;
+}
+
+/*
+ * Mark the vbd as dead.  A quiesce is attempted first; its result is
+ * deliberately ignored.  Always returns 0.
+ */
+int
+tapdisk_vbd_kill_queue(td_vbd_t *vbd)
+{
+       (void)tapdisk_vbd_quiesce_queue(vbd);
+
+       td_flag_set(vbd->state, TD_VBD_DEAD);
+
+       return 0;
+}
+
+/*
+ * Open an image and, unless it is the last image of the vbd, validate
+ * it against the next image in the list (its parent).
+ *
+ * Returns 0 on success.  On a validation failure the image is closed
+ * again and the validation error is returned.
+ */
+static int
+tapdisk_vbd_open_image(td_vbd_t *vbd, td_image_t *image)
+{
+       int rc;
+
+       rc = td_open(image);
+       if (rc)
+               return rc;
+
+       /* the last image has no parent to validate against */
+       if (tapdisk_vbd_is_last_image(vbd, image))
+               return 0;
+
+       rc = td_validate_parent(image, tapdisk_vbd_next_image(image));
+       if (rc)
+               td_close(image);
+
+       return rc;
+}
+
+/*
+ * Close an image and open it again.
+ *
+ * The open is attempted up to TD_VBD_EIO_RETRIES times, sleeping
+ * TD_VBD_EIO_SLEEP after each attempt that fails with -EIO.  If the
+ * final result is an error the vbd is flagged TD_VBD_CLOSED.
+ */
+static int
+tapdisk_vbd_close_and_reopen_image(td_vbd_t *vbd, td_image_t *image)
+{
+       int attempts = 0;
+       int rc = 0;
+
+       td_close(image);
+
+       while (attempts < TD_VBD_EIO_RETRIES) {
+               rc = tapdisk_vbd_open_image(vbd, image);
+               if (rc != -EIO)
+                       break;
+
+               sleep(TD_VBD_EIO_SLEEP);
+               attempts++;
+       }
+
+       if (rc)
+               td_flag_set(vbd->state, TD_VBD_CLOSED);
+
+       return rc;
+}
+
+/*
+ * Pause the vbd.
+ *
+ * TD_VBD_PAUSE_REQUESTED is set unconditionally.  If the queue cannot
+ * be quiesced yet, the quiesce error is returned and the request stays
+ * flagged.  Otherwise the vdi is closed, the vbd becomes TD_VBD_PAUSED
+ * and 0 is returned.
+ */
+int
+tapdisk_vbd_pause(td_vbd_t *vbd)
+{
+       int rc;
+
+       td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED);
+
+       rc = tapdisk_vbd_quiesce_queue(vbd);
+       if (rc == 0) {
+               tapdisk_vbd_close_vdi(vbd);
+
+               td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
+               td_flag_set(vbd->state, TD_VBD_PAUSED);
+       }
+
+       return rc;
+}
+
+/*
+ * Resume a paused vbd, optionally under a new name.
+ *
+ * vbd:        the vbd to resume; must be in state TD_VBD_PAUSED.
+ * path:       if non-NULL, replaces vbd->name before the vdi is reopened.
+ * drivertype: not used by this function.
+ *
+ * Returns 0 on success, -EINVAL if the vbd is not paused or the new name
+ * cannot be copied, or the error from __tapdisk_vbd_open_vdi().
+ */
+int
+tapdisk_vbd_resume(td_vbd_t *vbd, const char *path, uint16_t drivertype)
+{
+       int i, err = 0;
+
+       if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) {
+               EPRINTF("resume request for unpaused vbd %s\n", vbd->name);
+               return -EINVAL;
+       }
+
+       if (path) {
+               /*
+                * duplicate before releasing the old name: a failed copy
+                * must not leave vbd->name NULL, and path may be
+                * vbd->name itself.
+                */
+               char *name = strdup(path);
+
+               if (!name) {
+                       EPRINTF("copying new vbd %s name failed\n", path);
+                       return -EINVAL;
+               }
+
+               free(vbd->name);
+               vbd->name = name;
+       }
+
+       /* retry the open while it keeps failing with -EIO */
+       for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
+               err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT);
+               if (err != -EIO)
+                       break;
+
+               sleep(TD_VBD_EIO_SLEEP);
+       }
+
+       if (err)
+               return err;
+
+       tapdisk_vbd_start_queue(vbd);
+       td_flag_clear(vbd->state, TD_VBD_PAUSED);
+       td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
+       tapdisk_vbd_check_state(vbd);
+
+       return 0;
+}
+
+/*
+ * Run the vbd state machine, then push any responses produced but not
+ * yet published on the ring and notify the other end with
+ * BLKTAP_IOCTL_KICK_FE.
+ *
+ * Returns the number of responses pushed; 0 when there is no ring or
+ * nothing to push.
+ */
+int
+tapdisk_vbd_kick(td_vbd_t *vbd)
+{
+       int unpushed;
+       td_ring_t *ring = &vbd->ring;
+
+       tapdisk_vbd_check_state(vbd);
+
+       if (!ring->sring)
+               return 0;
+
+       /* private producer index minus the published one */
+       unpushed = (ring->fe_ring.rsp_prod_pvt - ring->fe_ring.sring->rsp_prod);
+       if (unpushed == 0)
+               return 0;
+
+       vbd->kicked += unpushed;
+       RING_PUSH_RESPONSES(&ring->fe_ring);
+       ioctl(ring->fd, BLKTAP_IOCTL_KICK_FE, 0);
+
+       DBG(TLOG_INFO, "kicking %d: rec: 0x%08"PRIx64", ret: 0x%08"PRIx64", kicked: "
+           "0x%08"PRIx64"\n", unpushed, vbd->received, vbd->returned, vbd->kicked);
+
+       return unpushed;
+}
+
+/*
+ * Copy *rsp into the ring slot at the private response producer index
+ * and advance that index.  The response is not published here.
+ */
+static inline void
+tapdisk_vbd_write_response_to_ring(td_vbd_t *vbd, blkif_response_t *rsp)
+{
+       td_ring_t *ring = &vbd->ring;
+       blkif_response_t *slot;
+
+       slot = RING_GET_RESPONSE(&ring->fe_ring, ring->fe_ring.rsp_prod_pvt);
+       memcpy(slot, rsp, sizeof(*slot));
+
+       ring->fe_ring.rsp_prod_pvt++;
+}
+
+/*
+ * td_vbd_cb_t-shaped response callback: arg is the vbd whose ring
+ * receives the response.
+ */
+static void
+tapdisk_vbd_callback(void *arg, blkif_response_t *rsp)
+{
+       tapdisk_vbd_write_response_to_ring((td_vbd_t *)arg, rsp);
+}
+
+/*
+ * Build the ring response for a finished request and hand it to the
+ * vbd's registered callback.
+ *
+ * The response is constructed in place over the storage of vreq->req,
+ * so the request is copied to a temporary first and the id/operation
+ * fields are taken from that copy.
+ */
+static void
+tapdisk_vbd_make_response(td_vbd_t *vbd, td_vbd_request_t *vreq)
+{
+       blkif_request_t tmp;
+       blkif_response_t *rsp;
+
+       /* rsp aliases vreq->req: keep a copy of the request to read from */
+       tmp = vreq->req;
+       rsp = (blkif_response_t *)&vreq->req;
+
+       rsp->id = tmp.id;
+       rsp->operation = tmp.operation;
+       rsp->status = vreq->status;
+
+       DBG(TLOG_DBG, "writing req %d, sec 0x%08"PRIx64", res %d to ring\n",
+           (int)tmp.id, tmp.sector_number, vreq->status);
+
+       /* anything but BLKIF_RSP_OKAY is reported through ERR() */
+       if (rsp->status != BLKIF_RSP_OKAY)
+               ERR(EIO, "returning BLKIF_RSP %d", rsp->status);
+
+       vbd->returned++;
+       vbd->callback(vbd->argument, rsp);
+}
+
+/*
+ * Drive the vbd state machine one step:
+ *   - complete failed requests that have used up their retries,
+ *   - issue new and failed requests,
+ *   - turn completed requests into responses and recycle them,
+ *   - act on outstanding quiesce, pause and shutdown requests.
+ */
+void
+tapdisk_vbd_check_state(td_vbd_t *vbd)
+{
+       td_vbd_request_t *cur, *safe;
+
+       tapdisk_vbd_for_each_request(cur, safe, &vbd->failed_requests) {
+               if (cur->num_retries >= TD_VBD_MAX_RETRIES)
+                       tapdisk_vbd_complete_vbd_request(vbd, cur);
+       }
+
+       if (!(list_empty(&vbd->new_requests) &&
+             list_empty(&vbd->failed_requests)))
+               tapdisk_vbd_issue_requests(vbd);
+
+       tapdisk_vbd_for_each_request(cur, safe, &vbd->completed_requests) {
+               tapdisk_vbd_make_response(vbd, cur);
+               list_del(&cur->next);
+               tapdisk_vbd_initialize_vreq(cur);
+       }
+
+       if (td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED))
+               tapdisk_vbd_quiesce_queue(vbd);
+
+       if (td_flag_test(vbd->state, TD_VBD_PAUSE_REQUESTED))
+               tapdisk_vbd_pause(vbd);
+
+       /* must stay last: the close is the final use of vbd here */
+       if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED))
+               tapdisk_vbd_close(vbd);
+}
+
+/*
+ * Watchdog for pending requests.
+ *
+ * Does nothing when no request is pending.  If the vbd timestamp is at
+ * least TD_VBD_WATCHDOG_TIMEOUT seconds old, a warning is logged and
+ * the debug log is dropped; otherwise the server's maximum timeout is
+ * set to the time remaining until the watchdog would fire.
+ */
+void
+tapdisk_vbd_check_progress(td_vbd_t *vbd)
+{
+       int idle;
+       struct timeval now;
+
+       if (list_empty(&vbd->pending_requests))
+               return;
+
+       gettimeofday(&now, NULL);
+       idle = now.tv_sec - vbd->ts.tv_sec;
+
+       if (idle < TD_VBD_WATCHDOG_TIMEOUT) {
+               tapdisk_server_set_max_timeout(TD_VBD_WATCHDOG_TIMEOUT - idle);
+               return;
+       }
+
+       DBG(TLOG_WARN, "%s: watchdog timeout: pending requests "
+           "idle for %d seconds\n", vbd->name, idle);
+       tapdisk_vbd_drop_log(vbd);
+}
+
+/*
+ * request submission 
+ */
+
+/*
+ * Check that the queue can accept requests and, the first time through,
+ * reopen the first image with TD_OPEN_STRICT set.
+ *
+ * Returns -ENOSYS when the vbd has no images, -EAGAIN when the queue is
+ * not ready, the error from tapdisk_vbd_lock() if locking fails, and 0
+ * otherwise.  A failed reopen is only logged.
+ */
+static int
+tapdisk_vbd_check_queue(td_vbd_t *vbd)
+{
+       int err;
+       td_image_t *first;
+
+       if (list_empty(&vbd->images))
+               return -ENOSYS;
+
+       if (!tapdisk_vbd_queue_ready(vbd))
+               return -EAGAIN;
+
+       /* nothing more to do once the disks have been reopened */
+       if (vbd->reopened)
+               return 0;
+
+       if (td_flag_test(vbd->state, TD_VBD_LOCKING)) {
+               err = tapdisk_vbd_lock(vbd);
+               if (err)
+                       return err;
+       }
+
+       first = tapdisk_vbd_first_image(vbd);
+       td_flag_set(first->flags, TD_OPEN_STRICT);
+
+       if (!tapdisk_vbd_close_and_reopen_image(vbd, first)) {
+               DPRINTF("reopening disks succeeded\n");
+               vbd->reopened = 1;
+       } else
+               EPRINTF("reopening disks failed\n");
+
+       return 0;
+}
+
+/*
+ * Retire a request once nothing is outstanding for it.
+ *
+ * Nothing happens while the request is being submitted or still has
+ * sectors pending.  A request that failed, has retries left and belongs
+ * to a vbd that is neither dead nor shutting down goes to the failed
+ * list; every other request goes to the completed list.
+ */
+void
+tapdisk_vbd_complete_vbd_request(td_vbd_t *vbd, td_vbd_request_t *vreq)
+{
+       struct list_head *dest;
+
+       if (vreq->submitting || vreq->secs_pending)
+               return;
+
+       dest = &vbd->completed_requests;
+
+       if (vreq->status == BLKIF_RSP_ERROR &&
+           vreq->num_retries < TD_VBD_MAX_RETRIES &&
+           !td_flag_test(vbd->state, TD_VBD_DEAD) &&
+           !td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED))
+               dest = &vbd->failed_requests;
+
+       tapdisk_vbd_move_request(vreq, dest);
+}
+
+/*
+ * Compute the sector at which segment treq.sidx of a ring request
+ * starts: the request's first sector plus the sector counts of all
+ * segments preceding treq.sidx.
+ */
+static uint64_t
+tapdisk_vbd_breq_get_sector(blkif_request_t *breq, td_request_t treq)
+{
+       int i = 0;
+       int count;
+       uint64_t sector = breq->sector_number;
+
+       while (i < treq.sidx) {
+               count   = breq->seg[i].last_sect - breq->seg[i].first_sect + 1;
+               sector += count;
+               i++;
+       }
+
+       return sector;
+}
+
+/*
+ * Account for one finished td_request_t (treq) that belongs to vreq.
+ *
+ * res is normalized to a non-positive error value.  The pending sector
+ * counts of both the vbd and the vreq are reduced by treq.secs.  On
+ * error the vreq is flagged BLKIF_RSP_ERROR and the first error seen is
+ * kept in vreq->error; -EBUSY is neither counted in vbd->errors nor
+ * logged.  Finally the vreq is passed to
+ * tapdisk_vbd_complete_vbd_request(), which only acts once nothing is
+ * outstanding for it.
+ */
+static void
+__tapdisk_vbd_complete_td_request(td_vbd_t *vbd, td_vbd_request_t *vreq,
+                                 td_request_t treq, int res)
+{
+       int err;
+    td_image_t *image = treq.image;
+
+       /* accept both positive and negative error codes from the caller */
+       err = (res <= 0 ? res : -res);
+       vbd->secs_pending  -= treq.secs;
+       vreq->secs_pending -= treq.secs;
+
+       vreq->blocked = treq.blocked;
+
+       if (err) {
+               vreq->status = BLKIF_RSP_ERROR;
+               /* keep the first error recorded for this vreq */
+               vreq->error  = (vreq->error ? : err);
+               if (err != -EBUSY) {
+                       vbd->errors++;
+                       ERR(err, "req %"PRIu64": %s 0x%04x secs to "
+                           "0x%08"PRIx64, vreq->req.id,
+                           (treq.op == TD_OP_WRITE ? "write" : "read"),
+                           treq.secs, treq.sec);
+               }
+       } else {
+#ifdef MEMSHR
+               /*
+                * successful read from a read-only image: report it to
+                * memshr when the request carries a non-zero handle.
+                */
+               if (treq.op == TD_OP_READ
+                  && td_flag_test(image->flags, TD_OPEN_RDONLY)) {
+                       share_tuple_t hnd = treq.memshr_hnd;
+                       uint16_t uid  = image->memshr_id;
+                       blkif_request_t *breq = &vreq->req;
+                       uint64_t sec  = tapdisk_vbd_breq_get_sector(breq, treq);
+                       int secs = breq->seg[treq.sidx].last_sect -
+                           breq->seg[treq.sidx].first_sect + 1;
+
+                       if (hnd.handle != 0)
+                               memshr_vbd_complete_ro_request(hnd, uid,
+                                                               sec, secs);
+               }
+#endif
+       }
+
+       tapdisk_vbd_complete_vbd_request(vbd, vreq);
+}
+
+/*
+ * Pass treq on to the image that follows 'image' in the vbd's image
+ * list (its parent).
+ *
+ * If 'image' is the last one, the buffer is zero-filled and the request
+ * is completed successfully.  The part of a request that lies beyond
+ * the end of the parent is likewise zero-filled and completed; any
+ * remainder is queued on the parent.
+ *
+ * vreq->submitting is raised for the duration of the call, which keeps
+ * tapdisk_vbd_complete_vbd_request() from retiring the vreq from within
+ * a completion triggered here; completion is attempted at 'done' once
+ * no sectors remain pending.
+ */
+static void
+__tapdisk_vbd_reissue_td_request(td_vbd_t *vbd,
+                                td_image_t *image, td_request_t treq)
+{
+       td_image_t *parent;
+       td_vbd_request_t *vreq;
+
+       vreq = (td_vbd_request_t *)treq.private;
+       gettimeofday(&vreq->last_try, NULL);
+
+       vreq->submitting++;
+
+       /* no parent left to ask: the data reads back as zeros */
+       if (tapdisk_vbd_is_last_image(vbd, image)) {
+               memset(treq.buf, 0, treq.secs << SECTOR_SHIFT);
+               td_complete_request(treq, 0);
+               goto done;
+       }
+
+       parent     = tapdisk_vbd_next_image(image);
+       treq.image = parent;
+
+       /* return zeros for requests that extend beyond end of parent image */
+       if (treq.sec + treq.secs > parent->info.size) {
+               td_request_t clone  = treq;
+
+               /*
+                * clone covers the part past the end of the parent;
+                * treq is trimmed to the part inside it (possibly empty).
+                */
+               if (parent->info.size > treq.sec) {
+                       int secs    = parent->info.size - treq.sec;
+                       clone.sec  += secs;
+                       clone.secs -= secs;
+                       clone.buf  += (secs << SECTOR_SHIFT);
+                       treq.secs   = secs;
+               } else
+                       treq.secs   = 0;
+
+               memset(clone.buf, 0, clone.secs << SECTOR_SHIFT);
+               td_complete_request(clone, 0);
+
+               if (!treq.secs)
+                       goto done;
+       }
+
+       switch (treq.op) {
+       case TD_OP_WRITE:
+               td_queue_write(parent, treq);
+               break;
+
+       case TD_OP_READ:
+#ifdef MEMSHR
+               /*
+                * read-only parent: try memshr first and fall back to a
+                * regular read when it returns non-zero.
+                */
+               if(td_flag_test(parent->flags, TD_OPEN_RDONLY)) {
+                       int ret, seg = treq.sidx;
+                       blkif_request_t *breq = &vreq->req;
+        
+                       ret = memshr_vbd_issue_ro_request(treq.buf,
+                             breq->seg[seg].gref,
+                             parent->memshr_id,
+                             treq.sec,
+                             treq.secs,
+                             &treq.memshr_hnd);
+                       if(ret == 0) {
+                               /* Reset memshr handle. This'll prevent
+                                * memshr_vbd_complete_ro_request being called
+                                */
+                               treq.memshr_hnd.handle = 0;
+                               td_complete_request(treq, 0);
+                       } else
+                               td_queue_read(parent, treq);
+               } else
+#endif
+                       td_queue_read(parent, treq);
+               break;
+       }
+
+done:
+       vreq->submitting--;
+       if (!vreq->secs_pending)
+               tapdisk_vbd_complete_vbd_request(vbd, vreq);
+}
+
+/*
+ * Forward a request from its current image towards the next image of
+ * the vbd.  The vbd timestamp is refreshed.  When the queue is not
+ * ready the request is completed with -EIO instead.
+ */
+void
+tapdisk_vbd_forward_request(td_request_t treq)
+{
+       td_image_t *image = treq.image;
+       td_vbd_t *vbd     = (td_vbd_t *)image->private;
+
+       gettimeofday(&vbd->ts, NULL);
+
+       if (!tapdisk_vbd_queue_ready(vbd)) {
+               __tapdisk_vbd_complete_td_request(vbd,
+                                                 (td_vbd_request_t *)treq.private,
+                                                 treq, -EIO);
+               return;
+       }
+
+       __tapdisk_vbd_reissue_td_request(vbd, image, treq);
+}
+
+/*
+ * Completion callback installed on every td_request_t issued by the
+ * vbd: refreshes the vbd timestamp, logs the completion and performs
+ * the accounting in __tapdisk_vbd_complete_td_request().
+ */
+static void
+tapdisk_vbd_complete_td_request(td_request_t treq, int res)
+{
+       td_image_t *image      = treq.image;
+       td_vbd_t *vbd          = (td_vbd_t *)image->private;
+       td_vbd_request_t *vreq = (td_vbd_request_t *)treq.private;
+
+       gettimeofday(&vbd->ts, NULL);
+
+       DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" "
+           "secs 0x%04x buf %p op %d res %d\n", image->name,
+           (int)treq.id, treq.sidx, treq.sec, treq.secs,
+           treq.buf, (int)vreq->req.operation, res);
+
+       __tapdisk_vbd_complete_td_request(vbd, vreq, treq, res);
+}
+
+/*
+ * Submit every segment of vreq to the first image of the vbd.
+ *
+ * The vreq is moved to the pending list and marked as submitting.  Each
+ * segment becomes one td_request_t covering that segment's sectors,
+ * with its data buffer located through MMAP_VADDR() plus the segment's
+ * first_sect offset.
+ *
+ * Returns 0, or the error from tapdisk_image_check_ring_request(); if
+ * nothing is left pending on return, vreq->error is returned in place
+ * of 0 and the vreq is handed to tapdisk_vbd_complete_vbd_request().
+ */
+static int
+tapdisk_vbd_issue_request(td_vbd_t *vbd, td_vbd_request_t *vreq)
+{
+       char *page;
+       td_ring_t *ring;
+       td_image_t *image;
+       td_request_t treq;
+       uint64_t sector_nr;
+       blkif_request_t *req;
+       int i, err, id, nsects;
+
+       req       = &vreq->req;
+       id        = req->id;
+       ring      = &vbd->ring;
+       sector_nr = req->sector_number;
+       image     = tapdisk_vbd_first_image(vbd);
+
+       /* keeps the vreq from being retired while segments are queued */
+       vreq->submitting = 1;
+       gettimeofday(&vbd->ts, NULL);
+       gettimeofday(&vreq->last_try, NULL);
+       tapdisk_vbd_move_request(vreq, &vbd->pending_requests);
+
+#if 0
+       err = tapdisk_vbd_check_queue(vbd);
+       if (err)
+               goto fail;
+#endif
+
+       err = tapdisk_image_check_ring_request(image, req);
+       if (err)
+               goto fail;
+
+       for (i = 0; i < req->nr_segments; i++) {
+               nsects = req->seg[i].last_sect - req->seg[i].first_sect + 1;
+               page   = (char *)MMAP_VADDR(ring->vstart, 
+                                          (unsigned long)req->id, i);
+               page  += (req->seg[i].first_sect << SECTOR_SHIFT);
+
+               treq.id             = id;
+               treq.sidx           = i;
+               treq.blocked        = 0;
+               treq.buf            = page;
+               treq.sec            = sector_nr;
+               treq.secs           = nsects;
+               treq.image          = image;
+               treq.cb             = tapdisk_vbd_complete_td_request;
+               treq.cb_data        = NULL;
+               treq.private        = vreq;
+
+               DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" secs 0x%04x "
+                   "buf %p op %d\n", image->name, id, i, treq.sec, treq.secs,
+                   treq.buf, (int)req->operation);
+
+               /* account for the segment before it is queued */
+               vreq->secs_pending += nsects;
+               vbd->secs_pending  += nsects;
+
+               /*
+                * NOTE(review): operations other than READ/WRITE are not
+                * queued although the sectors were just counted as
+                * pending; presumably tapdisk_image_check_ring_request()
+                * rejects them beforehand -- confirm.
+                */
+               switch (req->operation) {
+               case BLKIF_OP_WRITE:
+                       treq.op = TD_OP_WRITE;
+                       td_queue_write(image, treq);
+                       break;
+
+               case BLKIF_OP_READ:
+                       treq.op = TD_OP_READ;
+                       td_queue_read(image, treq);
+                       break;
+               }
+
+               sector_nr += nsects;
+       }
+
+       err = 0;
+
+out:
+       vreq->submitting--;
+       /* nothing outstanding: report the recorded error and retire */
+       if (!vreq->secs_pending) {
+               err = (err ? : vreq->error);
+               tapdisk_vbd_complete_vbd_request(vbd, vreq);
+       }
+
+       return err;
+
+fail:
+       vreq->status = BLKIF_RSP_ERROR;
+       goto out;
+}
+
+/*
+ * Walk the failed list and retry each request that is due.
+ *
+ * Requests with sectors still pending are skipped.  When a shutdown has
+ * been requested, or a request has reached TD_VBD_MAX_RETRIES, the
+ * request is completed rather than retried.  Requests whose error was
+ * not -EBUSY are retried only once TD_VBD_RETRY_INTERVAL seconds have
+ * passed since their last try.
+ *
+ * TD_VBD_RETRY_NEEDED is updated to reflect whether failed requests
+ * remain.  Returns 0 or the first error from tapdisk_vbd_issue_request(),
+ * which also ends the walk.
+ */
+static int
+tapdisk_vbd_reissue_failed_requests(td_vbd_t *vbd)
+{
+       int err, give_up;
+       struct timeval now;
+       td_vbd_request_t *vreq, *tmp;
+
+       err = 0;
+       gettimeofday(&now, NULL);
+
+       tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) {
+               if (vreq->secs_pending)
+                       continue;
+
+               /* a requested shutdown overrides every retry rule */
+               give_up = !!td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED);
+
+               if (!give_up) {
+                       if (vreq->error != -EBUSY &&
+                           now.tv_sec - vreq->last_try.tv_sec < TD_VBD_RETRY_INTERVAL)
+                               continue;
+
+                       give_up = (vreq->num_retries >= TD_VBD_MAX_RETRIES);
+               }
+
+               if (give_up) {
+                       DBG(TLOG_INFO, "req %"PRIu64" retried %d times\n",
+                           vreq->req.id, vreq->num_retries);
+                       tapdisk_vbd_complete_vbd_request(vbd, vreq);
+                       continue;
+               }
+
+               /*
+                * never fail due to too many retries if we are blocked on a
+                * dependency
+                */
+               if (vreq->blocked) {
+                       vreq->blocked = 0;
+               } else {
+                       vbd->retries++;
+                       vreq->num_retries++;
+               }
+               vreq->error  = 0;
+               vreq->status = BLKIF_RSP_OKAY;
+               DBG(TLOG_DBG, "retry #%d of req %"PRIu64", "
+                   "sec 0x%08"PRIx64", nr_segs: %d\n", vreq->num_retries,
+                   vreq->req.id, vreq->req.sector_number,
+                   vreq->req.nr_segments);
+
+               err = tapdisk_vbd_issue_request(vbd, vreq);
+               if (err)
+                       break;
+       }
+
+       if (list_empty(&vbd->failed_requests))
+               td_flag_clear(vbd->state, TD_VBD_RETRY_NEEDED);
+       else
+               td_flag_set(vbd->state, TD_VBD_RETRY_NEEDED);
+
+       return err;
+}
+
+/*
+ * Issue every request on the new list, stopping at the first error.
+ * Returns 0 or that error.
+ */
+static int
+tapdisk_vbd_issue_new_requests(td_vbd_t *vbd)
+{
+       int rc = 0;
+       td_vbd_request_t *cur, *safe;
+
+       tapdisk_vbd_for_each_request(cur, safe, &vbd->new_requests) {
+               rc = tapdisk_vbd_issue_request(vbd, cur);
+               if (rc)
+                       break;
+       }
+
+       return rc;
+}
+
+/*
+ * Fail every request on the new and failed lists: each one is marked
+ * BLKIF_RSP_ERROR and moved to the completed list.  Always returns 0.
+ */
+static int
+tapdisk_vbd_kill_requests(td_vbd_t *vbd)
+{
+       unsigned int i;
+       struct list_head *queues[2];
+       td_vbd_request_t *victim, *safe;
+
+       /* new requests first, then failed ones */
+       queues[0] = &vbd->new_requests;
+       queues[1] = &vbd->failed_requests;
+
+       for (i = 0; i < 2; i++) {
+               tapdisk_vbd_for_each_request(victim, safe, queues[i]) {
+                       victim->status = BLKIF_RSP_ERROR;
+                       tapdisk_vbd_move_request(victim,
+                                                &vbd->completed_requests);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Issue outstanding requests: failed ones first, then new ones.
+ *
+ * On a dead vbd all new and failed requests are killed instead.
+ * Returns -EAGAIN when the queue is not ready, otherwise the first
+ * error encountered or 0.
+ */
+int
+tapdisk_vbd_issue_requests(td_vbd_t *vbd)
+{
+       int rc;
+
+       if (td_flag_test(vbd->state, TD_VBD_DEAD))
+               return tapdisk_vbd_kill_requests(vbd);
+
+       if (tapdisk_vbd_queue_ready(vbd)) {
+               rc = tapdisk_vbd_reissue_failed_requests(vbd);
+               if (rc)
+                       return rc;
+
+               return tapdisk_vbd_issue_new_requests(vbd);
+       }
+
+       return -EAGAIN;
+}
+
+/*
+ * Consume all requests currently published on the ring.
+ *
+ * Each request is copied into the request_list slot selected by its id
+ * and queued on the new list.  Requests whose id does not fit
+ * request_list are logged and dropped, since using such an id as an
+ * index would access memory outside the array.
+ */
+static void
+tapdisk_vbd_pull_ring_requests(td_vbd_t *vbd)
+{
+       int idx;
+       RING_IDX rp, rc;
+       td_ring_t *ring;
+       blkif_request_t *req;
+       td_vbd_request_t *vreq;
+
+       ring = &vbd->ring;
+       if (!ring->sring)
+               return;
+
+       rp   = ring->fe_ring.sring->req_prod;
+       xen_rmb();
+
+       for (rc = ring->fe_ring.req_cons; rc != rp; rc++) {
+               req = RING_GET_REQUEST(&ring->fe_ring, rc);
+               ++ring->fe_ring.req_cons;
+
+               /* the id comes from the shared ring: validate before use */
+               if ((uint64_t)req->id >= MAX_REQUESTS) {
+                       EPRINTF("%s: dropping request with invalid id "
+                               "%"PRIu64"\n", vbd->name, (uint64_t)req->id);
+                       continue;
+               }
+
+               idx  = req->id;
+               vreq = &vbd->request_list[idx];
+
+               ASSERT(list_empty(&vreq->next));
+               ASSERT(vreq->secs_pending == 0);
+
+               memcpy(&vreq->req, req, sizeof(blkif_request_t));
+               vbd->received++;
+               vreq->vbd = vbd;
+
+               tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+
+               DBG(TLOG_DBG, "%s: request %d \n", vbd->name, idx);
+       }
+}
+
+/*
+ * Handle a pause message from the ring.
+ *
+ * An already paused vbd yields 0.  Otherwise the queue is quiesced (an
+ * error is returned if that is not yet possible), the vdi is closed and
+ * BLKTAP2_IOCTL_PAUSE is issued; the vbd becomes TD_VBD_PAUSED only if
+ * that ioctl succeeds.  Returns the ioctl result.
+ */
+static int
+tapdisk_vbd_pause_ring(td_vbd_t *vbd)
+{
+       int rc;
+
+       if (td_flag_test(vbd->state, TD_VBD_PAUSED))
+               return 0;
+
+       td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED);
+
+       rc = tapdisk_vbd_quiesce_queue(vbd);
+       if (rc) {
+               EPRINTF("%s: ring pause request on active queue\n", vbd->name);
+               return rc;
+       }
+
+       tapdisk_vbd_close_vdi(vbd);
+
+       rc = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_PAUSE, 0);
+       if (!rc) {
+               td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
+               td_flag_set(vbd->state, TD_VBD_PAUSED);
+               return rc;
+       }
+
+       EPRINTF("%s: pause ioctl failed: %d\n", vbd->name, errno);
+       return rc;
+}
+
+/*
+ * Handle a resume message from the ring.
+ *
+ * The new disk string is fetched with BLKTAP2_IOCTL_REOPEN and parsed,
+ * the vbd is renamed to the parsed path, the queue is restarted and the
+ * vdi is reopened (retrying while the open fails with -EIO).  On
+ * success the image parameters are passed to BLKTAP2_IOCTL_SET_PARAMS
+ * and TD_VBD_PAUSED is cleared.  The outcome is always reported through
+ * BLKTAP2_IOCTL_RESUME, except on the two early returns.
+ *
+ * Returns 0 on success, -EINVAL if the vbd is not paused, -ENOMEM if
+ * the new name cannot be copied, or the failing call's error.
+ */
+static int
+tapdisk_vbd_resume_ring(td_vbd_t *vbd)
+{
+       int i, err, type;
+       char message[BLKTAP2_MAX_MESSAGE_LEN];
+       const char *path;
+       char *name;
+
+       memset(message, 0, sizeof(message));
+
+       if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) {
+               EPRINTF("%s: resume message for unpaused vbd\n", vbd->name);
+               return -EINVAL;
+       }
+
+       err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_REOPEN, &message);
+       if (err) {
+               EPRINTF("%s: resume ioctl failed: %d\n", vbd->name, errno);
+               return err;
+       }
+
+       err = tapdisk_parse_disk_type(message, &path, &type);
+       if (err) {
+               EPRINTF("%s: invalid resume string %s\n", vbd->name, message);
+               goto out;
+       }
+
+       /*
+        * duplicate before releasing the old name, so that a failed
+        * copy does not leave vbd->name NULL.
+        */
+       name = strdup(path);
+       if (!name) {
+               EPRINTF("resume malloc failed\n");
+               err = -ENOMEM;
+               goto out;
+       }
+
+       free(vbd->name);
+       vbd->name = name;
+
+       tapdisk_vbd_start_queue(vbd);
+
+       for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
+               err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT);
+               if (err != -EIO)
+                       break;
+
+               sleep(TD_VBD_EIO_SLEEP);
+       }
+
+out:
+       if (!err) {
+               image_t image;
+               struct blktap2_params params;
+
+               memset(&params, 0, sizeof(params));
+               tapdisk_vbd_get_image_info(vbd, &image);
+
+               params.sector_size = image.secsize;
+               params.capacity    = image.size;
+               snprintf(params.name, sizeof(params.name) - 1, "%s", message);
+
+               ioctl(vbd->ring.fd, BLKTAP2_IOCTL_SET_PARAMS, &params);
+               td_flag_clear(vbd->state, TD_VBD_PAUSED);
+       }
+
+       ioctl(vbd->ring.fd, BLKTAP2_IOCTL_RESUME, err);
+       return err;
+}
+
+/*
+ * Dispatch the control message found in the shared ring.
+ *
+ * Returns -EINVAL when there is no shared ring or the message is not
+ * recognized, 0 when there is no message, and otherwise the result of
+ * the pause, resume or close handler.
+ */
+static int
+tapdisk_vbd_check_ring_message(td_vbd_t *vbd)
+{
+       int rc = -EINVAL;
+
+       if (!vbd->ring.sring)
+               return rc;
+
+       switch (vbd->ring.sring->pvt.tapif_user.msg) {
+       case 0:
+               rc = 0;
+               break;
+
+       case BLKTAP2_RING_MESSAGE_PAUSE:
+               rc = tapdisk_vbd_pause_ring(vbd);
+               break;
+
+       case BLKTAP2_RING_MESSAGE_RESUME:
+               rc = tapdisk_vbd_resume_ring(vbd);
+               break;
+
+       case BLKTAP2_RING_MESSAGE_CLOSE:
+               rc = tapdisk_vbd_close(vbd);
+               break;
+
+       default:
+               break;
+       }
+
+       return rc;
+}
+
+/*
+ * Ring event handler: pulls the requests published on the ring, issues
+ * them and finally processes any control message.  'private' is the
+ * vbd the event belongs to.
+ */
+static void
+tapdisk_vbd_ring_event(event_id_t id, char mode, void *private)
+{
+       td_vbd_t *vbd = (td_vbd_t *)private;
+
+       tapdisk_vbd_pull_ring_requests(vbd);
+       tapdisk_vbd_issue_requests(vbd);
+
+       /*
+        * keep this call last: vbd may be destroyed after it
+        */
+       tapdisk_vbd_check_ring_message(vbd);
+}
+
+/*
+ * Returns the image at the head of the vbd's image list.  The list is
+ * not checked for emptiness here.
+ */
+td_image_t *
+tapdisk_vbd_first_image(td_vbd_t *vbd)
+{
+       struct list_head *head = vbd->images.next;
+
+       return list_entry(head, td_image_t, next);
+}
diff --git a/tools/blktap2/drivers/tapdisk-vbd.h b/tools/blktap2/drivers/tapdisk-vbd.h
new file mode 100644 (file)
index 0000000..be084b2
--- /dev/null
@@ -0,0 +1,207 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_VBD_H_
+#define _TAPDISK_VBD_H_
+
+#include <sys/time.h>
+#include <xenctrl.h>
+#include <xen/io/blkif.h>
+
+#include "tapdisk.h"
+#include "scheduler.h"
+#include "tapdisk-image.h"
+
+/* retries after which a failed request is completed instead of retried */
+#define TD_VBD_MAX_RETRIES          100
+/* seconds that must pass before a failed (non -EBUSY) request is retried */
+#define TD_VBD_RETRY_INTERVAL       1
+
+/* bits of td_vbd_handle.state */
+#define TD_VBD_DEAD                 0x0001 /* set by tapdisk_vbd_kill_queue() */
+#define TD_VBD_CLOSED               0x0002 /* set when reopening an image fails */
+#define TD_VBD_QUIESCE_REQUESTED    0x0004 /* quiesce deferred: requests pending */
+#define TD_VBD_QUIESCED             0x0008 /* queue quiesced; not ready for requests */
+#define TD_VBD_PAUSE_REQUESTED      0x0010 /* pause asked for but not yet complete */
+#define TD_VBD_PAUSED               0x0020 /* vdi closed; required for a resume */
+#define TD_VBD_SHUTDOWN_REQUESTED   0x0040 /* close deferred: requests pending */
+#define TD_VBD_LOCKING              0x0080 /* tapdisk_vbd_lock() is called before reopen */
+#define TD_VBD_RETRY_NEEDED         0x0100 /* failed requests remain queued */
+#define TD_VBD_LOG_DROPPED          0x0200 /* watchdog debug dump already done */
+
+typedef struct td_ring              td_ring_t;
+typedef struct td_vbd_request       td_vbd_request_t;
+typedef struct td_vbd_driver_info   td_vbd_driver_info_t;
+typedef struct td_vbd_handle        td_vbd_t;
+typedef void (*td_vbd_cb_t)        (void *, blkif_response_t *);
+
+/* Ring state of a vbd. */
+struct td_ring {
+       /* descriptor the BLKTAP ioctls are issued on */
+       int                         fd;
+       /* NOTE(review): presumably the ring mapping -- confirm */
+       char                       *mem;
+       /* shared ring; checked for NULL before the ring is used */
+       blkif_sring_t              *sring;
+       /* ring used to pull requests and to push responses */
+       blkif_back_ring_t           fe_ring;
+       /* base passed to MMAP_VADDR() to locate segment data */
+       unsigned long               vstart;
+};
+
+/* Bookkeeping for one ring request while it is handled by a vbd. */
+struct td_vbd_request {
+       /* copy of the ring request; reused as storage for the response */
+       blkif_request_t             req;
+       /* BLKIF_RSP_* status reported in the response */
+       int16_t                     status;
+
+       /* first error recorded for the request; 0 when none */
+       int                         error;
+       int                         blocked; /* blocked on a dependency */
+       /* non-zero while segments are being queued; defers completion */
+       int                         submitting;
+       /* sectors queued but not yet completed */
+       int                         secs_pending;
+       /* retries so far; compared against TD_VBD_MAX_RETRIES */
+       int                         num_retries;
+       /* time the request was last issued */
+       struct timeval              last_try;
+
+       /* vbd the request was received on */
+       td_vbd_t                   *vbd;
+       /* linkage on one of the vbd's request lists */
+       struct list_head            next;
+};
+
+/*
+ * Driver description with list linkage.
+ * NOTE(review): presumably an entry of td_vbd_handle.driver_stack;
+ * the users of this struct are not visible here -- confirm.
+ */
+struct td_vbd_driver_info {
+       char                       *params;
+       int                         type;
+       struct list_head            next;
+};
+
+/* State of one virtual block device (typedef'd as td_vbd_t). */
+struct td_vbd_handle {
+       /* heap-allocated name; freed and replaced on resume */
+       char                       *name;
+
+       td_uuid_t                   uuid;
+       int                         minor;
+
+       struct list_head            driver_stack;
+
+       int                         storage;
+
+       /* set once the first image has been reopened successfully */
+       uint8_t                     reopened;
+       uint8_t                     reactivated;
+       td_flag_t                   flags;
+       /* TD_VBD_* state bits */
+       td_flag_t                   state;
+
+       /* list of td_image_t; a request not satisfied by one image is
+        * forwarded to the next one */
+       struct list_head            images;
+
+       /* each request in use sits on exactly one of these lists */
+       struct list_head            new_requests;
+       struct list_head            pending_requests;
+       struct list_head            failed_requests;
+       struct list_head            completed_requests;
+
+       /* request slots, indexed by the id of the ring request */
+       td_vbd_request_t            request_list[MAX_REQUESTS];
+
+       td_ring_t                   ring;
+       event_id_t                  ring_event_id;
+
+       /* invoked with 'argument' for every response that is made */
+       td_vbd_cb_t                 callback;
+       void                       *argument;
+
+       struct list_head            next;
+
+       /* time of last request activity; read by the watchdog */
+       struct timeval              ts;
+
+       /* statistics */
+       uint64_t                    received;
+       uint64_t                    returned;
+       uint64_t                    kicked;
+       uint64_t                    secs_pending;
+       uint64_t                    retries;
+       uint64_t                    errors;
+};
+
+/*
+ * Iterate over the td_vbd_request_t entries of 'list'; 'tmp' is the
+ * scratch cursor required by list_for_each_entry_safe().
+ */
+#define tapdisk_vbd_for_each_request(vreq, tmp, list)                  \
+       list_for_each_entry_safe((vreq), (tmp), (list), next)
+
+/*
+ * Iterate over the td_image_t entries of vbd->images; 'tmp' is the
+ * scratch cursor required by list_for_each_entry_safe().
+ */
+#define tapdisk_vbd_for_each_image(vbd, image, tmp)                    \
+       list_for_each_entry_safe((image), (tmp), &(vbd)->images, next)
+
+/*
+ * Unlink a request from the list it is currently on and append it to
+ * the tail of 'dest'.
+ */
+static inline void
+tapdisk_vbd_move_request(td_vbd_request_t *vreq, struct list_head *dest)
+{
+       struct list_head *node = &vreq->next;
+
+       list_del(node);
+       INIT_LIST_HEAD(node);
+       list_add_tail(node, dest);
+}
+
+/*
+ * Append an image to the tail of the vbd's image list.
+ */
+static inline void
+tapdisk_vbd_add_image(td_vbd_t *vbd, td_image_t *image)
+{
+       struct list_head *images = &vbd->images;
+
+       list_add_tail(&image->next, images);
+}
+
+/*
+ * Returns the result of list_is_last() for 'image' on the vbd's image
+ * list.
+ */
+static inline int
+tapdisk_vbd_is_last_image(td_vbd_t *vbd, td_image_t *image)
+{
+       int last;
+
+       last = list_is_last(&image->next, &vbd->images);
+       return last;
+}
+
+td_image_t *
+tapdisk_vbd_first_image(td_vbd_t *vbd);
+
+/*
+ * Returns the image at the tail of the vbd's image list.  The list is
+ * not checked for emptiness here.
+ */
+static inline td_image_t *
+tapdisk_vbd_last_image(td_vbd_t *vbd)
+{
+       struct list_head *tail = vbd->images.prev;
+
+       return list_entry(tail, td_image_t, next);
+}
+
+/*
+ * Returns the image that follows 'image' on its list.  Whether 'image'
+ * is the last one is not checked here.
+ */
+static inline td_image_t *
+tapdisk_vbd_next_image(td_image_t *image)
+{
+       struct list_head *following = image->next.next;
+
+       return list_entry(following, td_image_t, next);
+}
+
+td_vbd_t *tapdisk_vbd_create(td_uuid_t);
+int tapdisk_vbd_initialize(td_uuid_t);
+void tapdisk_vbd_set_callback(td_vbd_t *, td_vbd_cb_t, void *);
+int tapdisk_vbd_parse_stack(td_vbd_t *vbd, const char *path);
+int tapdisk_vbd_open(td_vbd_t *, const char *, uint16_t,
+                    uint16_t, int, const char *, td_flag_t);
+int tapdisk_vbd_close(td_vbd_t *);
+void tapdisk_vbd_free(td_vbd_t *);
+void tapdisk_vbd_free_stack(td_vbd_t *);
+
+int tapdisk_vbd_open_stack(td_vbd_t *, uint16_t, td_flag_t);
+int tapdisk_vbd_open_vdi(td_vbd_t *, const char *,
+                        uint16_t, uint16_t, td_flag_t);
+void tapdisk_vbd_close_vdi(td_vbd_t *);
+
+int tapdisk_vbd_attach(td_vbd_t *, const char *, int);
+void tapdisk_vbd_detach(td_vbd_t *);
+
+void tapdisk_vbd_forward_request(td_request_t);
+
+int tapdisk_vbd_get_image_info(td_vbd_t *, image_t *);
+int tapdisk_vbd_queue_ready(td_vbd_t *);
+int tapdisk_vbd_retry_needed(td_vbd_t *);
+int tapdisk_vbd_quiesce_queue(td_vbd_t *);
+int tapdisk_vbd_start_queue(td_vbd_t *);
+int tapdisk_vbd_issue_requests(td_vbd_t *);
+int tapdisk_vbd_kill_queue(td_vbd_t *);
+int tapdisk_vbd_pause(td_vbd_t *);
+int tapdisk_vbd_resume(td_vbd_t *, const char *, uint16_t);
+int tapdisk_vbd_kick(td_vbd_t *);
+void tapdisk_vbd_check_state(td_vbd_t *);
+void tapdisk_vbd_check_progress(td_vbd_t *);
+void tapdisk_vbd_debug(td_vbd_t *);
+
+void tapdisk_vbd_complete_vbd_request(td_vbd_t *, td_vbd_request_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk.h b/tools/blktap2/drivers/tapdisk.h
new file mode 100644 (file)
index 0000000..66d508e
--- /dev/null
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * 
+ * Some notes on the tap_disk interface:
+ * 
+ * tap_disk aims to provide a generic interface to easily implement new 
+ * types of image accessors.  The structure-of-function-calls is similar
+ * to disk interfaces used in qemu/denali/etc, with the significant 
+ * difference being the expectation of asynchronous rather than synchronous 
+ * I/O.  The asynchronous interface is intended to allow lots of requests to
+ * be pipelined through a disk, without the disk requiring any of its own
+ * threads of control.  As such, a batch of requests is delivered to the disk
+ * using:
+ * 
+ *    td_queue_[read,write]()
+ * 
+ * and passing in a completion callback, which the disk is responsible for 
+ * tracking.  Disks should transform these requests as necessary and return
+ * the resulting iocbs to tapdisk using td_prep_[read,write]() and 
+ * td_queue_tiocb().
+ *
+ * NOTE: tapdisk uses the number of sectors submitted per request as a 
+ * ref count.  Plugins must use the callback function to communicate the
+ * completion -- or error -- of every sector submitted to them.
+ *
+ * td_get_parent_id returns:
+ *     0 if parent id successfully retrieved
+ *     TD_NO_PARENT if no parent exists
+ *     -errno on error
+ */
+
+#ifndef _TAPDISK_H_
+#define _TAPDISK_H_
+
+#include <time.h>
+#include <stdint.h>
+
+#include "list.h"
+#include "blktaplib.h"
+#include "tapdisk-log.h"
+#include "tapdisk-utils.h"
+
+#ifdef MEMSHR
+#include "memshr.h"
+#endif
+
+/*
+ * Logging helpers; all of them end up in syslog():
+ *   DPRINTF - message at LOG_INFO.
+ *   EPRINTF - message at LOG_ERR, prefixed with "tap-err:<function>: ".
+ *   PERROR  - EPRINTF with ": <strerror(errno)>" appended.
+ * The format argument is pasted next to string literals, so it must itself
+ * be a string literal for EPRINTF and PERROR.
+ */
+#define DPRINTF(_f, _a...)           syslog(LOG_INFO, _f, ##_a)
+#define EPRINTF(_f, _a...)           syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a)
+#define PERROR(_f, _a...)            EPRINTF(_f ": %s", ##_a, strerror(errno))
+
+#define MAX_SEGMENTS_PER_REQ         11
+#define SECTOR_SHIFT                 9
+#define DEFAULT_SECTOR_SIZE          512
+
+#define TAPDISK_DATA_REQUESTS       (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)
+
+//#define BLK_NOT_ALLOCATED            (-99)
+#define TD_NO_PARENT                 1
+
+#define MAX_RAMDISK_SIZE             1024000 /*500MB disk limit*/
+
+#define TD_OP_READ                   0
+#define TD_OP_WRITE                  1
+
+#define TD_OPEN_QUIET                0x00001
+#define TD_OPEN_QUERY                0x00002
+#define TD_OPEN_RDONLY               0x00004
+#define TD_OPEN_STRICT               0x00008
+#define TD_OPEN_SHAREABLE            0x00010
+#define TD_OPEN_ADD_CACHE            0x00020
+#define TD_OPEN_VHD_INDEX            0x00040
+#define TD_OPEN_LOG_DIRTY            0x00080
+
+#define TD_CREATE_SPARSE             0x00001
+#define TD_CREATE_MULTITYPE          0x00002
+
+#define td_flag_set(word, flag)      ((word) |= (flag))
+#define td_flag_clear(word, flag)    ((word) &= ~(flag))
+#define td_flag_test(word, flag)     ((word) & (flag))
+
+typedef uint16_t                     td_uuid_t;
+typedef uint32_t                     td_flag_t;
+typedef uint64_t                     td_sector_t;
+typedef struct td_disk_id            td_disk_id_t;
+typedef struct td_disk_info          td_disk_info_t;
+typedef struct td_request            td_request_t;
+typedef struct td_driver_handle      td_driver_t;
+typedef struct td_image_handle       td_image_t;
+
+/*
+ * Name / driver-type pair.  This is the type the td_get_parent_id hook of
+ * struct tap_disk takes as its output argument.
+ * NOTE(review): who owns (and frees) 'name' is not visible in this header.
+ */
+struct td_disk_id {
+       char                        *name;
+       int                          drivertype;
+};
+
+/*
+ * Size information for a disk.
+ * NOTE(review): 'size' is presumably a sector count (it is a td_sector_t)
+ * and 'sector_size' the sector size in bytes; the meaning of 'info' is not
+ * visible in this header -- confirm against the drivers that fill this in.
+ */
+struct td_disk_info {
+       td_sector_t                  size;
+       uint64_t                     sector_size;
+       uint32_t                     info;
+};
+
+/*
+ * A single I/O request.  It is passed *by value* to the td_queue_read /
+ * td_queue_write hooks and to the completion callback (td_callback_t).
+ *
+ * 'cb' is declared as void * because td_callback_t takes a td_request_t by
+ * value and therefore can only be declared after this structure.
+ *
+ * NOTE(review): 'op' is presumably TD_OP_READ or TD_OP_WRITE, and
+ * 'sec' / 'secs' the starting sector and sector count -- confirm.
+ */
+struct td_request {
+       int                          op;
+       char                        *buf;
+       td_sector_t                  sec;
+       int                          secs;
+
+       uint8_t                      blocked; /* blocked on a dependency */
+
+       td_image_t                  *image;
+
+       void * /*td_callback_t*/     cb;
+       void                        *cb_data;
+
+       uint64_t                     id;
+       int                          sidx;
+       void                        *private;
+    
+#ifdef MEMSHR
+       share_tuple_t                memshr_hnd;
+#endif
+};
+
+/* 
+ * Prototype of the callback to activate as requests complete.
+ */
+typedef void (*td_callback_t)(td_request_t, int);
+
+/* 
+ * Structure describing the interface to a virtual disk implementation.
+ * See note at the top of this file describing this interface.
+ */
+struct tap_disk {
+       const char                  *disk_type;
+       td_flag_t                    flags;
+       /* NOTE(review): presumably the size of the driver's private state -- confirm */
+       int                          private_data_size;
+       int (*td_open)               (td_driver_t *, const char *, td_flag_t);
+       int (*td_close)              (td_driver_t *);
+       /* returns 0, TD_NO_PARENT, or -errno (see the notes at the top of this file) */
+       int (*td_get_parent_id)      (td_driver_t *, td_disk_id_t *);
+       int (*td_validate_parent)    (td_driver_t *, td_driver_t *, td_flag_t);
+       /*
+        * Asynchronous: both hooks return void; completion (or error) is
+        * reported through the callback carried in the request.
+        */
+       void (*td_queue_read)        (td_driver_t *, td_request_t);
+       void (*td_queue_write)       (td_driver_t *, td_request_t);
+       void (*td_debug)             (td_driver_t *);
+};
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk2.c b/tools/blktap2/drivers/tapdisk2.c
new file mode 100644 (file)
index 0000000..aebd861
--- /dev/null
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#ifdef MEMSHR
+#include <memshr.h>
+#endif
+
+#include "tapdisk.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-server.h"
+#include "tapdisk-control.h"
+
+/*
+ * Print the command-line synopsis to stderr and terminate the process with
+ * exit status 'err'.  Never returns.
+ *
+ * The synopsis mirrors the getopt() string "s:Dh" parsed by main(): -h for
+ * help, -D to stay in the foreground, -s <domid> for memory sharing.
+ */
+static void
+usage(const char *app, int err)
+{
+       fprintf(stderr, "usage: %s [-h] [-D] [-s domid]\n", app);
+       exit(err);
+}
+
+/*
+ * tapdisk2 entry point.
+ *
+ * Parses the options, initialises the server, daemonises unless -D was
+ * given, opens the control socket and prints its path on stdout, then hands
+ * over to tapdisk_server_run().  The order of these steps matters: stdout is
+ * only redirected to /dev/null after the control path has been printed.
+ *
+ * Returns the first non-zero error encountered, or the result of
+ * tapdisk_server_run().
+ */
+int
+main(int argc, char *argv[])
+{
+       char *control;
+       int c, err, nodaemon;
+
+       control  = NULL;
+       nodaemon = 0;
+
+       while ((c = getopt(argc, argv, "s:Dh")) != -1) {
+               switch (c) {
+               case 'D':
+                       nodaemon = 1;
+                       break;
+               case 'h':
+                       usage(argv[0], 0);
+                       break;
+               case 's':
+                       /* only usable when built with MEMSHR; otherwise fatal */
+#ifdef MEMSHR
+                       memshr_set_domid(atoi(optarg));
+#else
+                       fprintf(stderr, "MEMSHR support not compiled in.\n");
+                       exit(EXIT_FAILURE);
+#endif
+                       break;
+               default:
+                       /* usage() exits, so there is no fall-through here */
+                       usage(argv[0], EINVAL);
+               }
+       }
+
+       /* no positional arguments are accepted */
+       if (optind != argc)
+               usage(argv[0], EINVAL);
+
+       if (chdir("/")) {
+               DPRINTF("failed to chdir(/): %d\n", errno);
+               err = 1;
+               goto out;
+       }
+
+       tapdisk_start_logging("tapdisk2");
+
+       err = tapdisk_server_init();
+       if (err) {
+               DPRINTF("failed to initialize server: %d\n", err);
+               goto out;
+       }
+
+       if (!nodaemon) {
+               /*
+                * noclose == 1: keep stdio open for now, the control socket
+                * path still has to be written to stdout below.
+                */
+               err = daemon(0, 1);
+               if (err) {
+                       DPRINTF("failed to daemonize: %d\n", errno);
+                       goto out;
+               }
+       }
+
+       err = tapdisk_control_open(&control);
+       if (err) {
+               DPRINTF("failed to open control socket: %d\n", err);
+               goto out;
+       }
+
+       /* report the control socket path to whoever started us */
+       fprintf(stdout, "%s\n", control);
+       fflush(stdout);
+
+       if (!nodaemon) {
+               int fd;
+
+               /* the path has been reported: now detach stdin/stdout/stderr */
+               fd = open("/dev/null", O_RDWR);
+               if (fd != -1) {
+                       dup2(fd, STDIN_FILENO);
+                       dup2(fd, STDOUT_FILENO);
+                       dup2(fd, STDERR_FILENO);
+                       if (fd > 2)
+                               close(fd);
+               }
+       }
+
+       err = tapdisk_server_complete();
+       if (err) {
+               DPRINTF("failed to complete server: %d\n", err);
+               goto out;
+       }
+
+       err = tapdisk_server_run();
+
+       /*
+        * NOTE(review): 'out' is also reached when chdir() fails, i.e. before
+        * tapdisk_start_logging() and tapdisk_control_open() have run --
+        * confirm that the two calls below tolerate that.
+        */
+out:
+       tapdisk_control_close();
+       tapdisk_stop_logging();
+       return err;
+}
diff --git a/tools/blktap2/drivers/td.c b/tools/blktap2/drivers/td.c
new file mode 100644 (file)
index 0000000..f920acd
--- /dev/null
@@ -0,0 +1,691 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+#include "tapdisk-utils.h"
+
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stdout, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/*
+ * Per-VDI fields known to td-util.  TD_FIELD_INVALID doubles as the number
+ * of valid fields: it sizes td_vdi_fields[] and bounds the lookups over it.
+ */
+typedef enum {
+       TD_FIELD_HIDDEN  = 0,
+       TD_FIELD_INVALID = 1
+} td_field_t;
+
+/* Maps a field's command-line name to its identifier. */
+struct vdi_field {
+       char       *name;
+       td_field_t  id;
+};
+
+static struct vdi_field td_vdi_fields[TD_FIELD_INVALID] = {
+       { .id = TD_FIELD_HIDDEN, .name = "hidden" }
+};
+
+/*
+ * Sub-commands understood by td-util.  The commented-out entries are
+ * commands that are currently disabled; they are commented out consistently
+ * here, in commands[] and in main()'s dispatch switch.  TD_CMD_INVALID
+ * doubles as the number of entries in commands[].
+ */
+typedef enum {
+       TD_CMD_CREATE    = 0,
+       TD_CMD_SNAPSHOT,
+/*     TD_CMD_COALESCE,       */
+       TD_CMD_QUERY,
+/*     TD_CMD_RESIZE,         */
+       TD_CMD_SET,
+/*     TD_CMD_REPAIR,         */
+/*     TD_CMD_FILL,           */
+/*     TD_CMD_READ,           */
+       TD_CMD_INVALID,
+} td_command_t;
+
+/*
+ * One sub-command: identifier, command-line name, and whether a TYPE
+ * argument must follow the command name (checked in main()).
+ */
+struct command {
+       td_command_t  id;
+       char         *name;
+       int           needs_type;
+};
+
+struct command commands[TD_CMD_INVALID] = {
+       { .id = TD_CMD_CREATE,   .name = "create",   .needs_type = 1 },
+       { .id = TD_CMD_SNAPSHOT, .name = "snapshot", .needs_type = 1 },
+/*     { .id = TD_CMD_COALESCE, .name = "coalesce", .needs_type = 1 },    */
+       { .id = TD_CMD_QUERY,    .name = "query",    .needs_type = 1 },
+/*     { .id = TD_CMD_RESIZE,   .name = "resize",   .needs_type = 1 },    */
+       { .id = TD_CMD_SET,      .name = "set",      .needs_type = 1 },
+/*     { .id = TD_CMD_REPAIR,   .name = "repair",   .needs_type = 1 },    */
+/*     { .id = TD_CMD_FILL,     .name = "fill",     .needs_type = 1 },    */
+/*     { .id = TD_CMD_READ,     .name = "read",     .needs_type = 1 },    */
+};
+
+/*
+ * Image types td-util can operate on.  The enum value is the index of the
+ * type's name in td_disk_types[] (get_driver_type() returns that index);
+ * TD_TYPE_INVALID doubles as the number of entries.
+ */
+typedef enum {
+       TD_TYPE_VHD         = 0,
+       TD_TYPE_AIO,
+       TD_TYPE_INVALID,
+} td_disk_t;
+
+const char *td_disk_types[TD_TYPE_INVALID] = {
+       "vhd",
+       "aio",
+};
+
+/*
+ * Print "COMMAND := { a | b | ... }" to stderr from the names in commands[].
+ * The do-block declares its own 'int i', which shadows any 'i' in the
+ * calling function for the duration of the macro.
+ */
+#define print_commands()                                               \
+       do {                                                            \
+               int i;                                                  \
+               fprintf(stderr, "COMMAND := { ");                       \
+               fprintf(stderr, "%s", commands[0].name);                \
+               for (i = 1; i < TD_CMD_INVALID; i++)                    \
+                       fprintf(stderr, " | %s", commands[i].name);     \
+               fprintf(stderr, " }\n");                                \
+       } while (0)
+
+#define print_disk_types()                                             \
+       do {                                                            \
+               int i;                                                  \
+               fprintf(stderr, "TYPE := { ");                          \
+               fprintf(stderr, "%s", td_disk_types[0]);                \
+               for (i = 1; i < TD_TYPE_INVALID; i++)                   \
+                       fprintf(stderr, " | %s", td_disk_types[i]);     \
+               fprintf(stderr, " }\n");                                \
+       } while (0);
+
+/*
+ * Print "FIELD := { a | b | ... }" to stderr from the names in
+ * td_vdi_fields[].  With the single field currently defined the loop body
+ * never runs.  The do-block declares its own 'int i', shadowing any
+ * caller 'i'.
+ */
+#define print_field_names()                                            \
+       do {                                                            \
+               int i;                                                  \
+               fprintf(stderr, "FIELD := { ");                         \
+               fprintf(stderr, "%s", td_vdi_fields[0].name);           \
+               for (i = 1; i < TD_FIELD_INVALID; i++)                  \
+                       fprintf(stderr, " | %s", td_vdi_fields[i].name); \
+               fprintf(stderr, " }\n");                                \
+       } while (0)
+
+/*
+ * Write the td-util banner, the synopsis and the COMMAND / TYPE lists to
+ * stderr, then terminate the process with exit(-1).  Never returns.
+ */
+void
+help(void)
+{
+       fputs("Tapdisk Utilities: v1.0.0\n"
+             "usage: td-util COMMAND [TYPE] [OPTIONS]\n", stderr);
+       print_commands();
+       print_disk_types();
+       exit(-1);
+}
+
+/*
+ * Look up a sub-command by its exact (case-sensitive) name.
+ * Returns a pointer into commands[], or NULL when no entry matches.
+ */
+struct command *
+get_command(char *command)
+{
+       struct command *entry;
+
+       for (entry = commands; entry != commands + TD_CMD_INVALID; entry++) {
+               if (strcmp(command, entry->name) == 0)
+                       return entry;
+       }
+
+       return NULL;
+}
+
+/*
+ * Look up a VDI field by its exact (case-sensitive) name.
+ * Returns a pointer into td_vdi_fields[], or NULL when no entry matches.
+ */
+struct vdi_field *
+get_field(char *field)
+{
+       int idx = 0;
+
+       while (idx < TD_FIELD_INVALID) {
+               struct vdi_field *entry = &td_vdi_fields[idx++];
+
+               if (strcmp(field, entry->name) == 0)
+                       return entry;
+       }
+
+       return NULL;
+}
+
+/*
+ * Translate a TYPE name into its index in td_disk_types[].
+ *
+ * Returns the index (>= 0) on success, -ENAMETOOLONG when the name is 25
+ * characters or longer, and -TD_TYPE_INVALID when no type has that name.
+ */
+int
+get_driver_type(char *type)
+{
+       const char **entry;
+
+       if (strnlen(type, 25) >= 25)
+               return -ENAMETOOLONG;
+
+       for (entry = td_disk_types;
+            entry != td_disk_types + TD_TYPE_INVALID; entry++) {
+               if (strcmp(type, *entry) == 0)
+                       return (int)(entry - td_disk_types);
+       }
+
+       return -TD_TYPE_INVALID;
+}
+
+/*
+ * td-util create TYPE [-h] [-r] [-b] <SIZE(MB)> <FILENAME>
+ *
+ * Create a new image of SIZE megabytes.
+ *   TD_TYPE_VHD: delegates to vhd_util_create(), forwarding -r and -b.
+ *   other types: only fully allocated images are supported, so -r is
+ *                mandatory; the file is written out as zeroes, 1MB at a
+ *                time, and removed again if a write fails.
+ *
+ * SIZE must be a plain decimal number; anything else is rejected with the
+ * usage message.
+ *
+ * Returns 0 on success, a positive errno value on failure, or whatever
+ * vhd_util_create() returns for VHD images.
+ */
+int
+td_create(int type, int argc, char *argv[])
+{
+       const ssize_t mb = 1 << 20;
+       uint64_t size, done;
+       char *name, *end;
+       void *buf;
+       long align;
+       int c, fd, err, sparse = 1, fixedsize = 0;
+
+       while ((c = getopt(argc, argv, "hrb")) != -1) {
+               switch(c) {
+               case 'r':
+                       sparse = 0;
+                       break;
+               case 'b':
+                       fixedsize = 1;
+                       break;
+               default:
+                       /* getopt() returned '?'; the offending char is in optopt */
+                       fprintf(stderr, "Unknown option %c\n", (char)optopt);
+                       /* fall through */
+               case 'h':
+                       goto usage;
+               }
+       }
+
+       if (optind != (argc - 2))
+               goto usage;
+
+       /*
+        * strtoull() would accept leading whitespace and a sign, so insist
+        * on a leading digit and on the whole argument being consumed.
+        */
+       if (argv[optind][0] < '0' || argv[optind][0] > '9')
+               goto badsize;
+
+       errno = 0;
+       size  = strtoull(argv[optind], &end, 10);
+       if (errno || *end != '\0')
+               goto badsize;
+
+       name = argv[++optind];
+
+       if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               fprintf(stderr, "Device name too long\n");
+               return ENAMETOOLONG;
+       }
+
+       if (type == TD_TYPE_VHD) {
+               int cargc = 0;
+               char sbuf[32], *cargv[10];
+
+               /* vhd_util_create() takes the size in MB, as given */
+               memset(cargv, 0, sizeof(cargv));
+               snprintf(sbuf, sizeof(sbuf), "%"PRIu64, size);
+               cargv[cargc++] = "create";
+               cargv[cargc++] = "-n";
+               cargv[cargc++] = name;
+               cargv[cargc++] = "-s";
+               cargv[cargc++] = sbuf;
+               if (!sparse)
+                       cargv[cargc++] = "-r";
+               if (fixedsize)
+                       cargv[cargc++] = "-b";
+
+               return vhd_util_create(cargc, cargv);
+       }
+
+       /* generic create */
+       if (sparse) {
+               fprintf(stderr, "Cannot create sparse %s image\n",
+                       td_disk_types[type]);
+               return EINVAL;
+       }
+
+       /*
+        * The file is opened with O_DIRECT, which requires the user buffer
+        * to be suitably aligned; a plain calloc() buffer is not guaranteed
+        * to be, so allocate a page-aligned one and zero it by hand.
+        */
+       align = sysconf(_SC_PAGESIZE);
+       if (align <= 0)
+               align = 4096;
+
+       err = posix_memalign(&buf, (size_t)align, mb);
+       if (err)
+               return err;
+       memset(buf, 0, mb);
+
+       fd = open(name, O_WRONLY | O_DIRECT | O_CREAT | O_TRUNC, 0644);
+       if (fd == -1) {
+               err = errno;    /* capture before free() can disturb it */
+               free(buf);
+               return err;
+       }
+
+       /* 'size' is in MB and each write is 1MB, so it is also the count */
+       for (done = 0; done < size; done++)
+               if (write(fd, buf, mb) != mb) {
+                       close(fd);
+                       unlink(name);
+                       free(buf);
+                       return EIO;
+               }
+
+       close(fd);
+       free(buf);
+       return 0;
+
+ badsize:
+       fprintf(stderr, "Invalid size '%s'\n", argv[optind]);
+ usage:
+       fprintf(stderr, "usage: td-util create %s [-h help] [-r reserve] "
+               "[-b file_is_fixed_size] <SIZE(MB)> <FILENAME>\n",
+               td_disk_types[type]);
+       return EINVAL;
+}
+
+/*
+ * td-util snapshot TYPE [-h] [-m] [-b] [-l limit] <FILENAME> <BACKING_FILENAME>
+ *
+ * Create FILENAME as a snapshot on top of BACKING_FILENAME by delegating to
+ * vhd_util_snapshot(); only TD_TYPE_VHD is supported.  -b, -m and -l are
+ * forwarded unchanged.
+ *
+ * Returns 0 after -h, a positive errno value for errors detected here, or
+ * whatever vhd_util_snapshot() returns.
+ */
+int
+td_snapshot(int type, int argc, char *argv[])
+{
+       char *cargv[10];
+       int c, err, cargc;
+       struct stat stats;
+       char *name, *backing, *limit = NULL;
+       int fixedsize = 0, rawparent = 0;
+
+       if (type != TD_TYPE_VHD) {
+               fprintf(stderr, "Cannot create snapshot of %s image type\n",
+                       td_disk_types[type]);
+               return EINVAL;
+       }
+
+       while ((c = getopt(argc, argv, "hbml:")) != -1) {
+               switch(c) {
+               case 'b':
+                       fixedsize = 1;
+                       break;
+               case 'm':
+                       rawparent = 1;
+                       break;
+               case 'l':
+                       limit = optarg;
+                       break;
+               case 'h':
+                       err = 0;
+                       goto usage;
+               default:
+                       err = EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (optind != (argc - 2)) {
+               err = EINVAL;
+               goto usage;
+       }
+
+       name    = argv[optind++];
+       backing = argv[optind++];
+
+       if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN ||
+           strnlen(backing, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               fprintf(stderr, "Device name too long\n");
+               return ENAMETOOLONG;
+       }
+
+       if (stat(backing, &stats) == -1) {
+               /* capture errno before fprintf() gets a chance to change it */
+               err = errno;
+               fprintf(stderr, "File %s not found\n", backing);
+               return err;
+       }
+
+       /* at most 9 of the 10 slots are used, so cargv stays NULL-terminated */
+       cargc = 0;
+       memset(cargv, 0, sizeof(cargv));
+       cargv[cargc++] = "snapshot";
+       cargv[cargc++] = "-n";
+       cargv[cargc++] = name;
+       cargv[cargc++] = "-p";
+       cargv[cargc++] = backing;
+       if (fixedsize)
+               cargv[cargc++] = "-b";
+       if (rawparent)
+               cargv[cargc++] = "-m";
+       if (limit) {
+               cargv[cargc++] = "-l";
+               cargv[cargc++] = limit;
+       }
+       return vhd_util_snapshot(cargc, cargv);
+
+ usage:
+       fprintf(stderr, "usage: td-util snapshot %s [-h help] [-m parent_raw] "
+               "[-b file_is_fixed_size] [-l snapshot depth limit] "
+               "<FILENAME> <BACKING_FILENAME>\n", td_disk_types[type]);
+       return err;
+}
+
+/*
+ * td-util coalesce TYPE [-h] <FILENAME>
+ *
+ * Coalesce a VHD image by delegating to vhd_util_coalesce(); only
+ * TD_TYPE_VHD is supported.
+ *
+ * Returns a positive errno value for errors detected here, otherwise the
+ * result of vhd_util_coalesce() (a failure is also reported on stdout).
+ */
+int
+td_coalesce(int type, int argc, char *argv[])
+{
+       int c, ret, cargc;
+       char *name, *cargv[4];  /* 3 arguments + NULL terminator */
+
+       if (type != TD_TYPE_VHD) {
+               fprintf(stderr, "Cannot coalesce %s image type\n",
+                       td_disk_types[type]);
+               return EINVAL;
+       }
+
+       while ((c = getopt(argc, argv, "h")) != -1) {
+               switch(c) {
+               default:
+                       /* getopt() returned '?'; the offending char is in optopt */
+                       fprintf(stderr, "Unknown option %c\n", (char)optopt);
+                       /* fall through */
+               case 'h':
+                       goto usage;
+               }
+       }
+
+       if (optind != (argc - 1))
+               goto usage;
+
+       name = argv[optind++];
+
+       if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               fprintf(stderr, "Device name too long\n");
+               return ENAMETOOLONG;
+       }
+
+       cargc = 0;
+       memset(cargv, 0, sizeof(cargv));
+       cargv[cargc++] = "coalesce";
+       cargv[cargc++] = "-n";
+       cargv[cargc++] = name;
+       ret = vhd_util_coalesce(cargc, cargv);
+       if (ret)
+               printf("coalesce failed: %d\n", ret);
+
+       return ret;
+
+ usage:
+       fprintf(stderr, "usage: td-util coalesce %s [-h help] "
+               "<FILENAME>\n", td_disk_types[type]);
+       return EINVAL;
+}
+
+/*
+ * td-util query TYPE [-h] [-v] [-p] [-f] [-d] <FILENAME>
+ *
+ * Print selected properties of an image to stdout:
+ *   -v  virtual size in MB
+ *   -p  parent image name, or "<FILENAME> has no parent"
+ *   -f  VDI fields (currently only "hidden")
+ *   -d  snapshot chain depth (VHD images only)
+ *
+ * Returns 0 on success (and after -h); otherwise the error of the first
+ * step that failed.  The sign of that value is not uniform; main()
+ * normalises it before using it as the exit status.
+ */
+int
+td_query(int type, int argc, char *argv[])
+{
+       char *name;
+       int c, size = 0, parent = 0, fields = 0, depth = 0, err = 0;
+
+       while ((c = getopt(argc, argv, "hvpfd")) != -1) {
+               switch(c) {
+               case 'v':
+                       size = 1;
+                       break;
+               case 'p':
+                       parent = 1;
+                       break;
+               case 'f':
+                       fields = 1;
+                       break;
+               case 'd':
+                       depth = 1;
+                       break;
+               case 'h':
+                       err = 0;
+                       goto usage;
+               default:
+                       err = EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (optind != (argc - 1)) {
+               err = EINVAL;
+               goto usage;
+       }
+
+       name = argv[optind++];
+
+       if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               fprintf(stderr, "Device name too long\n");
+               return ENAMETOOLONG;
+       }
+
+       if (type == TD_TYPE_VHD) {
+               vhd_context_t vhd;
+
+               err = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+               if (err) {
+                       printf("failed opening %s: %d\n", name, err);
+                       return err;
+               }
+
+               if (size)
+                       printf("%"PRIu64"\n", vhd.footer.curr_size >> 20);
+
+               if (parent) {
+                       if (vhd.footer.type != HD_TYPE_DIFF)
+                               printf("%s has no parent\n", name);
+                       else {
+                               char *pname;
+
+                               err = vhd_parent_locator_get(&vhd, &pname);
+                               if (err)
+                                       printf("failed getting parent: %d\n",
+                                              err);
+                               else {
+                                       printf("%s\n", pname);
+                                       free(pname);
+                               }
+                       }
+               }
+
+               if (fields) {
+                       int ret, hidden;
+
+                       ret = vhd_hidden(&vhd, &hidden);
+                       if (ret) {
+                               printf("failed checking 'hidden' field: %d\n",
+                                      ret);
+                               /* keep the first error seen */
+                               if (!err)
+                                       err = ret;
+                       } else
+                               printf("%s: %d\n",
+                                      td_vdi_fields[TD_FIELD_HIDDEN].name,
+                                      hidden);
+               }
+
+               if (depth) {
+                       int ret, length;
+
+                       ret = vhd_chain_depth(&vhd, &length);
+                       if (ret)
+                               printf("error checking chain depth: %d\n", ret);
+                       else
+                               printf("chain depth: %d\n", length);
+
+                       /* keep the first error seen */
+                       if (!err)
+                               err = ret;
+               }
+
+               vhd_close(&vhd);
+
+       } else if (type == TD_TYPE_AIO) {
+               if (size) {
+                       int fd;
+                       uint64_t secs;
+                       uint32_t ssize;
+
+                       fd = open(name, O_RDONLY | O_LARGEFILE);
+                       if (fd == -1) {
+                               /* capture errno before printf() can change it */
+                               err = errno;
+                               printf("failed opening %s: %d\n", name, err);
+                               return -err;
+                       }
+
+                       err = tapdisk_get_image_size(fd, &secs, &ssize);
+                       close(fd);
+
+                       if (err) {
+                               printf("failed getting size for %s: %d\n",
+                                      name, err);
+                               return err;
+                       }
+
+                       /*
+                        * NOTE(review): ">> 11" converts sectors to MB only
+                        * for 512-byte sectors; 'ssize' is not consulted.
+                        */
+                       printf("%"PRIu64"\n", secs >> 11);
+               }
+
+               if (parent)
+                       printf("%s has no parent\n", name);
+
+               if (fields) {
+                       int i;
+
+                       /* raw images carry no fields: report each one as 0 */
+                       for (i = 0; i < TD_FIELD_INVALID; i++)
+                               printf("%s: 0\n", td_vdi_fields[i].name);
+               }
+       }
+
+       return err;
+
+ usage:
+       fprintf(stderr, "usage: td-util query %s [-h help] [-v virtsize] "
+               "[-p parent] [-f fields] [-d depth] <FILENAME>\n",
+               td_disk_types[type]);
+       return err;
+}
+
+/*
+ * td-util set TYPE [-h] <FILENAME> <FIELD> <VALUE>
+ *
+ * Set a VDI field of a VHD image by delegating to vhd_util_set_field();
+ * only TD_TYPE_VHD and the "hidden" field are supported.
+ *
+ * Returns EINVAL for errors detected here, otherwise the result of
+ * vhd_util_set_field().
+ */
+int
+td_set_field(int type, int argc, char *argv[])
+{
+       int c, cargc;
+       struct vdi_field *field;
+       char *name, *value, *cargv[8];  /* 7 arguments + NULL terminator */
+
+       if (type != TD_TYPE_VHD) {
+               fprintf(stderr, "Cannot set fields of %s images\n",
+                       td_disk_types[type]);
+               return EINVAL;
+       }
+
+       while ((c = getopt(argc, argv, "h")) != -1) {
+               switch(c) {
+               default:
+                       /* getopt() returned '?'; the offending char is in optopt */
+                       fprintf(stderr, "Unknown option %c\n", (char)optopt);
+                       /* fall through */
+               case 'h':
+                       goto usage;
+               }
+       }
+
+       if (optind != (argc - 3))
+               goto usage;
+
+       name  = argv[optind++];
+
+       field = get_field(argv[optind]);
+       if (!field || field->id != TD_FIELD_HIDDEN) {
+               fprintf(stderr, "Invalid field %s\n", argv[optind]);
+               goto usage;
+       }
+
+       value = argv[++optind];
+
+       cargc = 0;
+       memset(cargv, 0, sizeof(cargv));
+       cargv[cargc++] = "set";
+       cargv[cargc++] = "-n";
+       cargv[cargc++] = name;
+       cargv[cargc++] = "-f";
+       cargv[cargc++] = field->name;
+       cargv[cargc++] = "-v";
+       cargv[cargc++] = value;
+       return vhd_util_set_field(cargc, cargv);
+
+ usage:
+       fprintf(stderr, "usage: td-util set %s [-h help] "
+               "<FILENAME> <FIELD> <VALUE>\n", td_disk_types[type]);
+       print_field_names();
+       return EINVAL;
+}
+
+/*
+ * td-util entry point: td-util COMMAND [TYPE] [OPTIONS]
+ *
+ * Resolves COMMAND (and TYPE, when the command needs one), builds a private
+ * argument vector whose element 0 is the command name followed by the
+ * remaining OPTIONS, and dispatches to the matching td_*() handler, each of
+ * which runs getopt() over that vector.
+ *
+ * The exit status is the absolute value of the handler's return value.
+ */
+int
+main(int argc, char *argv[])
+{
+       char **cargv;
+       struct command *cmd;
+       int cargc, i, type = -1, ret = 0;
+
+#ifdef CORE_DUMP
+       struct rlimit rlim;
+       rlim.rlim_cur = RLIM_INFINITY;
+       rlim.rlim_max = RLIM_INFINITY;
+       if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+               fprintf(stderr, "setrlimit failed: %d\n", errno);
+#endif
+
+       /* help() does not return */
+       if (argc < 2)
+               help();
+
+       cargc = argc - 1;
+       cmd   = get_command(argv[1]);
+       if (!cmd) {
+               fprintf(stderr, "invalid COMMAND %s\n", argv[1]);
+               help();
+       }
+
+       if (cmd->needs_type) {
+               if (argc < 3) {
+                       fprintf(stderr, "td-util %s requires a TYPE\n",
+                               cmd->name);
+                       print_disk_types();
+                       exit(-1);
+               }
+
+               type = get_driver_type(argv[2]);
+               if (type < 0) {
+                       fprintf(stderr, "invalid TYPE '%s'.\n", argv[2]);
+                       print_disk_types();
+                       exit(-1);
+               }
+               --cargc;
+       }
+
+       /*
+        * One extra, zeroed slot so that cargv[cargc] == NULL, as is
+        * conventional for an argument vector handed to getopt() users.
+        */
+       cargv = calloc(cargc + 1, sizeof(*cargv));
+       if (!cargv)
+               exit(ENOMEM);
+
+       /*
+        * (argc - cargc) is the number of leading words dropped from argv
+        * (program name, plus TYPE when present); the command name takes
+        * slot 0 in their place.
+        */
+       cargv[0] = cmd->name;
+       for (i = 1; i < cargc; i++)
+               cargv[i] = argv[i + (argc - cargc)];
+
+       switch(cmd->id) {
+       case TD_CMD_CREATE:
+               ret = td_create(type, cargc, cargv);
+               break;
+       case TD_CMD_SNAPSHOT:
+               ret = td_snapshot(type, cargc, cargv);
+               break;
+/*
+       case TD_CMD_COALESCE:
+               ret = td_coalesce(type, cargc, cargv);
+               break;
+*/
+       case TD_CMD_QUERY:
+               ret = td_query(type, cargc, cargv);
+               break;
+/*
+       case TD_CMD_RESIZE:
+               ret = td_resize(type, cargc, cargv);
+               break;
+*/
+       case TD_CMD_SET:
+               ret = td_set_field(type, cargc, cargv);
+               break;
+/*
+       case TD_CMD_REPAIR:
+               ret = td_repair(type, cargc, cargv);
+               break;
+       case TD_CMD_FILL:
+               ret = td_fill(type, cargc, cargv);
+               break;
+       case TD_CMD_READ:
+               ret = td_read(type, cargc, cargv);
+               break;
+*/
+       default:
+       case TD_CMD_INVALID:
+               ret = EINVAL;
+               break;
+       }
+
+       free(cargv);
+
+       /* handlers return both positive and negative errno values */
+       return (ret >= 0 ? ret : -ret);
+}
diff --git a/tools/blktap2/drivers/xmsnap b/tools/blktap2/drivers/xmsnap
new file mode 100644 (file)
index 0000000..f14351b
--- /dev/null
@@ -0,0 +1,78 @@
+#!/bin/bash
+#
+# xmsnap: pause a VM, rename its disk image to <image>.snapN (N = next
+# index), create a new qcow image under the original name (presumably backed
+# by the snapshot), and unpause the VM.  Exits non-zero if any step fails.
+#
+
+usage () { echo "USAGE: xmsnap <VM ID> <Backing File>"; }
+
+#
+# Check Usage
+#
+if [ -n "$1" ] && [ -n "$2" ]
+then
+       vmid=$1
+       target=$2
+else
+       usage
+       exit 1
+fi
+
+if [ -e "$target" ]
+then
+    echo "Creating snapshot of file $target for VM $vmid."
+else
+    usage
+    echo "File $target not found."
+    exit 1
+fi
+
+#
+# Find the snapshot name: one past the highest existing .snapN index
+#
+directory=`dirname "$target"`
+target=`basename "$target"`
+
+maxidx=0
+if [ -e "$directory/$target.snap1" ]
+then
+       for path in "$directory/$target".snap*
+       do
+           idx=${path#"$directory/$target.snap"}
+           # ignore files whose suffix is not a plain number
+           case "$idx" in ''|*[!0-9]*) continue ;; esac
+           if [ "$idx" -gt "$maxidx" ]; then maxidx=$idx; fi
+       done
+fi
+
+snap=${target}.snap`expr "$maxidx" + 1`
+
+#
+# Pause VM
+#
+xm pause "$vmid" || exit 1
+
+#
+# Snap and reposition the files; never leave the VM paused on failure
+#
+if ! mv "$directory/$target" "$directory/$snap"
+then
+    xm unpause "$vmid"
+    exit 1
+fi
+
+qcow-create 0 "$directory/$target" "$directory/$snap"
+status=$?
+if [ "$status" -ne 0 ]
+then
+    # put the original image back so the VM still finds its disk
+    mv -f "$directory/$snap" "$directory/$target"
+fi
+
+#
+# Unpause
+#
+xm unpause "$vmid" || status=1
+
+exit $status
\ No newline at end of file
diff --git a/tools/blktap2/include/Makefile b/tools/blktap2/include/Makefile
new file mode 100644 (file)
index 0000000..66e8a1e
--- /dev/null
@@ -0,0 +1,17 @@
+XEN_ROOT := $(CURDIR)/../../..
+include $(XEN_ROOT)/tools/Rules.mk
+# "all" has no prerequisites and no recipe: nothing is built in this directory.
+.PHONY: all
+all:
+# "install" runs a single $(INSTALL_DIR) command on the include dir; no copies.
+.PHONY: install
+install:
+       $(INSTALL_DIR) -p $(DESTDIR)$(includedir)
+
+# "clean" has nothing to remove; "@:" is a silent shell no-op.
+.PHONY: clean
+clean:
+       @:
+# "distclean" does nothing beyond its "clean" prerequisite.
+.PHONY: distclean
+distclean: clean
diff --git a/tools/blktap2/include/atomicio.h b/tools/blktap2/include/atomicio.h
new file mode 100644 (file)
index 0000000..7eccf20
--- /dev/null
@@ -0,0 +1,33 @@
+/*     $OpenBSD: atomicio.h,v 1.6 2005/05/24 17:32:43 avsm Exp $       */
+
+/*
+ * Copyright (c) 1995,1999 Theo de Raadt.  All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Ensure all of the data on a socket comes through; f is read or vwrite.
+ */
+size_t atomicio(ssize_t (*)(int, void *, size_t), int, void *, size_t);
+/* write() cast to the non-const buffer signature atomicio() takes as f. */
+#define vwrite (ssize_t (*)(int, void *, size_t))write
diff --git a/tools/blktap2/include/blktap2.h b/tools/blktap2/include/blktap2.h
new file mode 100644 (file)
index 0000000..bf923fc
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _BLKTAP_2_H_
+#define _BLKTAP_2_H_
+
+#define BLKTAP2_MAX_MESSAGE_LEN        256
+
+#define BLKTAP2_RING_MESSAGE_PAUSE     1
+#define BLKTAP2_RING_MESSAGE_RESUME    2
+#define BLKTAP2_RING_MESSAGE_CLOSE     3
+
+#define BLKTAP2_IOCTL_KICK_FE          1
+#define BLKTAP2_IOCTL_ALLOC_TAP        200
+#define BLKTAP2_IOCTL_FREE_TAP         201
+#define BLKTAP2_IOCTL_CREATE_DEVICE    202
+#define BLKTAP2_IOCTL_SET_PARAMS       203
+#define BLKTAP2_IOCTL_PAUSE            204
+#define BLKTAP2_IOCTL_REOPEN           205
+#define BLKTAP2_IOCTL_RESUME           206
+/* Paths; adjacent literals concatenate, e.g. "/dev/xen/blktap-2/control". */
+#define BLKTAP2_SYSFS_DIR              "/sys/class/blktap2"
+#define BLKTAP2_CONTROL_NAME           "blktap-control"
+#define BLKTAP2_CONTROL_DIR            "/var/run/"BLKTAP2_CONTROL_NAME
+#define BLKTAP2_CONTROL_SOCKET         "ctl"
+#define BLKTAP2_DIRECTORY              "/dev/xen/blktap-2"
+#define BLKTAP2_CONTROL_DEVICE         BLKTAP2_DIRECTORY"/control"
+#define BLKTAP2_RING_DEVICE            BLKTAP2_DIRECTORY"/blktap"
+#define BLKTAP2_IO_DEVICE              BLKTAP2_DIRECTORY"/tapdev"
+
+struct blktap2_handle {
+       unsigned int                   ring;
+       unsigned int                   device;
+       unsigned int                   minor;
+};
+
+struct blktap2_params {
+       char                           name[BLKTAP2_MAX_MESSAGE_LEN];
+       unsigned long long             capacity;
+       unsigned long                  sector_size;
+};
+
+#endif
diff --git a/tools/blktap2/include/blktaplib.h b/tools/blktap2/include/blktaplib.h
new file mode 100644 (file)
index 0000000..c2860b7
--- /dev/null
@@ -0,0 +1,242 @@
+/* blktaplib.h
+ *
+ * Blktap library userspace code.
+ *
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __BLKTAPLIB_H__
+#define __BLKTAPLIB_H__
+
+#include <syslog.h>
+#include <sys/time.h>
+#include <xenctrl.h>
+#include <xen/io/blkif.h>
+
+#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, XC_PAGE_SIZE)
+
+/* size of the extra VMA area to map in attached pages. */
+#define BLKTAP_VMA_PAGES BLK_RING_SIZE
+
+/* blktap IOCTLs: These must correspond with the blktap driver ioctls */
+#define BLKTAP_IOCTL_KICK_FE         1
+#define BLKTAP_IOCTL_KICK_BE         2
+#define BLKTAP_IOCTL_SETMODE         3
+#define BLKTAP_IOCTL_SENDPID        4
+#define BLKTAP_IOCTL_NEWINTF        5
+#define BLKTAP_IOCTL_MINOR          6
+#define BLKTAP_IOCTL_MAJOR          7
+#define BLKTAP_QUERY_ALLOC_REQS      8
+#define BLKTAP_IOCTL_FREEINTF       9
+#define BLKTAP_IOCTL_PRINT_IDXS      100 
+#define BLKTAP_IOCTL_BACKDEV_SETUP   200
+
+#define PRIO_SPECIAL_IO             -9999 
+
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
+#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
+#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE     0x00000002
+
+#define BLKTAP_MODE_INTERPOSE \
+           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
+{
+       /* 1 for PASSTHROUGH, INTERCEPT_FE or INTERPOSE; 0 for anything else */
+       if (arg == BLKTAP_MODE_INTERCEPT_BE)
+               return 0; /* intercepting only the backend is not accepted */
+       return (arg & ~(unsigned long)BLKTAP_MODE_INTERPOSE) == 0;
+}
+
+#define MAX_REQUESTS            BLK_RING_SIZE
+
+#define BLKTAP_IOCTL_KICK       1
+#define MAX_PENDING_REQS       BLK_RING_SIZE
+#define BLKTAP_DEV_DIR          "/dev/xen"
+#define BLKTAP_DEV_NAME         "blktap"
+#define BACKDEV_NAME            "backdev"
+#define BLKTAP_DEV_MINOR        0
+#define BLKTAP_CTRL_DIR         "/var/run/tap"
+
+extern int blktap_major;
+
+#define BLKTAP_RING_PAGES       1 /* Front */
+#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + MMAP_PAGES)
+
+struct blkif;
+struct blkif_info;
+
+typedef struct {
+       blkif_request_t  req;
+       int              submitting;
+       int              secs_pending;
+        int16_t          status;
+       int              num_retries;
+       struct timeval   last_try;
+} pending_req_t;
+
+typedef struct blkif {
+       domid_t domid;
+       long int handle;
+       
+       long int pdev;
+       long int readonly;
+       
+       enum { DISCONNECTED, DISCONNECTING, CONNECTED } state;
+       
+       struct blkif_ops *ops;
+       struct blkif *hash_next;
+       
+       void *prv;  /* device-specific data */
+       struct blkif_info *info; /*Image parameter passing */
+       pending_req_t pending_list[MAX_REQUESTS];
+       int devnum;
+       int fds[2];
+       int be_id;
+       char *backend_path;
+       int major;
+       int minor;
+       pid_t tappid;
+       int drivertype;
+       uint16_t cookie;
+       int err;
+} blkif_t;
+
+typedef struct blkif_info {
+       char *params;
+       int   readonly;
+       int   storage;
+} blkif_info_t;
+
+typedef struct tapdev_info {
+       int fd;
+       char *mem;
+       blkif_sring_t *sring;
+       blkif_back_ring_t  fe_ring;
+       unsigned long vstart;
+       blkif_t *blkif;
+} tapdev_info_t;
+
+typedef struct domid_translate {
+       unsigned short domid;
+       unsigned short busid;
+} domid_translate_t ;
+
+typedef struct image {
+       unsigned long long size;
+       unsigned long secsize;
+       unsigned int info;
+} image_t;
+
+typedef struct msg_hdr {
+       uint16_t   type;
+       uint16_t   len;
+       uint16_t   drivertype;
+       uint16_t   cookie;
+} msg_hdr_t;
+
+typedef struct msg_params {
+       uint8_t    readonly;
+       int        path_off;
+       int        path_len;
+       int        storage;
+} msg_params_t;
+
+typedef struct msg_newdev {
+       uint8_t     devnum;
+       uint16_t    domid;
+} msg_newdev_t;
+
+typedef struct msg_pid {
+       pid_t     pid;
+} msg_pid_t;
+
+typedef struct msg_cp {
+       int       cp_uuid_off;
+       int       cp_uuid_len;
+       int       cp_drivertype;
+} msg_cp_t;
+
+typedef struct msg_lock {
+       int       ro;
+       int       enforce;
+       int       uuid_off;
+       int       uuid_len;
+} msg_lock_t;
+
+#define READ 0
+#define WRITE 1
+
+/*Control Messages between manager and tapdev*/
+#define CTLMSG_PARAMS          1
+#define CTLMSG_IMG             2
+#define CTLMSG_IMG_FAIL        3
+#define CTLMSG_NEWDEV          4
+#define CTLMSG_NEWDEV_RSP      5
+#define CTLMSG_NEWDEV_FAIL     6
+#define CTLMSG_CLOSE           7
+#define CTLMSG_CLOSE_RSP       8
+#define CTLMSG_PID             9
+#define CTLMSG_PID_RSP         10
+#define CTLMSG_CHECKPOINT      11
+#define CTLMSG_CHECKPOINT_RSP  12
+#define CTLMSG_LOCK            13
+#define CTLMSG_LOCK_RSP        14
+#define CTLMSG_PAUSE           15
+#define CTLMSG_PAUSE_RSP       16
+#define CTLMSG_RESUME          17
+#define CTLMSG_RESUME_RSP      18
+
+#define TAPDISK_STORAGE_TYPE_NFS       1
+#define TAPDISK_STORAGE_TYPE_EXT       2
+#define TAPDISK_STORAGE_TYPE_LVM       3
+#define TAPDISK_STORAGE_TYPE_DEFAULT   TAPDISK_STORAGE_TYPE_EXT
+
+/* Arbitrary values, must match the underlying driver... */
+#define MAX_TAP_DEV 256
+
+/* Accessing attached data page mappings */
+#define MMAP_PAGES                                                    \
+    (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_vstart,_req,_seg)                                 \
+    ((_vstart) +                                                      \
+     ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * getpagesize()) +      \
+     ((_seg) * getpagesize()))
+
+/* Defines that are only used by library clients */
+
+#ifndef __COMPILING_BLKTAP_LIB
+
+static char *blkif_op_name[] = {
+       [BLKIF_OP_READ]       = "READ",
+       [BLKIF_OP_WRITE]      = "WRITE",
+};
+
+#endif /* __COMPILING_BLKTAP_LIB */
+
+#endif /* __BLKTAPLIB_H__ */
diff --git a/tools/blktap2/include/libvhd-journal.h b/tools/blktap2/include/libvhd-journal.h
new file mode 100644 (file)
index 0000000..78e45a2
--- /dev/null
@@ -0,0 +1,68 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _VHD_JOURNAL_H_
+#define _VHD_JOURNAL_H_
+
+#include <inttypes.h>
+
+#include "libvhd.h"
+
+#define VHD_JOURNAL_METADATA       0x01
+#define VHD_JOURNAL_DATA           0x02
+
+#define VHD_JOURNAL_HEADER_COOKIE  "vjournal"
+#define VHD_JOURNAL_ENTRY_COOKIE   0xaaaa12344321aaaa
+
+typedef struct vhd_journal_header {
+       char                       cookie[8];
+       vhd_uuid_t                 uuid;
+       uint64_t                   vhd_footer_offset;
+       uint32_t                   journal_data_entries;
+       uint32_t                   journal_metadata_entries;
+       uint64_t                   journal_data_offset;
+       uint64_t                   journal_metadata_offset;
+       uint64_t                   journal_eof;
+       char                       pad[448];
+} vhd_journal_header_t;
+
+typedef struct vhd_journal {
+       char                      *jname;
+       int                        jfd;
+       int                        is_block; /* is jfd a block device */
+       vhd_journal_header_t       header;
+       vhd_context_t              vhd;
+} vhd_journal_t;
+
+int vhd_journal_create(vhd_journal_t *, const char *file, const char *jfile);
+int vhd_journal_open(vhd_journal_t *, const char *file, const char *jfile);
+int vhd_journal_add_block(vhd_journal_t *, uint32_t block, char mode);
+int vhd_journal_commit(vhd_journal_t *);
+int vhd_journal_revert(vhd_journal_t *);
+int vhd_journal_close(vhd_journal_t *);
+int vhd_journal_remove(vhd_journal_t *);
+
+#endif
diff --git a/tools/blktap2/include/libvhd.h b/tools/blktap2/include/libvhd.h
new file mode 100644 (file)
index 0000000..8e854e4
--- /dev/null
@@ -0,0 +1,326 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _VHD_LIB_H_
+#define _VHD_LIB_H_
+
+#include <string.h>
+#if defined(__linux__)
+#include <endian.h>
+#include <byteswap.h>
+#elif defined(__NetBSD__)
+#include <sys/endian.h>
+#include <sys/bswap.h>
+#endif
+
+#include "vhd-uuid.h"
+#include "vhd.h"
+
+#ifndef O_LARGEFILE
+#define O_LARGEFILE    0
+#endif
+
+#if BYTE_ORDER == LITTLE_ENDIAN /* BEnn_IN/OUT byte-swap *foo in place */
+#if defined(__linux__)
+  #define BE16_IN(foo)             (*(foo)) = bswap_16(*(foo))
+  #define BE32_IN(foo)             (*(foo)) = bswap_32(*(foo))
+  #define BE64_IN(foo)             (*(foo)) = bswap_64(*(foo))
+  #define BE16_OUT(foo)            (*(foo)) = bswap_16(*(foo))
+  #define BE32_OUT(foo)            (*(foo)) = bswap_32(*(foo))
+  #define BE64_OUT(foo)            (*(foo)) = bswap_64(*(foo))
+#elif defined(__NetBSD__)
+  #define BE16_IN(foo)             (*(foo)) = bswap16(*(foo))
+  #define BE32_IN(foo)             (*(foo)) = bswap32(*(foo))
+  #define BE64_IN(foo)             (*(foo)) = bswap64(*(foo))
+  #define BE16_OUT(foo)            (*(foo)) = bswap16(*(foo))
+  #define BE32_OUT(foo)            (*(foo)) = bswap32(*(foo))
+  #define BE64_OUT(foo)            (*(foo)) = bswap64(*(foo))
+#endif
+#else /* host is not little-endian: every conversion expands to nothing */
+  #define BE16_IN(foo)
+  #define BE32_IN(foo)
+  #define BE64_IN(foo)
+  #define BE16_OUT(foo)
+  #define BE32_OUT(foo)
+  #define BE64_OUT(foo)
+#endif
+
+#define MIN(a, b)                  (((a) < (b)) ? (a) : (b))
+#define MAX(a, b)                  (((a) > (b)) ? (a) : (b))
+
+#define VHD_MAX_NAME_LEN           1024
+
+#define VHD_BLOCK_SHIFT            21
+#define VHD_BLOCK_SIZE             (1ULL << VHD_BLOCK_SHIFT)
+
+#define UTF_16                     "UTF-16"
+#define UTF_16LE                   "UTF-16LE"
+#define UTF_16BE                   "UTF-16BE"
+
+#define VHD_OPEN_RDONLY            0x00001
+#define VHD_OPEN_RDWR              0x00002
+#define VHD_OPEN_FAST              0x00004
+#define VHD_OPEN_STRICT            0x00008
+#define VHD_OPEN_IGNORE_DISABLED   0x00010
+
+#define VHD_FLAG_CREAT_PARENT_RAW        0x00001
+
+#define vhd_flag_set(word, flag)         ((word) |= (flag))
+#define vhd_flag_clear(word, flag)       ((word) &= ~(flag))
+#define vhd_flag_test(word, flag)        ((word) & (flag))
+
+
+#define ENABLE_FAILURE_TESTING
+#define FAIL_REPARENT_BEGIN        0
+#define FAIL_REPARENT_LOCATOR      1
+#define FAIL_REPARENT_END          2
+#define FAIL_RESIZE_BEGIN          3
+#define FAIL_RESIZE_DATA_MOVED     4
+#define FAIL_RESIZE_METADATA_MOVED 5
+#define FAIL_RESIZE_END            6
+#define NUM_FAIL_TESTS             7
+
+#ifdef ENABLE_FAILURE_TESTING
+#define TEST_FAIL_AT(point) \
+       if (TEST_FAIL[point]) { \
+               printf("Failing at %s\n", ENV_VAR_FAIL[point]); exit(EINVAL); }
+#define TEST_FAIL_EXTERN_VARS              \
+       extern const char* ENV_VAR_FAIL[]; \
+       extern int TEST_FAIL[];
+#else
+#define TEST_FAIL_AT(point)
+#define TEST_FAIL_EXTERN_VARS
+#endif // ENABLE_FAILURE_TESTING
+
+
+static const char                  VHD_POISON_COOKIE[] = "v_poison";
+
+typedef struct hd_ftr              vhd_footer_t;
+typedef struct dd_hdr              vhd_header_t;
+typedef struct vhd_bat             vhd_bat_t;
+typedef struct vhd_batmap          vhd_batmap_t;
+typedef struct dd_batmap_hdr       vhd_batmap_header_t;
+typedef struct prt_loc             vhd_parent_locator_t;
+typedef struct vhd_context         vhd_context_t;
+typedef uint32_t                   vhd_flag_creat_t;
+
+struct vhd_bat {
+       uint32_t                   spb;
+       uint32_t                   entries;
+       uint32_t                  *bat;
+};
+
+struct vhd_batmap {
+       vhd_batmap_header_t        header;
+       char                      *map;
+};
+
+struct vhd_context {
+       int                        fd;
+       char                      *file;
+       int                        oflags;
+       int                        is_block;
+
+       uint32_t                   spb;
+       uint32_t                   bm_secs;
+
+       vhd_header_t               header;
+       vhd_footer_t               footer;
+       vhd_bat_t                  bat;
+       vhd_batmap_t               batmap;
+};
+
+static inline uint32_t
+secs_round_up(uint64_t bytes)
+{
+       return ((bytes + (VHD_SECTOR_SIZE - 1)) >> VHD_SECTOR_SHIFT);
+}
+
+static inline uint32_t
+secs_round_up_no_zero(uint64_t bytes)
+{
+       uint32_t secs = secs_round_up(bytes);
+       return (secs ? secs : 1); /* same as "secs ?: 1", never returns 0 */
+}
+
+static inline uint64_t
+vhd_sectors_to_bytes(uint64_t sectors)
+{
+       return sectors << VHD_SECTOR_SHIFT;
+}
+
+static inline uint64_t
+vhd_bytes_padded(uint64_t bytes)
+{
+       return vhd_sectors_to_bytes(secs_round_up_no_zero(bytes));
+}
+
+static inline int
+vhd_type_dynamic(vhd_context_t *ctx)
+{
+       return (ctx->footer.type == HD_TYPE_DYNAMIC ||
+               ctx->footer.type == HD_TYPE_DIFF);
+}
+
+static inline int
+vhd_creator_tapdisk(vhd_context_t *ctx)
+{
+       return !strncmp(ctx->footer.crtr_app, "tap", 3);
+}
+
+static inline int
+vhd_disabled(vhd_context_t *ctx)
+{
+       return (!memcmp(ctx->footer.cookie,
+                       VHD_POISON_COOKIE, sizeof(ctx->footer.cookie)));
+}
+
+static inline size_t
+vhd_parent_locator_size(vhd_parent_locator_t *loc)
+{
+       /*
+        * MICROSOFT_COMPAT
+        * data_space is supposed to be a sector count, but some images
+        * carry a byte count instead: values below 512 are treated as
+        * sectors, exact multiples of 512 as bytes, anything else as
+        * invalid (size 0).
+        */
+       if (loc->data_space >= 512)
+               return (loc->data_space % 512) ? 0 : loc->data_space;
+
+       return vhd_sectors_to_bytes(loc->data_space);
+}
+
+static inline int
+vhd_parent_raw(vhd_context_t *ctx)
+{
+       return vhd_uuid_is_nil(&ctx->header.prt_uuid);
+}
+
+void libvhd_set_log_level(int);
+
+int vhd_test_file_fixed(const char *, int *);
+
+uint32_t vhd_time(time_t time);
+size_t vhd_time_to_string(uint32_t timestamp, char *target);
+uint32_t vhd_chs(uint64_t size);
+
+uint32_t vhd_checksum_footer(vhd_footer_t *);
+uint32_t vhd_checksum_header(vhd_header_t *);
+uint32_t vhd_checksum_batmap(vhd_batmap_t *);
+
+void vhd_footer_in(vhd_footer_t *);
+void vhd_footer_out(vhd_footer_t *);
+void vhd_header_in(vhd_header_t *);
+void vhd_header_out(vhd_header_t *);
+void vhd_bat_in(vhd_bat_t *);
+void vhd_bat_out(vhd_bat_t *);
+void vhd_batmap_header_in(vhd_batmap_t *);
+void vhd_batmap_header_out(vhd_batmap_t *);
+
+int vhd_validate_footer(vhd_footer_t *footer);
+int vhd_validate_header(vhd_header_t *header);
+int vhd_validate_batmap_header(vhd_batmap_t *batmap);
+int vhd_validate_batmap(vhd_batmap_t *batmap);
+int vhd_validate_platform_code(uint32_t code);
+
+int vhd_open(vhd_context_t *, const char *file, int flags);
+void vhd_close(vhd_context_t *);
+int vhd_create(const char *name, uint64_t bytes, int type, vhd_flag_creat_t);
+/* vhd_snapshot: the bytes parameter is optional and can be 0 if the snapshot 
+ * is to have the same size as the (first non-empty) parent */
+int vhd_snapshot(const char *snapshot, uint64_t bytes, const char *parent,
+               vhd_flag_creat_t);
+
+int vhd_hidden(vhd_context_t *, int *);
+int vhd_chain_depth(vhd_context_t *, int *);
+
+off_t vhd_position(vhd_context_t *);
+int vhd_seek(vhd_context_t *, off_t, int);
+int vhd_read(vhd_context_t *, void *, size_t);
+int vhd_write(vhd_context_t *, void *, size_t);
+
+int vhd_offset(vhd_context_t *, uint32_t, uint32_t *);
+
+int vhd_end_of_headers(vhd_context_t *ctx, off_t *off);
+int vhd_end_of_data(vhd_context_t *ctx, off_t *off);
+int vhd_batmap_header_offset(vhd_context_t *ctx, off_t *off);
+
+int vhd_get_header(vhd_context_t *);
+int vhd_get_footer(vhd_context_t *);
+int vhd_get_bat(vhd_context_t *);
+int vhd_get_batmap(vhd_context_t *);
+
+void vhd_put_header(vhd_context_t *);
+void vhd_put_footer(vhd_context_t *);
+void vhd_put_bat(vhd_context_t *);
+void vhd_put_batmap(vhd_context_t *);
+
+int vhd_has_batmap(vhd_context_t *);
+int vhd_batmap_test(vhd_context_t *, vhd_batmap_t *, uint32_t);
+void vhd_batmap_set(vhd_context_t *, vhd_batmap_t *, uint32_t);
+void vhd_batmap_clear(vhd_context_t *, vhd_batmap_t *, uint32_t);
+
+int vhd_get_phys_size(vhd_context_t *, off_t *);
+int vhd_set_phys_size(vhd_context_t *, off_t);
+
+int vhd_bitmap_test(vhd_context_t *, char *, uint32_t);
+void vhd_bitmap_set(vhd_context_t *, char *, uint32_t);
+void vhd_bitmap_clear(vhd_context_t *, char *, uint32_t);
+
+int vhd_parent_locator_count(vhd_context_t *);
+int vhd_parent_locator_get(vhd_context_t *, char **);
+int vhd_parent_locator_read(vhd_context_t *, vhd_parent_locator_t *, char **);
+int vhd_find_parent(vhd_context_t *, const char *, char **);
+int vhd_parent_locator_write_at(vhd_context_t *, const char *,
+                               off_t, uint32_t, size_t,
+                               vhd_parent_locator_t *);
+
+int vhd_header_decode_parent(vhd_context_t *, vhd_header_t *, char **);
+int vhd_change_parent(vhd_context_t *, char *parent_path, int raw);
+
+int vhd_read_footer(vhd_context_t *, vhd_footer_t *);
+int vhd_read_footer_at(vhd_context_t *, vhd_footer_t *, off_t);
+int vhd_read_footer_strict(vhd_context_t *, vhd_footer_t *);
+int vhd_read_header(vhd_context_t *, vhd_header_t *);
+int vhd_read_header_at(vhd_context_t *, vhd_header_t *, off_t);
+int vhd_read_bat(vhd_context_t *, vhd_bat_t *);
+int vhd_read_batmap(vhd_context_t *, vhd_batmap_t *);
+int vhd_read_bitmap(vhd_context_t *, uint32_t block, char **bufp);
+int vhd_read_block(vhd_context_t *, uint32_t block, char **bufp);
+
+int vhd_write_footer(vhd_context_t *, vhd_footer_t *);
+int vhd_write_footer_at(vhd_context_t *, vhd_footer_t *, off_t);
+int vhd_write_header(vhd_context_t *, vhd_header_t *);
+int vhd_write_header_at(vhd_context_t *, vhd_header_t *, off_t);
+int vhd_write_bat(vhd_context_t *, vhd_bat_t *);
+int vhd_write_batmap(vhd_context_t *, vhd_batmap_t *);
+int vhd_write_bitmap(vhd_context_t *, uint32_t block, char *bitmap);
+int vhd_write_block(vhd_context_t *, uint32_t block, char *data);
+
+int vhd_io_read(vhd_context_t *, char *, uint64_t, uint32_t);
+int vhd_io_write(vhd_context_t *, char *, uint64_t, uint32_t);
+
+#endif
diff --git a/tools/blktap2/include/list.h b/tools/blktap2/include/list.h
new file mode 100644 (file)
index 0000000..cbd0050
--- /dev/null
@@ -0,0 +1,125 @@
+/*
+ * list.h
+ * 
+ * This is a subset of linux's list.h intended to be used in user-space.
+ * XXX The namespace conflicts with NetBSD's <sys/queue.h>
+ * 
+ */
+
+#ifndef __LIST_H__
+#define __LIST_H__
+
+#define LIST_POISON1  ((void *) 0x00100100)
+#define LIST_POISON2  ((void *) 0x00200200)
+
+struct list_head {
+        struct list_head *next, *prev;
+};
+/* XXX workaround for conflicts: drop any LIST_HEAD/LIST_HEAD_INIT that an
+ * earlier header defined. The list API should use its own prefix, i.e. BLK_
+ */
+#ifdef LIST_HEAD_INIT
+#undef LIST_HEAD_INIT
+#endif
+#ifdef LIST_HEAD
+#undef LIST_HEAD
+#endif
+/* Initializer for, and definition of, an empty list whose head is itself. */
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+#define LIST_HEAD(name) \
+        struct list_head name = LIST_HEAD_INIT(name)
+
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+       list->next = list;
+       list->prev = list;
+}
+
+static inline void __list_add(struct list_head *new,
+                              struct list_head *prev,
+                              struct list_head *next)
+{
+        next->prev = new;
+        new->next = next;
+        new->prev = prev;
+        prev->next = new;
+}
+
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+        __list_add(new, head, head->next);
+}
+
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+       __list_add(new, head->prev, head);
+}
+
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+        next->prev = prev;
+        prev->next = next;
+}
+
+static inline void list_del(struct list_head *entry)
+{
+        __list_del(entry->prev, entry->next);
+        entry->next = LIST_POISON1;
+        entry->prev = LIST_POISON2;
+}
+
+static inline void list_del_init(struct list_head *entry)
+{
+       __list_del(entry->prev, entry->next);
+       INIT_LIST_HEAD(entry);
+}
+
+static inline int list_empty(const struct list_head *head)
+{
+       return head->next == head;
+}
+
+static inline int list_is_last(const struct list_head *list,
+                              const struct list_head *head)
+{
+       return list->next == head;
+}
+
+static inline void __list_splice(const struct list_head *list,
+                                struct list_head *prev,
+                                struct list_head *next)
+{
+       struct list_head *first = list->next;
+       struct list_head *last = list->prev;
+
+       first->prev = prev;
+       prev->next = first;
+
+       last->next = next;
+       next->prev = last;
+}
+
+static inline void list_splice(const struct list_head *list,
+                              struct list_head *head)
+{
+       if (!list_empty(list))
+               __list_splice(list, head, head->next);
+}
+/* Map a pointer to the embedded `member` back to its enclosing `type`. */
+#define list_entry(ptr, type, member)                                   \
+        ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+/* Visit each entry from head; pos is advanced through pos->member.next. */
+#define list_for_each_entry(pos, head, member)                          \
+        for (pos = list_entry((head)->next, typeof(*pos), member);      \
+             &pos->member != (head);                                    \
+             pos = list_entry(pos->member.next, typeof(*pos), member))
+/* Same walk, but n is fetched before the body so the body may unlink pos. */
+#define list_for_each_entry_safe(pos, n, head, member)                 \
+       for (pos = list_entry((head)->next, typeof(*pos), member),      \
+              n = list_entry(pos->member.next, typeof(*pos), member);  \
+            &pos->member != (head);                                    \
+            pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+#endif /* __LIST_H__ */
diff --git a/tools/blktap2/include/lvm-util.h b/tools/blktap2/include/lvm-util.h
new file mode 100644 (file)
index 0000000..95f3320
--- /dev/null
@@ -0,0 +1,71 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LVM_UTIL_H_
+#define _LVM_UTIL_H_
+
+#include <inttypes.h>
+
+#define MAX_NAME_SIZE            256
+
+#define LVM_SEG_TYPE_LINEAR      1
+#define LVM_SEG_TYPE_UNKNOWN     2
+
+/*
+ * Description of one segment of a logical volume; struct lv embeds one of
+ * these as first_segment.
+ * NOTE(review): type presumably holds one of the LVM_SEG_TYPE_* values and
+ * pe_start/pe_size are presumably counted in physical extents -- confirm
+ * against lvm-util.c.
+ */
+struct lv_segment {
+       uint8_t                  type;
+       char                     device[MAX_NAME_SIZE];
+       uint64_t                 pe_start;
+       uint64_t                 pe_size;
+};
+
+/*
+ * One logical volume: its name, size, number of segments and a copy of the
+ * first segment only (further segments are not stored here).
+ * NOTE(review): unit of size is not visible in this header -- confirm.
+ */
+struct lv {
+       char                     name[MAX_NAME_SIZE];
+       uint64_t                 size;
+       uint32_t                 segments;
+       struct lv_segment        first_segment;
+};
+
+/*
+ * One entry of the pvs array in struct vg (presumably an LVM physical
+ * volume): a name and a start offset.
+ * NOTE(review): unit of start is not visible in this header -- confirm.
+ */
+struct pv {
+       char                     name[MAX_NAME_SIZE];
+       uint64_t                 start;
+};
+
+/*
+ * A volume group as filled in by lvm_scan_vg() and released with
+ * lvm_free_vg().
+ * NOTE(review): pv_cnt / lv_cnt presumably give the number of elements in
+ * the pvs / lvs arrays -- confirm against lvm-util.c.
+ */
+struct vg {
+       char                     name[MAX_NAME_SIZE];
+       uint64_t                 extent_size;
+
+       int                      pv_cnt;
+       struct pv               *pvs;
+
+       int                      lv_cnt;
+       struct lv               *lvs;
+};
+
+int lvm_scan_vg(const char *vg_name, struct vg *vg);
+void lvm_free_vg(struct vg *vg);
+
+#endif
diff --git a/tools/blktap2/include/relative-path.h b/tools/blktap2/include/relative-path.h
new file mode 100644 (file)
index 0000000..d78f94d
--- /dev/null
@@ -0,0 +1,43 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _RELATIVE_PATH_H_
+#define _RELATIVE_PATH_H_
+
+#include <syslog.h>
+
+#define DELIMITER    '/'
+#define MAX_NAME_LEN 1000
+
+/*
+ * Log a printf-style message at LOG_ERR through syslog(), prefixed with
+ * "tap-err:" and the name of the calling function.  @_f must be a string
+ * literal because it is concatenated onto the prefix.  Uses the GNU named
+ * variadic macro syntax.
+ */
+#define EPRINTF(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a)
+
+/*
+ * returns a relative path from @src to @dest
+ * result should be freed
+ */
+char *relative_path_to(char *src, char *dest, int *err);
+
+#endif
diff --git a/tools/blktap2/include/tapdisk-message.h b/tools/blktap2/include/tapdisk-message.h
new file mode 100644 (file)
index 0000000..63a15a2
--- /dev/null
@@ -0,0 +1,203 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_MESSAGE_H_
+#define _TAPDISK_MESSAGE_H_
+
+#include <inttypes.h>
+#include <sys/types.h>
+
+#define TAPDISK_MESSAGE_MAX_PATH_LENGTH  256
+#define TAPDISK_MESSAGE_STRING_LENGTH    256
+
+/*
+ * Capacity of tapdisk_message_minors.list: the number of ints that fit in
+ * TAPDISK_MESSAGE_MAX_PATH_LENGTH bytes, less one for the count field that
+ * precedes the list.
+ */
+#define TAPDISK_MESSAGE_MAX_MINORS \
+       ((TAPDISK_MESSAGE_MAX_PATH_LENGTH / sizeof(int)) - 1)
+
+#define TAPDISK_MESSAGE_FLAG_SHARED      0x01
+#define TAPDISK_MESSAGE_FLAG_RDONLY      0x02
+#define TAPDISK_MESSAGE_FLAG_ADD_CACHE   0x04
+#define TAPDISK_MESSAGE_FLAG_VHD_INDEX   0x08
+#define TAPDISK_MESSAGE_FLAG_LOG_DIRTY   0x10
+
+typedef struct tapdisk_message           tapdisk_message_t;
+typedef uint8_t                          tapdisk_message_flag_t;
+typedef struct tapdisk_message_image     tapdisk_message_image_t;
+typedef struct tapdisk_message_params    tapdisk_message_params_t;
+typedef struct tapdisk_message_string    tapdisk_message_string_t;
+typedef struct tapdisk_message_response  tapdisk_message_response_t;
+typedef struct tapdisk_message_minors    tapdisk_message_minors_t;
+typedef struct tapdisk_message_list      tapdisk_message_list_t;
+
+/*
+ * Payload carried in tapdisk_message.u.params.
+ * NOTE(review): flags presumably takes the TAPDISK_MESSAGE_FLAG_* bits and
+ * path_len the used length of path -- confirm against the senders.
+ */
+struct tapdisk_message_params {
+       tapdisk_message_flag_t           flags;
+
+       uint8_t                          storage;
+       uint32_t                         devnum;
+       uint32_t                         domid;
+       uint16_t                         path_len;
+       char                             path[TAPDISK_MESSAGE_MAX_PATH_LENGTH];
+};
+
+/*
+ * Payload carried in tapdisk_message.u.image: a sector count, the sector
+ * size and an info word.
+ * NOTE(review): meaning of info is not visible in this header -- confirm.
+ */
+struct tapdisk_message_image {
+       uint64_t                         sectors;
+       uint32_t                         sector_size;
+       uint32_t                         info;
+};
+
+/*
+ * Payload carried in tapdisk_message.u.string: a fixed-size text buffer of
+ * TAPDISK_MESSAGE_STRING_LENGTH bytes.
+ */
+struct tapdisk_message_string {
+       char                             text[TAPDISK_MESSAGE_STRING_LENGTH];
+};
+
+/*
+ * Payload carried in tapdisk_message.u.response: an error code together
+ * with a fixed-size message buffer.
+ */
+struct tapdisk_message_response {
+       int                              error;
+       char                             message[TAPDISK_MESSAGE_STRING_LENGTH];
+};
+
+/*
+ * Payload carried in tapdisk_message.u.minors.  list holds at most
+ * TAPDISK_MESSAGE_MAX_MINORS ints.
+ * NOTE(review): count presumably gives the number of valid entries in
+ * list -- confirm against the senders.
+ */
+struct tapdisk_message_minors {
+       int                              count;
+       int                              list[TAPDISK_MESSAGE_MAX_MINORS];
+};
+
+/*
+ * Payload carried in tapdisk_message.u.list: a count, a minor number, a
+ * state value and a path buffer.
+ * NOTE(review): semantics of count and state are not visible in this
+ * header -- confirm against the list request handler.
+ */
+struct tapdisk_message_list {
+       int                              count;
+       int                              minor;
+       int                              state;
+       char                             path[TAPDISK_MESSAGE_MAX_PATH_LENGTH];
+};
+
+/*
+ * Top-level message: a 16-bit type and a 16-bit cookie followed by a union
+ * of the per-message payloads, so every message has the size of the
+ * largest payload.
+ * NOTE(review): type presumably carries an enum tapdisk_message_id value
+ * -- confirm against the senders.
+ */
+struct tapdisk_message {
+       uint16_t                         type;
+       uint16_t                         cookie;
+
+       union {
+               pid_t                    tapdisk_pid;
+               tapdisk_message_image_t  image;
+               tapdisk_message_params_t params;
+               tapdisk_message_string_t string;
+               tapdisk_message_minors_t minors;
+               tapdisk_message_response_t response;
+               tapdisk_message_list_t   list;
+       } u;
+};
+
+/*
+ * Message identifiers, numbered consecutively starting at 1.  Inserting or
+ * reordering entries changes the numeric value of everything that follows.
+ */
+enum tapdisk_message_id {
+       TAPDISK_MESSAGE_ERROR = 1,
+       TAPDISK_MESSAGE_RUNTIME_ERROR,
+       TAPDISK_MESSAGE_PID,
+       TAPDISK_MESSAGE_PID_RSP,
+       TAPDISK_MESSAGE_ATTACH,
+       TAPDISK_MESSAGE_ATTACH_RSP,
+       TAPDISK_MESSAGE_OPEN,
+       TAPDISK_MESSAGE_OPEN_RSP,
+       TAPDISK_MESSAGE_PAUSE,
+       TAPDISK_MESSAGE_PAUSE_RSP,
+       TAPDISK_MESSAGE_RESUME,
+       TAPDISK_MESSAGE_RESUME_RSP,
+       TAPDISK_MESSAGE_CLOSE,
+       TAPDISK_MESSAGE_CLOSE_RSP,
+       TAPDISK_MESSAGE_DETACH,
+       TAPDISK_MESSAGE_DETACH_RSP,
+       TAPDISK_MESSAGE_LIST_MINORS,
+       TAPDISK_MESSAGE_LIST_MINORS_RSP,
+       TAPDISK_MESSAGE_LIST,
+       TAPDISK_MESSAGE_LIST_RSP,
+       TAPDISK_MESSAGE_FORCE_SHUTDOWN,
+       TAPDISK_MESSAGE_EXIT,
+};
+
+/*
+ * Map a message identifier to a human-readable name.
+ *
+ * Every enumerator of enum tapdisk_message_id has its own case, including
+ * TAPDISK_MESSAGE_RUNTIME_ERROR; any other value yields "unknown".  The
+ * returned pointer refers to a string literal and must not be modified or
+ * freed.
+ */
+static inline char *
+tapdisk_message_name(enum tapdisk_message_id id)
+{
+       switch (id) {
+       case TAPDISK_MESSAGE_ERROR:
+               return "error";
+
+       case TAPDISK_MESSAGE_RUNTIME_ERROR:
+               return "runtime error";
+
+       case TAPDISK_MESSAGE_PID:
+               return "pid";
+
+       case TAPDISK_MESSAGE_PID_RSP:
+               return "pid response";
+
+       case TAPDISK_MESSAGE_ATTACH:
+               return "attach";
+
+       case TAPDISK_MESSAGE_ATTACH_RSP:
+               return "attach response";
+
+       case TAPDISK_MESSAGE_OPEN:
+               return "open";
+
+       case TAPDISK_MESSAGE_OPEN_RSP:
+               return "open response";
+
+       case TAPDISK_MESSAGE_PAUSE:
+               return "pause";
+
+       case TAPDISK_MESSAGE_PAUSE_RSP:
+               return "pause response";
+
+       case TAPDISK_MESSAGE_RESUME:
+               return "resume";
+
+       case TAPDISK_MESSAGE_RESUME_RSP:
+               return "resume response";
+
+       case TAPDISK_MESSAGE_CLOSE:
+               return "close";
+
+       case TAPDISK_MESSAGE_CLOSE_RSP:
+               return "close response";
+
+       case TAPDISK_MESSAGE_DETACH:
+               return "detach";
+
+       case TAPDISK_MESSAGE_DETACH_RSP:
+               return "detach response";
+
+       case TAPDISK_MESSAGE_LIST_MINORS:
+               return "list minors";
+
+       case TAPDISK_MESSAGE_LIST_MINORS_RSP:
+               return "list minors response";
+
+       case TAPDISK_MESSAGE_LIST:
+               return "list";
+
+       case TAPDISK_MESSAGE_LIST_RSP:
+               return "list response";
+
+       case TAPDISK_MESSAGE_FORCE_SHUTDOWN:
+               return "force shutdown";
+
+       case TAPDISK_MESSAGE_EXIT:
+               return "exit";
+
+       default:
+               return "unknown";
+       }
+}
+
+#endif
diff --git a/tools/blktap2/include/vhd-util.h b/tools/blktap2/include/vhd-util.h
new file mode 100644 (file)
index 0000000..11f077e
--- /dev/null
@@ -0,0 +1,44 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _VHD_UTIL_H_
+#define _VHD_UTIL_H_
+
+int vhd_util_create(int argc, char **argv);
+int vhd_util_snapshot(int argc, char **argv);
+int vhd_util_query(int argc, char **argv);
+int vhd_util_read(int argc, char **argv);
+int vhd_util_set_field(int argc, char **argv);
+int vhd_util_repair(int argc, char **argv);
+int vhd_util_fill(int argc, char **argv);
+int vhd_util_resize(int argc, char **argv);
+int vhd_util_coalesce(int argc, char **argv);
+int vhd_util_modify(int argc, char **argv);
+int vhd_util_scan(int argc, char **argv);
+int vhd_util_check(int argc, char **argv);
+int vhd_util_revert(int argc, char **argv);
+
+#endif
diff --git a/tools/blktap2/include/vhd-uuid.h b/tools/blktap2/include/vhd-uuid.h
new file mode 100644 (file)
index 0000000..1b65b67
--- /dev/null
@@ -0,0 +1,63 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __BLKTAP2_VHD_UUID_H__
+#define __BLKTAP2_VHD_UUID_H__
+
+/*
+ * vhd_uuid_t is defined per operating system:
+ *  - Linux:  a struct wrapping the uuid_t from <uuid/uuid.h>
+ *  - NetBSD: a plain alias of the uuid_t from <uuid.h>
+ * Any other platform stops the build with an #error.
+ */
+#if defined(__linux__)
+
+#include <uuid/uuid.h>
+typedef struct {
+    uuid_t uuid;
+} vhd_uuid_t;
+
+#elif defined(__NetBSD__)
+
+#include <uuid.h>
+
+typedef uuid_t vhd_uuid_t;
+
+#else
+
+#error "Please update vhd-uuid.h for your OS"
+
+#endif
+
+int vhd_uuid_is_nil(vhd_uuid_t *uuid);
+
+void vhd_uuid_generate(vhd_uuid_t *uuid);
+
+void vhd_uuid_to_string(vhd_uuid_t *uuid, char *out, size_t size);
+
+void vhd_uuid_from_string(vhd_uuid_t *uuid, const char *in);
+
+void vhd_uuid_copy(vhd_uuid_t *dst, vhd_uuid_t *src);
+
+void vhd_uuid_clear(vhd_uuid_t *uuid);
+
+int vhd_uuid_compare(vhd_uuid_t *uuid1, vhd_uuid_t *uuid2);
+
+#endif /* __BLKTAP2_VHD_UUID_H__ */
diff --git a/tools/blktap2/include/vhd.h b/tools/blktap2/include/vhd.h
new file mode 100644 (file)
index 0000000..c064425
--- /dev/null
@@ -0,0 +1,219 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __VHD_H__
+#define __VHD_H__
+
+#include <inttypes.h>
+
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+#define DEBUG 1
+
+/* ---------------------------------------------------------------------- */
+/* General definitions.                                                   */
+/* ---------------------------------------------------------------------- */
+
+#define VHD_SECTOR_SIZE  512
+#define VHD_SECTOR_SHIFT   9
+
+/* ---------------------------------------------------------------------- */
+/* This is the generic disk footer, used by all disks.                    */
+/* ---------------------------------------------------------------------- */
+
+/*
+ * NOTE: the uuid member has type vhd_uuid_t, which this header neither
+ * declares nor includes; vhd-uuid.h has to be included before vhd.h.
+ */
+struct hd_ftr {
+  char   cookie[8];       /* Identifies original creator of the disk      */
+  u32    features;        /* Feature Support -- see below                 */
+  u32    ff_version;      /* (major,minor) version of disk file           */
+  u64    data_offset;     /* Abs. offset from SOF to next structure       */
+  u32    timestamp;       /* Creation time.  secs since 1/1/2000GMT       */
+  char   crtr_app[4];     /* Creator application                          */
+  u32    crtr_ver;        /* Creator version (major,minor)                */
+  u32    crtr_os;         /* Creator host OS                              */
+  u64    orig_size;       /* Size at creation (bytes)                     */
+  u64    curr_size;       /* Current size of disk (bytes)                 */
+  u32    geometry;        /* Disk geometry                                */
+  u32    type;            /* Disk type                                    */
+  u32    checksum;        /* 1's comp sum of this struct.                 */
+  vhd_uuid_t uuid;        /* Unique disk ID, used for naming parents      */
+  char   saved;           /* one-bit -- is this disk/VM in a saved state? */
+  char   hidden;          /* tapdisk-specific field: is this vdi hidden?  */
+  char   reserved[426];   /* padding                                      */
+};
+
+/* VHD cookie string. */
+static const char HD_COOKIE[9]  =  "conectix";
+
+/* Feature fields in hd_ftr */
+#define HD_NO_FEATURES     0x00000000
+#define HD_TEMPORARY       0x00000001 /* disk can be deleted on shutdown */
+#define HD_RESERVED        0x00000002 /* NOTE: must always be set        */
+
+/* Version field in hd_ftr */
+#define HD_FF_VERSION      0x00010000
+
+/* Known creator OS type fields in hd_ftr.crtr_os */
+#define HD_CR_OS_WINDOWS   0x5769326B /* (Wi2k) */
+#define HD_CR_OS_MACINTOSH 0x4D616320 /* (Mac ) */
+
+/*
+ * version 0.1:  little endian bitmaps
+ * version 1.1:  big endian bitmaps; batmap
+ * version 1.2:  libvhd
+ * version 1.3:  batmap version bump to 1.2
+ */
+/* Packs @major into the upper 16 bits and @minor into the lower 16 bits. */
+#define VHD_VERSION(major, minor)  (((major) << 16) | ((minor) & 0x0000FFFF))
+#define VHD_CURRENT_VERSION        VHD_VERSION(1, 3)
+
+/* Disk geometry accessor macros. */
+/* Geometry is a triple of (cylinders (2 bytes), heads (1 byte), and
+ * sectors-per-track (1 byte)), packed into a single 32-bit word as
+ * (cylinders << 16) | (heads << 8) | sectors-per-track.
+ */
+#define GEOM_GET_CYLS(_g)  (((_g) >> 16) & 0xffff)
+#define GEOM_GET_HEADS(_g) (((_g) >> 8)  & 0xff)
+#define GEOM_GET_SPT(_g)   ((_g) & 0xff)
+
+/*
+ * Each operand is converted to uint32_t before it is shifted.  With plain
+ * int arguments a cylinder count of 0x8000 or more would otherwise be
+ * shifted into the sign bit, which is undefined behaviour.
+ */
+#define GEOM_ENCODE(_c, _h, _s) \
+       (((uint32_t)(_c) << 16) | ((uint32_t)(_h) << 8) | (uint32_t)(_s))
+
+/* type field in hd_ftr */
+#define HD_TYPE_NONE       0
+#define HD_TYPE_FIXED      2  /* fixed-allocation disk */
+#define HD_TYPE_DYNAMIC    3  /* dynamic disk */
+#define HD_TYPE_DIFF       4  /* differencing disk */
+
+/* String table for hd.type */
+/* Indexed by the HD_TYPE_* value; valid indices are 0..HD_TYPE_MAX. */
+static const char *HD_TYPE_STR[7] = {
+        "None",                    /* 0 */
+        "Reserved (deprecated)",   /* 1 */
+        "Fixed hard disk",         /* 2 */
+        "Dynamic hard disk",       /* 3 */
+        "Differencing hard disk",  /* 4 */
+        "Reserved (deprecated)",   /* 5 */
+        "Reserved (deprecated)"    /* 6 */
+};
+
+/* Highest index that may be used with HD_TYPE_STR. */
+#define HD_TYPE_MAX 6
+
+/* Parent locator entry; struct dd_hdr carries an array of eight (loc[8]). */
+struct prt_loc {
+  u32    code;            /* Platform code -- see defines below.          */
+  u32    data_space;      /* Number of 512-byte sectors to store locator  */
+  u32    data_len;        /* Actual length of parent locator in bytes     */
+  u32    res;             /* Must be zero                                 */
+  u64    data_offset;     /* Absolute offset of locator data (bytes)      */
+};
+
+/* Platform Codes */
+#define PLAT_CODE_NONE  0x0
+#define PLAT_CODE_WI2R  0x57693272  /* deprecated                         */
+#define PLAT_CODE_WI2K  0x5769326B  /* deprecated                         */
+#define PLAT_CODE_W2RU  0x57327275  /* Windows relative path (UTF-16)     */
+#define PLAT_CODE_W2KU  0x57326B75  /* Windows absolute path (UTF-16)     */
+#define PLAT_CODE_MAC   0x4D616320  /* MacOS alias stored as a blob.      */
+#define PLAT_CODE_MACX  0x4D616358  /* File URL (UTF-8), see RFC 2396.    */
+
+/* ---------------------------------------------------------------------- */
+/* This is the dynamic disk header.                                       */
+/* ---------------------------------------------------------------------- */
+
+/*
+ * NOTE: prt_uuid has type vhd_uuid_t, which this header neither declares
+ * nor includes; vhd-uuid.h has to be included before vhd.h.
+ */
+struct dd_hdr {
+  char   cookie[8];       /* Should contain "cxsparse"                    */
+  u64    data_offset;     /* Byte offset of next record. (Unused) 0xffs   */
+  u64    table_offset;    /* Absolute offset to the BAT.                  */
+  u32    hdr_ver;         /* Version of the dd_hdr (major,minor)          */
+  u32    max_bat_size;    /* Maximum number of entries in the BAT         */
+  u32    block_size;      /* Block size in bytes. Must be power of 2.     */
+  u32    checksum;        /* Header checksum.  1's comp of all fields.    */
+  vhd_uuid_t prt_uuid;    /* ID of the parent disk.                       */
+  u32    prt_ts;          /* Modification time of the parent disk         */
+  u32    res1;            /* Reserved.                                    */
+  char   prt_name[512];   /* Parent unicode name.                         */
+  struct prt_loc loc[8];  /* Parent locator entries.                      */
+  char   res2[256];       /* Reserved.                                    */
+};
+
+/* Dynamic disk header (dd_hdr) cookie string. */
+static const char DD_COOKIE[9]  =  "cxsparse";
+
+/* Version field in dd_hdr (hdr_ver) */
+#define DD_VERSION 0x00010000
+
+/* Default blocksize is 2 meg. */
+#define DD_BLOCKSIZE_DEFAULT 0x00200000
+
+#define DD_BLK_UNUSED 0xFFFFFFFF
+
+/*
+ * Header describing the batmap; its cookie is expected to match
+ * VHD_BATMAP_COOKIE ("tdbatmap").
+ */
+struct dd_batmap_hdr {
+  char   cookie[8];       /* should contain "tdbatmap"                    */
+  u64    batmap_offset;   /* byte offset to batmap                        */
+  u32    batmap_size;     /* batmap size in sectors                       */
+  u32    batmap_version;  /* version of batmap                            */
+  u32    checksum;        /* batmap checksum -- 1's complement of batmap  */
+};
+
+static const char VHD_BATMAP_COOKIE[9] = "tdbatmap";
+
+/*
+ * version 1.1: signed char checksum
+ */
+#define VHD_BATMAP_VERSION(major, minor)  (((major) << 16) | ((minor) & 0x0000FFFF))
+#define VHD_BATMAP_CURRENT_VERSION        VHD_BATMAP_VERSION(1, 2)
+
+/* Layout of a dynamic disk:
+ *
+ * +-------------------------------------------------+
+ * | Mirror image of HD footer (hd_ftr) (512 bytes)  |
+ * +-------------------------------------------------+
+ * | Sparse drive header (dd_hdr) (1024 bytes)       |
+ * +-------------------------------------------------+
+ * | BAT (Block allocation table)                    |
+ * |   - Array of absolute sector offsets into the   |
+ * |     file (u32).                                 |
+ * |   - Rounded up to a sector boundary.            |
+ * |   - Unused entries are marked as 0xFFFFFFFF     |
+ * |   - max entries in dd_hdr->max_bat_size         |
+ * +-------------------------------------------------+
+ * | Data Block 0                                    |
+ * | Bitmap (padded to 512 byte sector boundary)     |
+ * |   - each bit indicates whether the associated   |
+ * |     sector within this block is used.           |
+ * | Data                                            |
+ * |   - power-of-two multiple of sectors.           |
+ * |   - default 2MB (4096 * 512)                    |
+ * |   - Any entries with zero in bitmap should be   |
+ * |     zero on disk                                |
+ * +-------------------------------------------------+
+ * | Data Block 1                                    |
+ * +-------------------------------------------------+
+ * | ...                                             |
+ * +-------------------------------------------------+
+ * | Data Block n                                    |
+ * +-------------------------------------------------+
+ * | HD Footer (511 bytes)                           |
+ * +-------------------------------------------------+
+ */
+
+#endif
diff --git a/tools/blktap2/lvm/Makefile b/tools/blktap2/lvm/Makefile
new file mode 100644 (file)
index 0000000..7d5f8ea
--- /dev/null
@@ -0,0 +1,36 @@
+XEN_ROOT = $(CURDIR)/../../..
+BLKTAP_ROOT := ..
+include $(XEN_ROOT)/tools/Rules.mk
+
+ifeq ($(LVM_UTIL_TEST),y)
+TEST              := lvm-util
+endif
+
+CFLAGS            += -Werror
+CFLAGS            += -Wno-unused
+CFLAGS            += -I../include
+CFLAGS            += -D_GNU_SOURCE
+
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS            += -fPIC
+endif
+
+LVM-OBJS          := lvm-util.o
+
+all: build
+
+build: $(TEST) $(LVM-OBJS)
+
+install: all
+
+# Test binary, only part of the build when LVM_UTIL_TEST=y.  It is compiled
+# directly from lvm-util.c with -DLVM_UTIL, so $(CFLAGS) must be passed
+# explicitly: it carries -I../include (needed to find lvm-util.h) and
+# -D_GNU_SOURCE.
+lvm-util: lvm-util.o
+       $(CC) $(CFLAGS) -DLVM_UTIL $(LDFLAGS) -o lvm-util lvm-util.c
+
+clean:
+       rm -rf *.o *.opic *~ $(DEPS) $(IBIN)
+
+distclean: clean
+
+.PHONY: all build clean distclean install lvm-util
+
+-include $(DEPS)
diff --git a/tools/blktap2/lvm/lvm-util.c b/tools/blktap2/lvm/lvm-util.c
new file mode 100644 (file)
index 0000000..b456e04
--- /dev/null
@@ -0,0 +1,349 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lvm-util.h"
+
+#define _NAME "%255s"
+static char line[1024];
+
+/*
+ * Clear the shared line buffer, then read the next run of non-newline
+ * characters (at most 1023) from scan into it.
+ *
+ * Returns 0 when text was read and 1 otherwise (end of input, read error,
+ * or an empty line, for which the scanset matches nothing).
+ */
+static inline int
+lvm_read_line(FILE *scan)
+{
+       int matched;
+
+       memset(line, 0, sizeof(line));
+       matched = fscanf(scan, "%1023[^\n]", line);
+
+       return matched != 1;
+}
+
+/*
+ * Consume the newline character(s) at the current position of scan. The
+ * consumed characters are stored into (and therefore overwrite) the shared
+ * line buffer.
+ *
+ * Returns 0 if at least one newline was consumed and 1 otherwise.
+ */
+static inline int
+lvm_next_line(FILE *scan)
+{
+       int matched = fscanf(scan, "%1023[\n]", line);
+
+       return matched != 1;
+}
+
+/*
+ * Copy the NUL-terminated string src into dst, but only if its terminator
+ * lies within the first size bytes of src.
+ *
+ * Returns 0 on success, or -ENAMETOOLONG (leaving dst untouched) when no
+ * terminator is found within size bytes.
+ */
+static int
+lvm_copy_name(char *dst, const char *src, size_t size)
+{
+       const char *nul = memchr(src, '\0', size);
+
+       if (nul == NULL)
+               return -ENAMETOOLONG;
+
+       /* copy the characters plus the terminating NUL */
+       memcpy(dst, src, (size_t)(nul - src) + 1);
+       return 0;
+}
+
+/*
+ * Record one physical volume in vg->pvs.
+ *
+ * The table is allocated (zeroed, pvs entries) on first use. The first slot
+ * with an empty name receives the PV's name and start offset.
+ *
+ * Returns 0 on success, -EEXIST if a PV with this name is already recorded,
+ * -ENOENT if no slot was examined (pvs <= 0), -ENOMEM if allocation failed
+ * or every slot is taken, or the error from lvm_copy_name().
+ */
+static int
+lvm_parse_pv(struct vg *vg, const char *name, int pvs, uint64_t start)
+{
+       int idx, rc;
+       struct pv *slot = NULL;
+
+       if (vg->pvs == NULL) {
+               vg->pvs = calloc(pvs, sizeof(struct pv));
+               if (vg->pvs == NULL)
+                       return -ENOMEM;
+       }
+
+       /* stop at the first unused slot; reject duplicate names on the way */
+       for (idx = 0; idx < pvs; idx++) {
+               slot = &vg->pvs[idx];
+
+               if (slot->name[0] == '\0')
+                       break;
+
+               if (strcmp(slot->name, name) == 0)
+                       return -EEXIST;
+       }
+
+       if (slot == NULL)
+               return -ENOENT;
+
+       if (idx == pvs)
+               return -ENOMEM;
+
+       rc = lvm_copy_name(slot->name, name, sizeof(slot->name) - 1);
+       if (rc != 0)
+               return rc;
+
+       slot->start = start;
+       return 0;
+}
+
+/*
+ * Run "vgs" for the volume group vgname and populate vg from its output:
+ * name, extent size, LV and PV counts, the PV table (one output row per PV,
+ * giving its name and pe_start) and a zeroed LV table with lv_count slots.
+ *
+ * Returns 0 on success or a negative errno value. On any failure after the
+ * command string was built, vg is released with lvm_free_vg().
+ *
+ * NOTE(review): vgname is interpolated unescaped into a command line that
+ * popen() hands to the shell - callers must not pass untrusted names.
+ */
+static int
+lvm_open_vg(const char *vgname, struct vg *vg)
+{
+       FILE *scan;
+       int i, err;
+       /* start at zero so the checks after the loop are well defined even
+        * when the command produced no rows at all */
+       int pvs = 0, lvs = 0;
+       char *cmd, pvname[256];
+       uint64_t size = 0, pv_start;
+
+       memset(vg, 0, sizeof(*vg));
+
+       err = asprintf(&cmd, "/usr/sbin/vgs %s --noheadings --nosuffix --units=b "
+                      "--options=vg_name,vg_extent_size,lv_count,pv_count,"
+                      "pv_name,pe_start --unbuffered 2> /dev/null", vgname);
+       if (err == -1)
+               return -ENOMEM;
+
+       errno = 0;
+       scan  = popen(cmd, "r");
+       if (!scan) {
+               /* every error path returns a negative errno, including the
+                * fallback used when popen() left errno unset */
+               err = (errno ? -errno : -ENOMEM);
+               goto out;
+       }
+
+       for (;;) {
+               if (lvm_read_line(scan))
+                       break;
+
+               err = -EINVAL;
+               if (sscanf(line, _NAME" %"SCNu64" %d %d "_NAME" %"SCNu64,
+                          vg->name, &size, &lvs, &pvs, pvname, &pv_start) != 6)
+                       goto out;
+
+               /* every row must describe the requested volume group */
+               if (strcmp(vg->name, vgname))
+                       goto out;
+
+               err = lvm_parse_pv(vg, pvname, pvs, pv_start);
+               if (err)
+                       goto out;
+
+               if (lvm_next_line(scan))
+                       break;
+       }
+
+       err = -EINVAL;
+       if (strcmp(vg->name, vgname))
+               goto out;
+
+       /* all pv_count slots must have been filled in by the rows above */
+       for (i = 0; i < pvs; i++)
+               if (!vg->pvs[i].name[0])
+                       goto out;
+
+       err = -ENOMEM;
+       vg->lvs = calloc(lvs, sizeof(struct lv));
+       if (!vg->lvs)
+               goto out;
+
+       err             = 0;
+       vg->lv_cnt      = lvs;
+       vg->pv_cnt      = pvs;
+       vg->extent_size = size;
+
+out:
+       if (scan)
+               pclose(scan);
+       if (err)
+               lvm_free_vg(vg);
+       free(cmd);
+       return err;
+}
+
+/*
+ * Parse the "devices" column of one lvs row (presumably of the form
+ * "<pv-name>(<start-extent>)" - confirm against lvs output) and fill in
+ * seg->device and seg->pe_start.
+ *
+ * devices is modified in place: every ',', '(' and ')' becomes a space so
+ * that the string can be read as "<name> <number>".
+ *
+ * Returns 0 on success, or -EINVAL if the string cannot be parsed or names
+ * a device that is not in vg->pvs.
+ */
+static int
+lvm_parse_lv_devices(struct vg *vg, struct lv_segment *seg, char *devices)
+{
+       int i;
+       char *p;
+       uint64_t start, pe_start;
+
+       /* one pass over the string: avoids calling strlen() per iteration
+        * and comparing a signed index against its size_t result */
+       for (p = devices; *p; p++)
+               if (strchr(",()", *p))
+                       *p = ' ';
+
+       if (sscanf(devices, _NAME" %"SCNu64, seg->device, &start) != 2)
+               return -EINVAL;
+
+       /* all-ones marks "no matching PV found" */
+       pe_start = (uint64_t)-1;
+       for (i = 0; i < vg->pv_cnt; i++)
+               if (!strcmp(vg->pvs[i].name, seg->device)) {
+                       pe_start = vg->pvs[i].start;
+                       break;
+               }
+
+       if (pe_start == (uint64_t)-1)
+               return -EINVAL;
+
+       /* start is counted in extents; add the PV's own pe_start offset */
+       seg->pe_start = (start * vg->extent_size) + pe_start;
+       return 0;
+}
+
+/*
+ * Run "lvs" for vg->name and fill vg->lvs with one entry per logical
+ * volume: name, size, segment count and a description of its first segment.
+ *
+ * Rows whose seg_start is non-zero (i.e. not the first segment of an LV)
+ * are skipped. If the output ends before vg->lv_cnt entries were filled,
+ * vg->lv_cnt is lowered to the number actually read.
+ *
+ * Returns 0 on success or a negative errno value. vg is not freed here.
+ */
+static int
+lvm_scan_lvs(struct vg *vg)
+{
+       char *cmd;
+       FILE *scan;
+       int i, err;
+
+       err = asprintf(&cmd, "/usr/sbin/lvs %s --noheadings --nosuffix --units=b "
+                      "--options=lv_name,lv_size,segtype,seg_count,seg_start,"
+                      "seg_size,devices --unbuffered 2> /dev/null", vg->name);
+       if (err == -1)
+               return -ENOMEM;
+
+       errno = 0;
+       scan  = popen(cmd, "r");
+       if (!scan) {
+               err = (errno ? -errno : -ENOMEM);
+               goto out;
+       }
+
+       for (i = 0;;) {
+               /* unsigned so that it matches the %u conversion below */
+               unsigned int segs;
+               struct lv *lv;
+               struct lv_segment seg;
+               uint64_t size, seg_start;
+               char type[32], name[256], devices[1024];
+
+               if (i >= vg->lv_cnt)
+                       break;
+
+               if (lvm_read_line(scan)) {
+                       /* fewer LVs than announced: shrink the count */
+                       vg->lv_cnt = i;
+                       break;
+               }
+
+               err = -EINVAL;
+               lv  = vg->lvs + i;
+
+               if (sscanf(line, _NAME" %"SCNu64" %31s %u %"SCNu64" %"SCNu64" %1023s",
+                          name, &size, type, &segs, &seg_start,
+                          &seg.pe_size, devices) != 7)
+                       goto out;
+
+               /* only the first segment of each LV is recorded */
+               if (seg_start)
+                       goto next;
+
+               if (!strcmp(type, "linear"))
+                       seg.type = LVM_SEG_TYPE_LINEAR;
+               else
+                       seg.type = LVM_SEG_TYPE_UNKNOWN;
+
+               if (lvm_parse_lv_devices(vg, &seg, devices))
+                       goto out;
+
+               i++;
+               lv->size          = size;
+               lv->segments      = segs;
+               lv->first_segment = seg;
+
+               err = lvm_copy_name(lv->name, name, sizeof(lv->name) - 1);
+               if (err)
+                       goto out;
+               err = -EINVAL;
+
+       next:
+               if (lvm_next_line(scan))
+                       goto out;
+       }
+
+       err = 0;
+
+out:
+       if (scan)
+               pclose(scan);
+       free(cmd);
+       return err;
+}
+
+/*
+ * Release the PV and LV tables owned by vg and reset the whole structure
+ * to zero, so it can be reused or freed again safely.
+ */
+void
+lvm_free_vg(struct vg *vg)
+{
+       free(vg->pvs);
+       free(vg->lvs);
+
+       memset(vg, 0, sizeof(struct vg));
+}
+
+/*
+ * Scan the volume group vg_name: first its PVs and counts (lvm_open_vg),
+ * then its logical volumes (lvm_scan_lvs).
+ *
+ * Returns 0 on success or the error from the failing step. On failure vg
+ * has been released; on success the caller releases it with lvm_free_vg().
+ */
+int
+lvm_scan_vg(const char *vg_name, struct vg *vg)
+{
+       int rc;
+
+       memset(vg, 0, sizeof(struct vg));
+
+       rc = lvm_open_vg(vg_name, vg);
+       if (rc != 0)
+               return rc;
+
+       rc = lvm_scan_lvs(vg);
+       if (rc != 0)
+               lvm_free_vg(vg);
+
+       return rc;
+}
+
+#ifdef LVM_UTIL
+/*
+ * Print the command-line synopsis to stdout and terminate the process with
+ * exit status EINVAL. Does not return.
+ */
+static int
+usage(void)
+{
+       fputs("usage: lvm-util <vgname>\n", stdout);
+       exit(EINVAL);
+}
+
+/*
+ * Test driver (built only with -DLVM_UTIL): scans the volume group named
+ * by the single command-line argument and prints its PVs and LVs.
+ *
+ * Exits with 0 on success, or with the absolute value of the scan error.
+ */
+int
+main(int argc, char **argv)
+{
+       struct vg vg;
+       int idx, rc;
+
+       if (argc != 2)
+               usage();
+
+       rc = lvm_scan_vg(argv[1], &vg);
+       if (rc) {
+               printf("scan failed: %d\n", rc);
+               return (rc < 0 ? -rc : rc);
+       }
+
+       printf("vg %s: extent_size: %"PRIu64", pvs: %d, lvs: %d\n",
+              vg.name, vg.extent_size, vg.pv_cnt, vg.lv_cnt);
+
+       for (idx = 0; idx < vg.pv_cnt; idx++) {
+               struct pv *pv = &vg.pvs[idx];
+
+               printf("pv %s: start %"PRIu64"\n", pv->name, pv->start);
+       }
+
+       for (idx = 0; idx < vg.lv_cnt; idx++) {
+               struct lv *lv          = &vg.lvs[idx];
+               struct lv_segment *seg = &lv->first_segment;
+
+               printf("lv %s: size: %"PRIu64", segments: %u, type: %u, "
+                      "dev: %s, pe_start: %"PRIu64", pe_size: %"PRIu64"\n",
+                      lv->name, lv->size, lv->segments, seg->type,
+                      seg->device, seg->pe_start, seg->pe_size);
+       }
+
+       lvm_free_vg(&vg);
+       return 0;
+}
+#endif
diff --git a/tools/blktap2/vhd/Makefile b/tools/blktap2/vhd/Makefile
new file mode 100644 (file)
index 0000000..fabd665
--- /dev/null
@@ -0,0 +1,51 @@
+# Build rules for the vhd-util and vhd-update command-line tools. Both link
+# against libvhd, which is built in the lib/ subdirectory.
+XEN_ROOT=$(CURDIR)/../../..
+BLKTAP_ROOT := ..
+include $(XEN_ROOT)/tools/Rules.mk
+
+SUBDIRS-y         :=
+SUBDIRS-y         += lib
+
+# Programs built here and installed into $(INST_DIR).
+IBIN               = vhd-util vhd-update
+INST_DIR           = $(sbindir)
+
+CFLAGS            += -Werror
+CFLAGS            += -Wno-unused
+CFLAGS            += -I../include
+CFLAGS            += -D_GNU_SOURCE
+
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS            += -fPIC
+endif
+
+# Optional fully static link, enabled with VHD_STATIC=y.
+ifeq ($(VHD_STATIC),y)
+CFLAGS            += -static
+endif
+
+LIBS              := -Llib -lvhd
+
+all: subdirs-all build
+
+build: $(IBIN)
+
+# The library files are produced by the lib/ sub-make, so they depend on
+# subdirs-all and have no recipe of their own.
+# NOTE(review): lib/Makefile builds libvhd.a, not vhd.a - confirm whether
+# lib/vhd.a is a typo for lib/libvhd.a.
+LIBS_DEPENDS     := lib/libvhd.so lib/vhd.a
+$(LIBS_DEPENDS):subdirs-all
+
+vhd-util: vhd-util.o $(LIBS_DEPENDS)
+       $(CC) $(LDFLAGS) -o vhd-util vhd-util.o $(LIBS) $(APPEND_LDFLAGS)
+
+vhd-update: vhd-update.o $(LIBS_DEPENDS)
+       $(CC) $(LDFLAGS) -o vhd-update vhd-update.o $(LIBS) $(APPEND_LDFLAGS)
+
+# Install the library first (via the sub-make), then the two programs.
+install: all
+       $(MAKE) subdirs-install
+       $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+       $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INST_DIR)
+
+clean: subdirs-clean
+       rm -rf *.o *~ $(DEPS) $(IBIN)
+
+distclean: clean
+
+.PHONY: all build clean distclean install vhd-util vhd-update
+
+-include $(DEPS)
diff --git a/tools/blktap2/vhd/lib/Makefile b/tools/blktap2/vhd/lib/Makefile
new file mode 100644 (file)
index 0000000..ab2d648
--- /dev/null
@@ -0,0 +1,82 @@
+# Build rules for libvhd: a static archive (libvhd.a) and a shared library
+# (libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR)) built from the same sources
+# plus the lvm-util object from the sibling lvm/ directory.
+XEN_ROOT=$(CURDIR)/../../../..
+BLKTAP_ROOT := ../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Shared-library versioning: the soname is libvhd.so.$(LIBVHD-MAJOR), the
+# real file name additionally carries $(LIBVHD-MINOR).
+LIBVHD-MAJOR     = 1.0
+LIBVHD-MINOR     = 0
+LIBVHD-SONAME    = libvhd.so.$(LIBVHD-MAJOR)
+
+LVM-UTIL-OBJ    := $(BLKTAP_ROOT)/lvm/lvm-util.o
+
+# NOTE(review): LIBVHD-BUILD is assigned but not referenced in this file.
+LIBVHD-BUILD    := libvhd.a
+
+INST-DIR         = $(libdir)
+
+CFLAGS          += -Werror
+CFLAGS          += -Wno-unused
+CFLAGS          += -I../../include
+CFLAGS          += -D_GNU_SOURCE
+CFLAGS          += -fPIC
+
+ifeq ($(CONFIG_Linux),y)
+LIBS            := -luuid
+endif
+
+ifeq ($(CONFIG_LIBICONV),y)
+LIBS            += -liconv
+endif
+
+LIB-SRCS        := libvhd.c
+LIB-SRCS        += libvhd-journal.c
+LIB-SRCS        += vhd-util-coalesce.c
+LIB-SRCS        += vhd-util-create.c
+LIB-SRCS        += vhd-util-fill.c
+LIB-SRCS        += vhd-util-modify.c
+LIB-SRCS        += vhd-util-query.c
+LIB-SRCS        += vhd-util-read.c
+LIB-SRCS        += vhd-util-repair.c
+LIB-SRCS        += vhd-util-resize.c
+LIB-SRCS        += vhd-util-revert.c
+LIB-SRCS        += vhd-util-set-field.c
+LIB-SRCS        += vhd-util-snapshot.c
+LIB-SRCS        += vhd-util-scan.c
+LIB-SRCS        += vhd-util-check.c
+LIB-SRCS        += vhd-util-uuid.c
+LIB-SRCS        += relative-path.c
+LIB-SRCS        += atomicio.c
+
+# One .o per source file, plus the lvm-util object built in ../../lvm.
+LIB-OBJS         = $(patsubst %.c,%.o,$(LIB-SRCS))
+LIB-OBJS        += $(LVM-UTIL-OBJ)
+
+# The shared library is linked from the matching .opic objects; no rule for
+# them is defined here (presumably provided by the included Rules.mk).
+LIB-PICOBJS      = $(patsubst %.o,%.opic,$(LIB-OBJS))
+
+LIBVHD           = libvhd.a libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR)
+
+all: build
+
+build: libvhd.a libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR)
+
+libvhd.a: $(LIB-OBJS)
+       $(AR) rc $@ $^
+
+# Link the shared library and create the soname and development symlinks
+# next to it in the build directory.
+libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR): $(LIB-PICOBJS)
+       $(CC) -Wl,$(SONAME_LDFLAG),$(LIBVHD-SONAME) $(SHLIB_LDFLAGS) \
+               $(LDFLAGS) -o libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $^ $(LIBS)
+       ln -sf libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) libvhd.so.$(LIBVHD-MAJOR)
+       ln -sf libvhd.so.$(LIBVHD-MAJOR) libvhd.so
+
+# Install both libraries and recreate the same symlink chain in $(INST-DIR).
+install: all
+       $(INSTALL_DIR) -p $(DESTDIR)$(INST-DIR)
+       $(INSTALL_DATA) libvhd.a $(DESTDIR)$(INST-DIR)
+       $(INSTALL_PROG) libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $(DESTDIR)$(INST-DIR)
+       ln -sf libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $(DESTDIR)$(INST-DIR)/libvhd.so.$(LIBVHD-MAJOR)
+       ln -sf libvhd.so.$(LIBVHD-MAJOR) $(DESTDIR)$(INST-DIR)/libvhd.so
+
+clean:
+       rm -rf *.a *.so* *.o *.opic *~ $(DEPS) $(LIBVHD)
+
+distclean: clean
+
+# NOTE(review): "libvhd" is listed as phony but no such target is defined in
+# this file.
+.PHONY: all build clean distclean install libvhd
+
+-include $(DEPS)
diff --git a/tools/blktap2/vhd/lib/atomicio.c b/tools/blktap2/vhd/lib/atomicio.c
new file mode 100644 (file)
index 0000000..ae0e24b
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved.
+ * Copyright (c) 1995,1999 Theo de Raadt.  All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include "atomicio.h"
+
+/*
+ * Ensure all of the data on a descriptor comes through: call f repeatedly
+ * until exactly n bytes of _s have been transferred.
+ *
+ * f is read, or write cast to the read signature (f==read || f==vwrite).
+ * Calls that fail with EINTR or EAGAIN are retried.
+ *
+ * Returns n on success. Returns 0 if f fails with any other error (errno is
+ * left as set by f). If f reports end of file by returning 0, errno is set
+ * to EPIPE and the number of bytes transferred so far is returned.
+ */
+size_t
+atomicio(ssize_t (*f) (int, void *, size_t), int fd, void *_s, size_t n)
+{
+       char *s = _s;
+       size_t pos = 0;
+       ssize_t res;
+
+       while (n > pos) {
+               res = (f) (fd, s + pos, n - pos);
+               switch (res) {
+               case -1:
+                       /* "continue" restarts the while loop, i.e. retries */
+                       if (errno == EINTR || errno == EAGAIN)
+                               continue;
+                       return 0;
+               case 0:
+                       errno = EPIPE;
+                       return pos;
+               default:
+                       pos += (size_t)res;
+                       break;
+               }
+       }
+       return (pos);
+}
+
diff --git a/tools/blktap2/vhd/lib/libvhd-journal.c b/tools/blktap2/vhd/lib/libvhd-journal.c
new file mode 100644 (file)
index 0000000..26e26e7
--- /dev/null
@@ -0,0 +1,1534 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "atomicio.h"
+#include "libvhd-journal.h"
+
+#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_P  1
+#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_C  2
+#define VHD_JOURNAL_ENTRY_TYPE_HEADER    3
+#define VHD_JOURNAL_ENTRY_TYPE_LOCATOR   4
+#define VHD_JOURNAL_ENTRY_TYPE_BAT       5
+#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_H  6
+#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_M  7
+#define VHD_JOURNAL_ENTRY_TYPE_DATA      8
+
+/*
+ * Header stored in the journal file in front of every journalled payload.
+ * The byte-order helpers in this file (vhd_journal_entry_in/_out) convert
+ * every field with the BE32/BE64 macros when an entry is read or written.
+ */
+typedef struct vhd_journal_entry {
+       /* must equal VHD_JOURNAL_ENTRY_COOKIE for the entry to be valid */
+       uint64_t                         cookie;
+       /* one of the VHD_JOURNAL_ENTRY_TYPE_* values */
+       uint32_t                         type;
+       /* payload length in bytes; non-zero multiple of VHD_SECTOR_SIZE */
+       uint32_t                         size;
+       /* offset in the VHD file that the payload was read from */
+       uint64_t                         offset;
+       /* inverted byte sum of this header (checksum zeroed) and payload */
+       uint32_t                         checksum;
+} vhd_journal_entry_t;
+
+/*
+ * Reposition the journal file descriptor with lseek().
+ * Returns 0 on success or -errno on failure.
+ */
+static inline int
+vhd_journal_seek(vhd_journal_t *j, off_t offset, int whence)
+{
+       if (lseek(j->jfd, offset, whence) == (off_t)-1)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Report the current offset of the journal file descriptor.
+ * Returns (off_t)-1 with errno set if lseek() fails.
+ */
+static inline off_t
+vhd_journal_position(vhd_journal_t *j)
+{
+       off_t cur = lseek(j->jfd, 0, SEEK_CUR);
+
+       return cur;
+}
+
+/*
+ * Read exactly size bytes from the journal file into buf.
+ * Returns 0 on success; on a short read returns -errno, or -EIO if errno
+ * was left at zero.
+ */
+static inline int
+vhd_journal_read(vhd_journal_t *j, void *buf, size_t size)
+{
+       ssize_t done;
+
+       /* cleared first so a stale errno is not reported on a short read */
+       errno = 0;
+       done  = atomicio(read, j->jfd, buf, size);
+
+       if (done == size)
+               return 0;
+
+       return (errno ? -errno : -EIO);
+}
+
+/*
+ * Write exactly size bytes from buf to the journal file.
+ * Returns 0 on success; on a short write returns -errno, or -EIO if errno
+ * was left at zero.
+ */
+static inline int
+vhd_journal_write(vhd_journal_t *j, void *buf, size_t size)
+{
+       ssize_t done;
+
+       /* cleared first so a stale errno is not reported on a short write */
+       errno = 0;
+       done  = atomicio(vwrite, j->jfd, buf, size);
+
+       if (done == size)
+               return 0;
+
+       return (errno ? -errno : -EIO);
+}
+
+/*
+ * Truncate the journal file to length bytes.
+ * Returns 0 on success or -errno on failure.
+ */
+static inline int
+vhd_journal_truncate(vhd_journal_t *j, off_t length)
+{
+       if (ftruncate(j->jfd, length) == -1)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Flush the journal file's data to stable storage with fdatasync().
+ * Returns 0 on success or -errno on failure.
+ */
+static inline int
+vhd_journal_sync(vhd_journal_t *j)
+{
+       if (fdatasync(j->jfd) != 0)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Convert the multi-byte fields of a journal header that was just read
+ * from disk, in place, using the BE32_IN/BE64_IN macros.
+ *
+ * NOTE(review): journal_eof is not converted here although other code in
+ * this file reads and writes it - confirm it is meant to stay as stored.
+ */
+static inline void
+vhd_journal_header_in(vhd_journal_header_t *header)
+{
+       /* 32-bit entry counters */
+       BE32_IN(&header->journal_data_entries);
+       BE32_IN(&header->journal_metadata_entries);
+
+       /* 64-bit file offsets */
+       BE64_IN(&header->vhd_footer_offset);
+       BE64_IN(&header->journal_data_offset);
+       BE64_IN(&header->journal_metadata_offset);
+}
+
+/*
+ * Convert the multi-byte fields of a journal header that is about to be
+ * written to disk, in place, using the BE32_OUT/BE64_OUT macros.
+ *
+ * NOTE(review): journal_eof is not converted here although other code in
+ * this file reads and writes it - confirm it is meant to stay as stored.
+ */
+static inline void
+vhd_journal_header_out(vhd_journal_header_t *header)
+{
+       /* 32-bit entry counters */
+       BE32_OUT(&header->journal_data_entries);
+       BE32_OUT(&header->journal_metadata_entries);
+
+       /* 64-bit file offsets */
+       BE64_OUT(&header->vhd_footer_offset);
+       BE64_OUT(&header->journal_data_offset);
+       BE64_OUT(&header->journal_metadata_offset);
+}
+
+/*
+ * Sanity-check a journal header.
+ *
+ * Verifies the cookie of header, checks that the journal file can be
+ * positioned at j->header.journal_eof, and that neither the data nor the
+ * metadata offset lies beyond journal_eof. As a side effect the journal
+ * file offset is left at journal_eof.
+ *
+ * NOTE(review): only the cookie is taken from the header argument; the
+ * offsets are read from j->header - confirm callers always pass a header
+ * whose offsets match j->header.
+ *
+ * Returns 0 if valid, -EINVAL on a bad cookie or offset, or a negative
+ * errno if seeking fails.
+ */
+static int
+vhd_journal_validate_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+       int rc;
+
+       if (memcmp(header->cookie, VHD_JOURNAL_HEADER_COOKIE,
+                  sizeof(header->cookie)) != 0)
+               return -EINVAL;
+
+       rc = vhd_journal_seek(j, j->header.journal_eof, SEEK_SET);
+       if (rc)
+               return rc;
+
+       if (vhd_journal_position(j) == (off_t)-1)
+               return -errno;
+
+       if (j->header.journal_data_offset > j->header.journal_eof ||
+           j->header.journal_metadata_offset > j->header.journal_eof)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Read the journal header from the start of the journal file into header,
+ * convert it with vhd_journal_header_in() and validate it.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_read_journal_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+       int rc;
+
+       rc = vhd_journal_seek(j, 0, SEEK_SET);
+       if (rc)
+               return rc;
+
+       rc = vhd_journal_read(j, header, sizeof(vhd_journal_header_t));
+       if (rc)
+               return rc;
+
+       vhd_journal_header_in(header);
+
+       return vhd_journal_validate_header(j, header);
+}
+
+/*
+ * Validate header and write it to the start of the journal file.
+ *
+ * The conversion with vhd_journal_header_out() is done on a private copy,
+ * so the caller's header is left unmodified.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_write_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+       int rc;
+       vhd_journal_header_t disk;
+
+       memcpy(&disk, header, sizeof(disk));
+
+       rc = vhd_journal_validate_header(j, &disk);
+       if (rc)
+               return rc;
+
+       vhd_journal_header_out(&disk);
+
+       rc = vhd_journal_seek(j, 0, SEEK_SET);
+       if (rc)
+               return rc;
+
+       return vhd_journal_write(j, &disk, sizeof(disk));
+}
+
+/*
+ * Initialise j->header for a new journal and write it out.
+ *
+ * The header is zeroed, then given the VHD's uuid, the journal cookie, the
+ * offset of the VHD footer (end of the VHD file minus the footer size) and
+ * a journal_eof that points just past the journal header itself.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_journal_header(vhd_journal_t *j)
+{
+       int rc;
+       off_t end;
+       vhd_context_t *ctx = &j->vhd;
+
+       memset(&j->header, 0, sizeof(vhd_journal_header_t));
+
+       rc = vhd_seek(ctx, 0, SEEK_END);
+       if (rc)
+               return rc;
+
+       end = vhd_position(ctx);
+       if (end == (off_t)-1)
+               return -errno;
+
+       rc = vhd_get_footer(ctx);
+       if (rc)
+               return rc;
+
+       vhd_uuid_copy(&j->header.uuid, &ctx->footer.uuid);
+       memcpy(j->header.cookie,
+              VHD_JOURNAL_HEADER_COOKIE, sizeof(j->header.cookie));
+
+       j->header.vhd_footer_offset = end - sizeof(vhd_footer_t);
+       j->header.journal_eof       = sizeof(vhd_journal_header_t);
+
+       return vhd_journal_write_header(j, &j->header);
+}
+
+/*
+ * Convert every field of a journal entry header that was just read from
+ * disk, in place, using the BE32_IN/BE64_IN macros.
+ */
+static void
+vhd_journal_entry_in(vhd_journal_entry_t *entry)
+{
+       /* fields handled in declaration order */
+       BE64_IN(&entry->cookie);
+       BE32_IN(&entry->type);
+       BE32_IN(&entry->size);
+       BE64_IN(&entry->offset);
+       BE32_IN(&entry->checksum);
+}
+
+/*
+ * Convert every field of a journal entry header that is about to be
+ * written to disk, in place, using the BE32_OUT/BE64_OUT macros.
+ */
+static void
+vhd_journal_entry_out(vhd_journal_entry_t *entry)
+{
+       /* fields handled in declaration order */
+       BE64_OUT(&entry->cookie);
+       BE32_OUT(&entry->type);
+       BE32_OUT(&entry->size);
+       BE64_OUT(&entry->offset);
+       BE32_OUT(&entry->checksum);
+}
+
+/*
+ * Compute the checksum of a journal entry: the bitwise complement of the
+ * sum of every byte of the entry header (with its checksum field treated
+ * as zero) and of the size payload bytes in buf.
+ *
+ * entry->checksum is zeroed temporarily and restored before returning.
+ */
+static uint32_t
+vhd_journal_checksum_entry(vhd_journal_entry_t *entry, char *buf, size_t size)
+{
+       /* size_t index: both loop bounds are size_t, so a signed int index
+        * could overflow for payloads larger than INT_MAX */
+       size_t i;
+       unsigned char *blob;
+       uint32_t checksum, tmp;
+
+       checksum        = 0;
+       tmp             = entry->checksum;
+       entry->checksum = 0;
+
+       blob = (unsigned char *)entry;
+       for (i = 0; i < sizeof(vhd_journal_entry_t); i++)
+               checksum += blob[i];
+
+       blob = (unsigned char *)buf;
+       for (i = 0; i < size; i++)
+               checksum += blob[i];
+
+       entry->checksum = tmp;
+       return ~checksum;
+}
+
+/*
+ * Check a journal entry header (fields in host byte order): its size must
+ * be a non-zero multiple of VHD_SECTOR_SIZE and its cookie must match
+ * VHD_JOURNAL_ENTRY_COOKIE.
+ *
+ * Returns 0 if valid, -EINVAL otherwise.
+ */
+static int
+vhd_journal_validate_entry(vhd_journal_entry_t *entry)
+{
+       if (entry->size == 0 || (entry->size & (VHD_SECTOR_SIZE - 1)))
+               return -EINVAL;
+
+       if (entry->cookie != VHD_JOURNAL_ENTRY_COOKIE)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Read one entry header from the current journal position into entry,
+ * convert it with vhd_journal_entry_in() and validate it.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_read_entry(vhd_journal_t *j, vhd_journal_entry_t *entry)
+{
+       int rc = vhd_journal_read(j, entry, sizeof(vhd_journal_entry_t));
+
+       if (rc)
+               return rc;
+
+       vhd_journal_entry_in(entry);
+
+       return vhd_journal_validate_entry(entry);
+}
+
+/*
+ * Validate entry and write it at the current journal position.
+ *
+ * The conversion with vhd_journal_entry_out() is done on a private copy,
+ * so the caller's entry is left unmodified.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_write_entry(vhd_journal_t *j, vhd_journal_entry_t *entry)
+{
+       int rc;
+       vhd_journal_entry_t disk;
+
+       rc = vhd_journal_validate_entry(entry);
+       if (rc)
+               return rc;
+
+       memcpy(&disk, entry, sizeof(disk));
+       vhd_journal_entry_out(&disk);
+
+       return vhd_journal_write(j, &disk, sizeof(disk));
+}
+
+/*
+ * Verify that the checksum stored in entry matches the one computed over
+ * the entry header and its entry->size payload bytes in buf.
+ *
+ * Returns 0 on a match, -EINVAL otherwise.
+ */
+static int
+vhd_journal_validate_entry_data(vhd_journal_entry_t *entry, char *buf)
+{
+       uint32_t computed;
+
+       computed = vhd_journal_checksum_entry(entry, buf, entry->size);
+
+       return (computed == entry->checksum) ? 0 : -EINVAL;
+}
+
+/*
+ * Append one entry to the journal: an entry header describing size bytes
+ * that belong at offset in the VHD, followed by the bytes themselves (buf),
+ * and then rewrite the journal header to account for the new entry.
+ *
+ * type selects which counters are updated: VHD_JOURNAL_ENTRY_TYPE_DATA
+ * updates the data entry count/offset, every other type the metadata ones.
+ * The first entry of each kind records where that kind starts.
+ *
+ * Returns 0 on success or a negative errno value. If writing fails after
+ * the initial seek, the in-memory header is left as it was before the call
+ * and, unless j->is_block is set, the journal file is truncated back to
+ * journal_eof.
+ */
+static int
+vhd_journal_update(vhd_journal_t *j, off_t offset,
+                  char *buf, size_t size, uint32_t type)
+{
+       int err;
+       uint64_t *off, off_bak;
+       uint32_t *entries;
+       vhd_journal_entry_t entry;
+
+       /* Zero the whole struct first: the checksum and the on-disk copy
+        * both cover all sizeof(entry) bytes, including any padding, which
+        * would otherwise be indeterminate stack contents. */
+       memset(&entry, 0, sizeof(entry));
+
+       entry.type     = type;
+       entry.size     = size;
+       entry.offset   = offset;
+       entry.cookie   = VHD_JOURNAL_ENTRY_COOKIE;
+       entry.checksum = vhd_journal_checksum_entry(&entry, buf, size);
+
+       err = vhd_journal_seek(j, j->header.journal_eof, SEEK_SET);
+       if (err)
+               return err;
+
+       err = vhd_journal_write_entry(j, &entry);
+       if (err)
+               goto fail;
+
+       err = vhd_journal_write(j, buf, size);
+       if (err)
+               goto fail;
+
+       if (type == VHD_JOURNAL_ENTRY_TYPE_DATA) {
+               off     = &j->header.journal_data_offset;
+               entries = &j->header.journal_data_entries;
+       } else {
+               off     = &j->header.journal_metadata_offset;
+               entries = &j->header.journal_metadata_entries;
+       }
+
+       /* the first entry of a kind also records where that kind begins */
+       off_bak = *off;
+       if (!(*entries)++)
+               *off = j->header.journal_eof;
+       j->header.journal_eof += (size + sizeof(vhd_journal_entry_t));
+
+       err = vhd_journal_write_header(j, &j->header);
+       if (err) {
+               /* undo the in-memory bookkeeping done just above */
+               if (!--(*entries))
+                       *off = off_bak;
+               j->header.journal_eof -= (size + sizeof(vhd_journal_entry_t));
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       if (!j->is_block)
+               vhd_journal_truncate(j, j->header.journal_eof);
+       return err;
+}
+
+/*
+ * Journal the VHD footer(s).
+ *
+ * The footer at the end of the VHD file is always recorded (type FOOTER_P).
+ * For dynamic VHDs the footer read from offset 0 is recorded as well
+ * (type FOOTER_C). Each footer is converted with vhd_footer_out() before
+ * being journalled.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_footer(vhd_journal_t *j)
+{
+       int rc;
+       off_t end, primary;
+       vhd_footer_t ftr;
+       vhd_context_t *ctx = &j->vhd;
+
+       rc = vhd_seek(ctx, 0, SEEK_END);
+       if (rc)
+               return rc;
+
+       end = vhd_position(ctx);
+       if (end == (off_t)-1)
+               return -errno;
+
+       /* footer stored at the very end of the file */
+       primary = end - sizeof(vhd_footer_t);
+
+       rc = vhd_read_footer_at(ctx, &ftr, primary);
+       if (rc)
+               return rc;
+
+       vhd_footer_out(&ftr);
+       rc = vhd_journal_update(j, primary, (char *)&ftr,
+                               sizeof(vhd_footer_t),
+                               VHD_JOURNAL_ENTRY_TYPE_FOOTER_P);
+       if (rc)
+               return rc;
+
+       if (!vhd_type_dynamic(ctx))
+               return 0;
+
+       /* dynamic VHDs: also record the footer found at offset 0 */
+       rc = vhd_read_footer_at(ctx, &ftr, 0);
+       if (rc)
+               return rc;
+
+       vhd_footer_out(&ftr);
+       return vhd_journal_update(j, 0, (char *)&ftr,
+                                 sizeof(vhd_footer_t),
+                                 VHD_JOURNAL_ENTRY_TYPE_FOOTER_C);
+}
+
+/*
+ * Journal the VHD header: read it, convert it with vhd_header_out() and
+ * record it as located at the footer's data_offset.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_header(vhd_journal_t *j)
+{
+       int rc;
+       off_t where;
+       vhd_header_t hdr;
+       vhd_context_t *ctx = &j->vhd;
+
+       rc = vhd_read_header(ctx, &hdr);
+       if (rc)
+               return rc;
+
+       where = ctx->footer.data_offset;
+       vhd_header_out(&hdr);
+
+       return vhd_journal_update(j, where, (char *)&hdr,
+                                 sizeof(vhd_header_t),
+                                 VHD_JOURNAL_ENTRY_TYPE_HEADER);
+}
+
+/*
+ * Journal the data of every parent locator in the VHD header.
+ *
+ * Locators whose platform code is PLAT_CODE_NONE are skipped. For each of
+ * the others the locator data is read from the VHD into a sector-aligned
+ * temporary buffer and recorded with type LOCATOR.
+ *
+ * Returns 0 on success or a negative errno value; processing stops at the
+ * first failure.
+ */
+static int
+vhd_journal_add_locators(vhd_journal_t *j)
+{
+       int idx, count, rc;
+       vhd_context_t *ctx = &j->vhd;
+
+       rc = vhd_get_header(ctx);
+       if (rc)
+               return rc;
+
+       count = sizeof(ctx->header.loc) / sizeof(vhd_parent_locator_t);
+       for (idx = 0; idx < count; idx++) {
+               char *data;
+               off_t where;
+               size_t len;
+               vhd_parent_locator_t *loc = &ctx->header.loc[idx];
+
+               rc = vhd_validate_platform_code(loc->code);
+               if (rc)
+                       return rc;
+
+               if (loc->code == PLAT_CODE_NONE)
+                       continue;
+
+               where = loc->data_offset;
+               len   = vhd_parent_locator_size(loc);
+
+               /* posix_memalign() returns a positive errno value */
+               rc = posix_memalign((void **)&data, VHD_SECTOR_SIZE, len);
+               if (rc)
+                       return -rc;
+
+               rc = vhd_seek(ctx, where, SEEK_SET);
+               if (!rc)
+                       rc = vhd_read(ctx, data, len);
+               if (!rc)
+                       rc = vhd_journal_update(j, where, data, len,
+                                               VHD_JOURNAL_ENTRY_TYPE_LOCATOR);
+
+               free(data);
+               if (rc)
+                       break;
+       }
+
+       return rc;
+}
+
+/*
+ * Journal the block allocation table: read it, convert it with
+ * vhd_bat_out() and record it as located at the header's table_offset.
+ * The recorded length is the table size (entries * 4 bytes) as rounded by
+ * vhd_bytes_padded().
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_bat(vhd_journal_t *j)
+{
+       int rc;
+       off_t where;
+       size_t len;
+       vhd_bat_t bat;
+       vhd_context_t *ctx = &j->vhd;
+
+       rc = vhd_get_header(ctx);
+       if (rc)
+               return rc;
+
+       rc = vhd_read_bat(ctx, &bat);
+       if (rc)
+               return rc;
+
+       where = ctx->header.table_offset;
+       len   = vhd_bytes_padded(bat.entries * sizeof(uint32_t));
+
+       vhd_bat_out(&bat);
+       rc = vhd_journal_update(j, where, (char *)bat.bat, len,
+                               VHD_JOURNAL_ENTRY_TYPE_BAT);
+
+       /* the table buffer filled in by vhd_read_bat() is released here */
+       free(bat.bat);
+       return rc;
+}
+
+/*
+ * Journal the batmap in two entries: first its header (type BATMAP_H,
+ * converted with vhd_batmap_header_out()), then the map itself (type
+ * BATMAP_M) at the offset and size given by the batmap header.
+ *
+ * NOTE(review): the header entry is journalled with the length returned by
+ * vhd_bytes_padded(sizeof(struct dd_batmap_hdr)) starting at
+ * &batmap.header - if that padded length exceeds the struct, bytes beyond
+ * the header are read; confirm this is safe.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_batmap(vhd_journal_t *j)
+{
+       int rc;
+       off_t where;
+       size_t len;
+       vhd_batmap_t batmap;
+       vhd_context_t *ctx = &j->vhd;
+
+       rc = vhd_batmap_header_offset(ctx, &where);
+       if (rc)
+               return rc;
+
+       rc = vhd_read_batmap(ctx, &batmap);
+       if (rc)
+               return rc;
+
+       len = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+       vhd_batmap_header_out(&batmap);
+       rc = vhd_journal_update(j, where, (char *)&batmap.header, len,
+                               VHD_JOURNAL_ENTRY_TYPE_BATMAP_H);
+       if (!rc) {
+               /* convert back so the header fields can be used below */
+               vhd_batmap_header_in(&batmap);
+               where = batmap.header.batmap_offset;
+               len   = vhd_sectors_to_bytes(batmap.header.batmap_size);
+
+               rc = vhd_journal_update(j, where, batmap.map, len,
+                                       VHD_JOURNAL_ENTRY_TYPE_BATMAP_M);
+       }
+
+       free(batmap.map);
+       return rc;
+}
+
+/*
+ * Journal all metadata of the VHD.
+ *
+ * The footer is always journaled.  For a non-dynamic VHD nothing else is
+ * done and 0 is returned.  For a dynamic VHD the header, the parent
+ * locators, the BAT and (when vhd_has_batmap() says so) the batmap
+ * follow; afterwards journal_data_offset is set to the current
+ * journal_eof and the journal header is rewritten.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int
+vhd_journal_add_metadata(vhd_journal_t *j)
+{
+       int rc;
+       vhd_context_t *ctx = &j->vhd;
+
+       rc = vhd_journal_add_footer(j);
+       if (rc != 0 || !vhd_type_dynamic(ctx))
+               return rc;
+
+       rc = vhd_journal_add_header(j);
+       if (rc == 0)
+               rc = vhd_journal_add_locators(j);
+       if (rc == 0)
+               rc = vhd_journal_add_bat(j);
+       if (rc == 0 && vhd_has_batmap(ctx))
+               rc = vhd_journal_add_batmap(j);
+       if (rc != 0)
+               return rc;
+
+       /* data entries start where the metadata entries end */
+       j->header.journal_data_offset = j->header.journal_eof;
+       return vhd_journal_write_header(j, &j->header);
+}
+
+/*
+ * Read the next journal entry, which must be a footer entry of the given
+ * type, into *footer.
+ *
+ * Returns -EINVAL when the entry has a different type or its size is not
+ * sizeof(vhd_footer_t); otherwise the payload is read, passed through
+ * vhd_footer_in() and the result of vhd_validate_footer() is returned.
+ * Errors from the journal read helpers are returned unchanged.
+ */
+static int
+__vhd_journal_read_footer(vhd_journal_t *j,
+                         vhd_footer_t *footer, uint32_t type)
+{
+       int rc;
+       vhd_journal_entry_t ent;
+
+       rc = vhd_journal_read_entry(j, &ent);
+       if (rc != 0)
+               return rc;
+
+       if (ent.type != type || ent.size != sizeof(vhd_footer_t))
+               return -EINVAL;
+
+       rc = vhd_journal_read(j, footer, ent.size);
+       if (rc != 0)
+               return rc;
+
+       vhd_footer_in(footer);
+       return vhd_validate_footer(footer);
+}
+
+/*
+ * Read the next journal entry as a VHD_JOURNAL_ENTRY_TYPE_FOOTER_P
+ * footer entry; see __vhd_journal_read_footer() for the return values.
+ */
+static int
+vhd_journal_read_footer(vhd_journal_t *j, vhd_footer_t *footer)
+{
+       const uint32_t kind = VHD_JOURNAL_ENTRY_TYPE_FOOTER_P;
+
+       return __vhd_journal_read_footer(j, footer, kind);
+}
+
+/*
+ * Read the next journal entry as a VHD_JOURNAL_ENTRY_TYPE_FOOTER_C
+ * footer entry; see __vhd_journal_read_footer() for the return values.
+ */
+static int
+vhd_journal_read_footer_copy(vhd_journal_t *j, vhd_footer_t *footer)
+{
+       const uint32_t kind = VHD_JOURNAL_ENTRY_TYPE_FOOTER_C;
+
+       return __vhd_journal_read_footer(j, footer, kind);
+}
+
+/*
+ * Read the next journal entry, which must be the VHD header entry, into
+ * *header.
+ *
+ * Returns -EINVAL when the entry is not of type
+ * VHD_JOURNAL_ENTRY_TYPE_HEADER or its size is not sizeof(vhd_header_t);
+ * otherwise the payload is read, passed through vhd_header_in() and the
+ * result of vhd_validate_header() is returned.  Errors from the journal
+ * read helpers are returned unchanged.
+ */
+static int
+vhd_journal_read_header(vhd_journal_t *j, vhd_header_t *header)
+{
+       int rc;
+       vhd_journal_entry_t ent;
+
+       rc = vhd_journal_read_entry(j, &ent);
+       if (rc != 0)
+               return rc;
+
+       if (ent.type != VHD_JOURNAL_ENTRY_TYPE_HEADER ||
+           ent.size != sizeof(vhd_header_t))
+               return -EINVAL;
+
+       rc = vhd_journal_read(j, header, ent.size);
+       if (rc != 0)
+               return rc;
+
+       vhd_header_in(header);
+       return vhd_validate_header(header);
+}
+
+/*
+ * Read consecutive VHD_JOURNAL_ENTRY_TYPE_LOCATOR entries from the
+ * journal.
+ *
+ * Reading stops at the first entry of any other type; the journal
+ * position is then rewound to the start of that entry.  Each locator
+ * payload is stored in its own VHD_SECTOR_SIZE-aligned buffer.
+ *
+ * On success returns 0 and hands ownership of the array (*locators) and
+ * of its *locs payload buffers to the caller, who must free() them.
+ * On failure returns a non-zero error, releases everything that was
+ * allocated here and leaves *locators == NULL and *locs == 0.  Finding
+ * more locator entries than the header has locator slots is -EINVAL.
+ */
+static int
+vhd_journal_read_locators(vhd_journal_t *j, char ***locators, int *locs)
+{
+       int err, n, _locs;
+       char **_locators, *buf;
+       off_t pos;
+       vhd_journal_entry_t entry;
+
+       buf       = NULL;
+       _locs     = 0;
+       *locs     = 0;
+       *locators = NULL;
+
+       /* one slot per parent locator the VHD header can hold */
+       n = sizeof(j->vhd.header.loc) / sizeof(vhd_parent_locator_t);
+       _locators = calloc(n, sizeof(char *));
+       if (!_locators)
+               return -ENOMEM;
+
+       for (;;) {
+               buf = NULL;
+
+               /* remember where this entry starts so we can un-read it */
+               pos = vhd_journal_position(j);
+               err = vhd_journal_read_entry(j, &entry);
+               if (err)
+                       goto fail;
+
+               if (entry.type != VHD_JOURNAL_ENTRY_TYPE_LOCATOR) {
+                       err = vhd_journal_seek(j, pos, SEEK_SET);
+                       if (err)
+                               goto fail;
+                       break;
+               }
+
+               if (_locs >= n) {
+                       err = -EINVAL;
+                       goto fail;
+               }
+
+               /* posix_memalign() returns a positive errno value */
+               err = posix_memalign((void **)&buf,
+                                    VHD_SECTOR_SIZE, entry.size);
+               if (err) {
+                       err = -err;
+                       buf = NULL;
+                       goto fail;
+               }
+
+               err = vhd_journal_read(j, buf, entry.size);
+               if (err)
+                       goto fail;
+
+               _locators[_locs++] = buf;
+       }
+
+       *locs     = _locs;
+       *locators = _locators;
+
+       return 0;
+
+fail:
+       /*
+        * buf is either NULL or a payload buffer that was allocated but
+        * not yet stored in _locators; it must be released separately.
+        */
+       free(buf);
+       for (n = 0; n < _locs; n++)
+               free(_locators[n]);
+       free(_locators);
+       return err;
+}
+
+/*
+ * Read the BAT entry from the journal into *bat.
+ *
+ * The next journal entry must be of type VHD_JOURNAL_ENTRY_TYPE_BAT, its
+ * size must equal the padded size of header.max_bat_size 32-bit entries
+ * and its offset must equal header.table_offset; otherwise -EINVAL is
+ * returned.  On success bat->bat is a freshly allocated
+ * VHD_SECTOR_SIZE-aligned buffer owned by the caller, bat->spb and
+ * bat->entries are filled in from the header and the table has been
+ * passed through vhd_bat_in().  If reading the payload fails the buffer
+ * is freed and bat->bat is reset to NULL.
+ */
+static int
+vhd_journal_read_bat(vhd_journal_t *j, vhd_bat_t *bat)
+{
+       int rc;
+       size_t expect;
+       vhd_journal_entry_t ent;
+       vhd_context_t *ctx = &j->vhd;
+
+       expect = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+
+       rc = vhd_journal_read_entry(j, &ent);
+       if (rc != 0)
+               return rc;
+
+       if (ent.type != VHD_JOURNAL_ENTRY_TYPE_BAT ||
+           ent.size != expect ||
+           ent.offset != ctx->header.table_offset)
+               return -EINVAL;
+
+       /* posix_memalign() returns a positive errno value */
+       rc = posix_memalign((void **)&bat->bat, VHD_SECTOR_SIZE, expect);
+       if (rc != 0)
+               return -rc;
+
+       rc = vhd_journal_read(j, bat->bat, ent.size);
+       if (rc != 0) {
+               free(bat->bat);
+               bat->bat = NULL;
+               return rc;
+       }
+
+       bat->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+       bat->entries = ctx->header.max_bat_size;
+       vhd_bat_in(bat);
+
+       return 0;
+}
+
+/*
+ * Read the batmap header entry from the journal into batmap->header.
+ *
+ * The next journal entry must be of type VHD_JOURNAL_ENTRY_TYPE_BATMAP_H
+ * and exactly as large as a padded struct dd_batmap_hdr, otherwise
+ * -EINVAL is returned.  The payload is read into a temporary
+ * VHD_SECTOR_SIZE-aligned buffer, copied into batmap->header, passed
+ * through vhd_batmap_header_in() and checked with
+ * vhd_validate_batmap_header(), whose result is returned.
+ *
+ * All failures are reported as non-zero values; an allocation failure is
+ * reported as a negative errno value like in the sibling readers.  The
+ * temporary buffer is released on every path.
+ */
+static int
+vhd_journal_read_batmap_header(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+       int err;
+       char *buf;
+       size_t size;
+       vhd_journal_entry_t entry;
+
+       size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+       err  = vhd_journal_read_entry(j, &entry);
+       if (err)
+               return err;
+
+       if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_H)
+               return -EINVAL;
+
+       if (entry.size != size)
+               return -EINVAL;
+
+       /* posix_memalign() returns a positive errno value */
+       err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       if (err)
+               return -err;
+
+       err = vhd_journal_read(j, buf, entry.size);
+       if (!err)
+               memcpy(&batmap->header, buf, sizeof(batmap->header));
+
+       /* the bounce buffer is only needed for the read itself */
+       free(buf);
+       if (err)
+               return err;
+
+       vhd_batmap_header_in(batmap);
+       return vhd_validate_batmap_header(batmap);
+}
+
+/*
+ * Read the batmap payload entry from the journal into batmap->map.
+ *
+ * The next journal entry must be of type VHD_JOURNAL_ENTRY_TYPE_BATMAP_M
+ * and its size and offset must match batmap_size (converted from
+ * sectors to bytes) and batmap_offset of the already loaded
+ * batmap->header; otherwise -EINVAL is returned.  On success
+ * batmap->map is a newly allocated VHD_SECTOR_SIZE-aligned buffer owned
+ * by the caller.  If reading the payload fails the buffer is freed and
+ * batmap->map is reset to NULL.
+ */
+static int
+vhd_journal_read_batmap_map(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+       int rc;
+       vhd_journal_entry_t ent;
+
+       rc = vhd_journal_read_entry(j, &ent);
+       if (rc != 0)
+               return rc;
+
+       if (ent.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_M ||
+           ent.size != vhd_sectors_to_bytes(batmap->header.batmap_size) ||
+           ent.offset != batmap->header.batmap_offset)
+               return -EINVAL;
+
+       /* posix_memalign() returns a positive errno value */
+       rc = posix_memalign((void **)&batmap->map,
+                           VHD_SECTOR_SIZE, ent.size);
+       if (rc != 0)
+               return -rc;
+
+       rc = vhd_journal_read(j, batmap->map, ent.size);
+       if (rc != 0) {
+               free(batmap->map);
+               batmap->map = NULL;
+       }
+
+       return rc;
+}
+
+/*
+ * Read the batmap header entry and then the batmap payload entry from
+ * the journal and check the result with vhd_validate_batmap().
+ *
+ * Returns 0 on success.  If validation fails the map buffer is freed,
+ * batmap->map is reset to NULL and the validation error is returned.
+ */
+static int
+vhd_journal_read_batmap(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+       int rc;
+
+       rc = vhd_journal_read_batmap_header(j, batmap);
+       if (rc == 0)
+               rc = vhd_journal_read_batmap_map(j, batmap);
+       if (rc != 0)
+               return rc;
+
+       rc = vhd_validate_batmap(batmap);
+       if (rc != 0) {
+               free(batmap->map);
+               batmap->map = NULL;
+       }
+
+       return rc;
+}
+
+/*
+ * Write *footer to the VHD at the footer offset recorded in the journal
+ * header (vhd_footer_offset).  Returns what vhd_write_footer_at()
+ * returns.
+ */
+static int
+vhd_journal_restore_footer(vhd_journal_t *j, vhd_footer_t *footer)
+{
+       vhd_context_t *ctx = &j->vhd;
+
+       return vhd_write_footer_at(ctx, footer, j->header.vhd_footer_offset);
+}
+
+/*
+ * Write *footer to the VHD at offset 0, where the footer copy is kept.
+ * Returns what vhd_write_footer_at() returns.
+ */
+static int
+vhd_journal_restore_footer_copy(vhd_journal_t *j, vhd_footer_t *footer)
+{
+       vhd_context_t *ctx = &j->vhd;
+
+       return vhd_write_footer_at(ctx, footer, 0);
+}
+
+/*
+ * Write *header to the VHD at the offset named by the footer's
+ * data_offset field.  Returns what vhd_write_header_at() returns.
+ */
+static int
+vhd_journal_restore_header(vhd_journal_t *j, vhd_header_t *header)
+{
+       vhd_context_t *ctx = &j->vhd;
+       off_t where = ctx->footer.data_offset;
+
+       return vhd_write_header_at(ctx, header, where);
+}
+
+/*
+ * Write journaled parent locator payloads back to the VHD.
+ *
+ * Walks the locator slots of the VHD header in order; every slot whose
+ * code is not PLAT_CODE_NONE consumes the next buffer from locators[]
+ * and has it written at the slot's data_offset with the length given by
+ * vhd_parent_locator_size().  The walk ends when either the slots or the
+ * locs supplied buffers run out.
+ *
+ * Returns 0 on success or the first seek/write error.
+ */
+static int
+vhd_journal_restore_locators(vhd_journal_t *j, char **locators, int locs)
+{
+       int rc, slot, used;
+       size_t nbytes;
+       vhd_parent_locator_t *pl;
+       vhd_context_t *ctx = &j->vhd;
+       const int slots = sizeof(ctx->header.loc) /
+                         sizeof(vhd_parent_locator_t);
+
+       used = 0;
+       slot = 0;
+
+       while (slot < slots && used < locs) {
+               pl = &ctx->header.loc[slot];
+               slot++;
+
+               if (pl->code != PLAT_CODE_NONE) {
+                       rc = vhd_seek(ctx, pl->data_offset, SEEK_SET);
+                       if (rc != 0)
+                               return rc;
+
+                       nbytes = vhd_parent_locator_size(pl);
+                       rc = vhd_write(ctx, locators[used], nbytes);
+                       used++;
+                       if (rc != 0)
+                               return rc;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Write *bat back to the VHD; returns what vhd_write_bat() returns.
+ */
+static int
+vhd_journal_restore_bat(vhd_journal_t *j, vhd_bat_t *bat)
+{
+       vhd_context_t *ctx = &j->vhd;
+
+       return vhd_write_bat(ctx, bat);
+}
+
+/*
+ * Write *batmap back to the VHD; returns what vhd_write_batmap() returns.
+ */
+static int
+vhd_journal_restore_batmap(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+       vhd_context_t *ctx = &j->vhd;
+
+       return vhd_write_batmap(ctx, batmap);
+}
+
+/*
+ * Read the journaled VHD metadata and write it back to the VHD.
+ *
+ * Phase 1 (read): seek to just past the journal header and read the
+ * footer into vhd->footer.  For a dynamic VHD additionally read the
+ * footer copy, the header (into vhd->header), the parent locator
+ * payloads, the BAT (into vhd->bat) and, if vhd_has_batmap(), the batmap
+ * (into vhd->batmap).  The number of locator entries must equal the
+ * number of header locator slots whose code is not PLAT_CODE_NONE, and
+ * the journal position after the last metadata entry must equal
+ * journal_data_offset; otherwise -EINVAL is returned.
+ *
+ * Phase 2 (restore): write the items back in the same order.  On success
+ * a VHD that is not a block device is truncated to
+ * vhd_footer_offset + sizeof(vhd_footer_t).
+ *
+ * Returns 0 on success and a negative errno-style value on failure.
+ * The locator buffers are always released.  vhd->bat.bat and
+ * vhd->batmap.map, once read, stay allocated on return, including on
+ * error paths; releasing them is left to the caller.
+ */
+static int
+vhd_journal_restore_metadata(vhd_journal_t *j)
+{
+       off_t off;
+       char **locators;
+       vhd_footer_t copy;
+       vhd_context_t *vhd;
+       int i, locs, hlocs, err;
+
+       vhd      = &j->vhd;
+       locs     = 0;
+       hlocs    = 0;
+       locators = NULL;
+
+       /* metadata entries are read starting right after the header */
+       err = vhd_journal_seek(j, sizeof(vhd_journal_header_t), SEEK_SET);
+       if (err)
+               return err;
+
+       err  = vhd_journal_read_footer(j, &vhd->footer);
+       if (err)
+               return err;
+
+       if (!vhd_type_dynamic(vhd))
+               goto restore;
+
+       err  = vhd_journal_read_footer_copy(j, &copy);
+       if (err)
+               return err;
+
+       err  = vhd_journal_read_header(j, &vhd->header);
+       if (err)
+               return err;
+
+       /* count the locator slots that are in use */
+       for (hlocs = 0, i = 0; i < vhd_parent_locator_count(vhd); i++) {
+               /* err is 0 here, so an explicit error code is required */
+               if (vhd_validate_platform_code(vhd->header.loc[i].code))
+                       return -EINVAL;
+
+               if (vhd->header.loc[i].code != PLAT_CODE_NONE)
+                       hlocs++;
+       }
+
+       if (hlocs) {
+               err  = vhd_journal_read_locators(j, &locators, &locs);
+               if (err)
+                       return err;
+
+               if (hlocs != locs) {
+                       err = -EINVAL;
+                       goto free_locators;
+               }
+       }
+
+       err  = vhd_journal_read_bat(j, &vhd->bat);
+       if (err)
+               goto free_locators;
+
+       if (vhd_has_batmap(vhd)) {
+               err  = vhd_journal_read_batmap(j, &vhd->batmap);
+               if (err)
+                       goto free_locators;
+       }
+
+restore:
+       /* the metadata must end exactly where the data entries begin */
+       off  = vhd_journal_position(j);
+       if (off == (off_t)-1) {
+               err = -errno;
+               goto free_locators;
+       }
+
+       if (j->header.journal_data_offset != off) {
+               err = -EINVAL;
+               goto free_locators;
+       }
+
+       err  = vhd_journal_restore_footer(j, &vhd->footer);
+       if (err)
+               goto out;
+
+       if (!vhd_type_dynamic(vhd))
+               goto out;
+
+       err  = vhd_journal_restore_footer_copy(j, &copy);
+       if (err)
+               goto out;
+
+       err  = vhd_journal_restore_header(j, &vhd->header);
+       if (err)
+               goto out;
+
+       if (locs) {
+               err = vhd_journal_restore_locators(j, locators, locs);
+               if (err)
+                       goto out;
+       }
+
+       err  = vhd_journal_restore_bat(j, &vhd->bat);
+       if (err)
+               goto out;
+
+       if (vhd_has_batmap(vhd)) {
+               err  = vhd_journal_restore_batmap(j, &vhd->batmap);
+               if (err)
+                       goto out;
+       }
+
+       err = 0;
+
+out:
+       /* cut the file off right behind the footer at vhd_footer_offset */
+       if (!err && !vhd->is_block &&
+           ftruncate(vhd->fd,
+                     j->header.vhd_footer_offset + sizeof(vhd_footer_t)))
+               err = -errno;
+
+free_locators:
+       if (locators) {
+               for (i = 0; i < locs; i++)
+                       free(locators[i]);
+               free(locators);
+       }
+
+       return err;
+}
+
+/*
+ * Mark the VHD as disabled: load the footer, overwrite its cookie with
+ * VHD_POISON_COOKIE, recompute the footer checksum and write the footer
+ * back.  Returns 0 on success or the error from vhd_get_footer() or
+ * vhd_write_footer().
+ */
+static int
+vhd_journal_disable_vhd(vhd_journal_t *j)
+{
+       int rc;
+       vhd_context_t *ctx = &j->vhd;
+
+       rc = vhd_get_footer(ctx);
+       if (rc != 0)
+               return rc;
+
+       memcpy(&ctx->footer.cookie,
+              VHD_POISON_COOKIE, sizeof(ctx->footer.cookie));
+       ctx->footer.checksum = vhd_checksum_footer(&ctx->footer);
+
+       return vhd_write_footer(ctx, &ctx->footer);
+}
+
+/*
+ * Undo vhd_journal_disable_vhd(): if vhd_disabled() reports the VHD as
+ * disabled, put HD_COOKIE back into the footer, recompute the footer
+ * checksum and write the footer back.  A VHD that is not disabled is
+ * left untouched.  Returns 0 on success or the error from
+ * vhd_get_footer() or vhd_write_footer().
+ */
+static int
+vhd_journal_enable_vhd(vhd_journal_t *j)
+{
+       int rc;
+       vhd_context_t *ctx = &j->vhd;
+
+       rc = vhd_get_footer(ctx);
+       if (rc != 0)
+               return rc;
+
+       if (vhd_disabled(ctx)) {
+               memcpy(&ctx->footer.cookie, HD_COOKIE,
+                      sizeof(ctx->footer.cookie));
+               ctx->footer.checksum = vhd_checksum_footer(&ctx->footer);
+
+               rc = vhd_write_footer(ctx, &ctx->footer);
+       }
+
+       return rc;
+}
+
+/*
+ * Release the resources held by a journal handle without removing the
+ * journal: close the journal descriptor (skipped only when jfd is 0),
+ * close the VHD context and free the journal file name.
+ * Always returns 0.
+ */
+int
+vhd_journal_close(vhd_journal_t *j)
+{
+       if (j->jfd != 0)
+               close(j->jfd);
+
+       vhd_close(&j->vhd);
+
+       free(j->jname);
+       return 0;
+}
+
+/*
+ * Re-enable the VHD and discard the journal.
+ *
+ * If vhd_journal_enable_vhd() fails its error is returned and nothing is
+ * released.  Otherwise the journal descriptor is closed (skipped only
+ * when jfd is 0), the journal file is unlinked unless the journal is a
+ * block device, the VHD context is closed and the journal file name is
+ * freed; 0 is returned.
+ */
+int
+vhd_journal_remove(vhd_journal_t *j)
+{
+       int rc = vhd_journal_enable_vhd(j);
+
+       if (rc != 0)
+               return rc;
+
+       if (j->jfd != 0) {
+               close(j->jfd);
+               if (!j->is_block)
+                       unlink(j->jname);
+       }
+
+       vhd_close(&j->vhd);
+
+       free(j->jname);
+       return 0;
+}
+
+/*
+ * Open an existing journal jfile for the VHD file.
+ *
+ * The journal header is read and the journaled metadata is written back
+ * to the VHD via vhd_journal_restore_metadata(); the VHD is then
+ * reopened with vhd_open(), its BAT (and batmap, if present) are loaded
+ * and the VHD is marked disabled again.
+ *
+ * Returns 0 on success.  On failure a negative errno-style value or the
+ * error of the failing helper is returned; after jname has been
+ * allocated the handle is torn down with vhd_journal_close().
+ */
+int
+vhd_journal_open(vhd_journal_t *j, const char *file, const char *jfile)
+{
+       int err;
+       vhd_context_t *vhd;
+
+       memset(j, 0, sizeof(vhd_journal_t));
+
+       j->jfd = -1;
+       vhd    = &j->vhd;
+
+       j->jname = strdup(jfile);
+       if (j->jname == NULL)
+               return -ENOMEM;
+
+       /* the journal must already exist: no O_CREAT here */
+       j->jfd = open(j->jname, O_LARGEFILE | O_RDWR);
+       if (j->jfd == -1) {
+               err = -errno;
+               goto fail;
+       }
+
+       err = vhd_test_file_fixed(j->jname, &j->is_block);
+       if (err)
+               goto fail;
+
+       /*
+        * open the VHD directly rather than through vhd_open(); the
+        * metadata is restored through this descriptor first
+        */
+       vhd->fd = open(file, O_LARGEFILE | O_RDWR | O_DIRECT);
+       if (vhd->fd == -1) {
+               err = -errno;
+               goto fail;
+       }
+
+       err = vhd_test_file_fixed(file, &vhd->is_block);
+       if (err)
+               goto fail;
+
+       err = vhd_journal_read_journal_header(j, &j->header);
+       if (err)
+               goto fail;
+
+       err = vhd_journal_restore_metadata(j);
+       if (err)
+               goto fail;
+
+       /*
+        * drop the raw descriptor and the BAT/batmap buffers that
+        * vhd_journal_restore_metadata() read from the journal, then
+        * reopen the restored VHD through vhd_open()
+        */
+       close(vhd->fd);
+       free(vhd->bat.bat);
+       free(vhd->batmap.map);
+
+       err = vhd_open(vhd, file, VHD_OPEN_RDWR);
+       if (err)
+               goto fail;
+
+       err = vhd_get_bat(vhd);
+       if (err)
+               goto fail;
+
+       if (vhd_has_batmap(vhd)) {
+               err = vhd_get_batmap(vhd);
+               if (err)
+                       goto fail;
+       }
+
+       /* poison the footer cookie again while the journal is active */
+       err = vhd_journal_disable_vhd(j);
+       if (err)
+               goto fail;
+
+       return 0;
+
+fail:
+       vhd_journal_close(j);
+       return err;
+}
+
+/*
+ * Create a new journal jfile for the VHD file.
+ *
+ * An existing jfile is only accepted when vhd_test_file_fixed() reports
+ * it as a block device; an existing regular file yields -EEXIST.  A
+ * regular journal file is created (and truncated) with mode 0644.
+ *
+ * The VHD is opened strictly, its BAT (and batmap, if present) are
+ * loaded, the journal header and all VHD metadata are journaled, the VHD
+ * is marked disabled and the journal is synced.
+ *
+ * Returns 0 on success.  On failure an error is returned and:
+ *  - fail1 (before the VHD was opened): the journal descriptor is
+ *    closed, a regular journal file is unlinked and *j is zeroed;
+ *  - fail2 (after the VHD was opened): cleanup is delegated to
+ *    vhd_journal_remove().
+ */
+int
+vhd_journal_create(vhd_journal_t *j, const char *file, const char *jfile)
+{
+       int err;
+
+       memset(j, 0, sizeof(vhd_journal_t));
+       j->jfd = -1;
+
+       j->jname = strdup(jfile);
+       if (j->jname == NULL) {
+               err = -ENOMEM;
+               goto fail1;
+       }
+
+       if (access(j->jname, F_OK) == 0) {
+               err = vhd_test_file_fixed(j->jname, &j->is_block);
+               if (err)
+                       goto fail1;
+
+               /* never overwrite an existing regular file */
+               if (!j->is_block) {
+                       err = -EEXIST;
+                       goto fail1;
+               }
+       }
+
+       if (j->is_block)
+               j->jfd = open(j->jname, O_LARGEFILE | O_RDWR, 0644);
+       else
+               j->jfd = open(j->jname,
+                             O_CREAT | O_TRUNC | O_LARGEFILE | O_RDWR, 0644);
+       if (j->jfd == -1) {
+               err = -errno;
+               goto fail1;
+       }
+
+       err = vhd_open(&j->vhd, file, VHD_OPEN_RDWR | VHD_OPEN_STRICT);
+       if (err)
+               goto fail1;
+
+       err = vhd_get_bat(&j->vhd);
+       if (err)
+               goto fail2;
+
+       if (vhd_has_batmap(&j->vhd)) {
+               err = vhd_get_batmap(&j->vhd);
+               if (err)
+                       goto fail2;
+       }
+
+       err = vhd_journal_add_journal_header(j);
+       if (err)
+               goto fail2;
+
+       err = vhd_journal_add_metadata(j);
+       if (err)
+               goto fail2;
+
+       /* poison the footer cookie while the journal is active */
+       err = vhd_journal_disable_vhd(j);
+       if (err)
+               goto fail2;
+
+       err = vhd_journal_sync(j);
+       if (err)
+               goto fail2;
+
+       return 0;
+
+fail1:
+       if (j->jfd != -1) {
+               close(j->jfd);
+               if (!j->is_block)
+                       unlink(j->jname);
+       }
+       free(j->jname);
+       memset(j, 0, sizeof(vhd_journal_t));
+
+       return err;
+
+fail2:
+       vhd_journal_remove(j);
+       return err;
+}
+
+/*
+ * Journal the current on-disk contents of one block of a dynamic VHD.
+ *
+ * mode is a bit mask: VHD_JOURNAL_METADATA journals the block's bitmap
+ * (bm_secs sectors at the block's BAT offset) and VHD_JOURNAL_DATA
+ * journals the spb data sectors that follow the bitmap.  Both are
+ * recorded as VHD_JOURNAL_ENTRY_TYPE_DATA entries, after which the
+ * journal is synced.
+ *
+ * Returns -EINVAL for a non-dynamic VHD, -ERANGE when block is not below
+ * the number of BAT entries, 0 without journaling anything when the BAT
+ * entry is DD_BLK_UNUSED, otherwise the first helper error or the result
+ * of vhd_journal_sync().
+ */
+int
+vhd_journal_add_block(vhd_journal_t *j, uint32_t block, char mode)
+{
+       int rc;
+       off_t where;
+       size_t nbytes;
+       uint64_t sector;
+       char *data = NULL;
+       vhd_context_t *ctx = &j->vhd;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       rc = vhd_get_bat(ctx);
+       if (rc != 0)
+               return rc;
+
+       if (block >= ctx->bat.entries)
+               return -ERANGE;
+
+       sector = ctx->bat.bat[block];
+       if (sector == DD_BLK_UNUSED)
+               return 0;
+
+       where = vhd_sectors_to_bytes(sector);
+
+       if (mode & VHD_JOURNAL_METADATA) {
+               /* the bitmap sits at the start of the block */
+               nbytes = vhd_sectors_to_bytes(ctx->bm_secs);
+
+               rc = vhd_read_bitmap(ctx, block, &data);
+               if (rc != 0)
+                       return rc;
+
+               rc = vhd_journal_update(j, where, data, nbytes,
+                                       VHD_JOURNAL_ENTRY_TYPE_DATA);
+               free(data);
+               if (rc != 0)
+                       return rc;
+       }
+
+       if (mode & VHD_JOURNAL_DATA) {
+               /* the data sectors follow the bitmap */
+               where += vhd_sectors_to_bytes(ctx->bm_secs);
+               nbytes = vhd_sectors_to_bytes(ctx->spb);
+
+               rc = vhd_read_block(ctx, block, &data);
+               if (rc != 0)
+                       return rc;
+
+               rc = vhd_journal_update(j, where, data, nbytes,
+                                       VHD_JOURNAL_ENTRY_TYPE_DATA);
+               free(data);
+               if (rc != 0)
+                       return rc;
+       }
+
+       return vhd_journal_sync(j);
+}
+
+/*
+ * Commit: the transaction completed successfully, so the undo log is no
+ * longer needed.
+ *
+ * The entry counters and offsets in the journal header are zeroed and
+ * the header is rewritten.  A journal that is not a block device is then
+ * truncated to just the journal header.
+ *
+ * Returns 0 on success, the error from vhd_journal_write_header(), or
+ * -errno when the truncation fails.
+ */
+int
+vhd_journal_commit(vhd_journal_t *j)
+{
+       int rc;
+
+       j->header.journal_data_entries     = 0;
+       j->header.journal_metadata_entries = 0;
+       j->header.journal_data_offset      = 0;
+       j->header.journal_metadata_offset  = 0;
+
+       rc = vhd_journal_write_header(j, &j->header);
+       if (rc != 0)
+               return rc;
+
+       /* a block device cannot be shrunk; leave it as it is */
+       if (j->is_block)
+               return 0;
+
+       if (vhd_journal_truncate(j, sizeof(vhd_journal_header_t)))
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Revert: the transaction failed, so undo every change through the undo
+ * log.
+ *
+ * The VHD is closed and reopened directly with O_DIRECT, the journaled
+ * metadata is written back with vhd_journal_restore_metadata(), the VHD
+ * is reopened through vhd_open(), and then each of the
+ * journal_data_entries data entries is read from the journal, validated
+ * and written back to the VHD at the offset recorded in the entry.
+ * Finally a VHD that is not a block device is truncated to
+ * vhd_footer_offset + sizeof(vhd_footer_t) and the journal is synced.
+ *
+ * Returns 0 on success, otherwise a negative errno-style value or the
+ * error reported by the failing helper.
+ */
+int
+vhd_journal_revert(vhd_journal_t *j)
+{
+       int i, err;
+       char *buf, *file;
+       vhd_context_t *vhd;
+       vhd_journal_entry_t entry;
+
+       err  = 0;
+       vhd  = &j->vhd;
+       buf  = NULL;
+
+       /* vhd_close() below is followed by a reopen; keep our own copy */
+       file = strdup(vhd->file);
+       if (!file)
+               return -ENOMEM;
+
+       vhd_close(&j->vhd);
+       j->vhd.fd = open(file, O_RDWR | O_DIRECT | O_LARGEFILE);
+       if (j->vhd.fd == -1) {
+               /* read errno before free() gets a chance to change it */
+               err = -errno;
+               free(file);
+               return err;
+       }
+
+       err = vhd_test_file_fixed(file, &vhd->is_block);
+       if (err) {
+               free(file);
+               return err;
+       }
+
+       err  = vhd_journal_restore_metadata(j);
+       if (err) {
+               free(file);
+               return err;
+       }
+
+       /*
+        * drop the raw descriptor and the BAT/batmap buffers that
+        * vhd_journal_restore_metadata() read from the journal, then
+        * reopen the restored VHD through vhd_open()
+        */
+       close(vhd->fd);
+       free(vhd->bat.bat);
+       free(vhd->batmap.map);
+
+       err = vhd_open(vhd, file, VHD_OPEN_RDWR);
+       free(file);
+       if (err)
+               return err;
+
+       err = vhd_journal_seek(j, j->header.journal_data_offset, SEEK_SET);
+       if (err)
+               return err;
+
+       /* replay every journaled data entry onto the VHD */
+       for (i = 0; i < j->header.journal_data_entries; i++) {
+               err = vhd_journal_read_entry(j, &entry);
+               if (err)
+                       goto end;
+
+               /* posix_memalign() returns a positive errno value */
+               err = posix_memalign((void **)&buf,
+                                    VHD_SECTOR_SIZE, entry.size);
+               if (err) {
+                       err = -err;
+                       buf = NULL;
+                       goto end;
+               }
+
+               err = vhd_journal_read(j, buf, entry.size);
+               if (err)
+                       goto end;
+
+               err = vhd_journal_validate_entry_data(&entry, buf);
+               if (err)
+                       goto end;
+
+               err = vhd_seek(vhd, entry.offset, SEEK_SET);
+               if (err)
+                       goto end;
+
+               err = vhd_write(vhd, buf, entry.size);
+               if (err)
+                       goto end;
+
+               err = 0;
+
+       end:
+               free(buf);
+               buf = NULL;
+               if (err)
+                       break;
+       }
+
+       if (err)
+               return err;
+
+       if (!vhd->is_block) {
+               err = ftruncate(vhd->fd, j->header.vhd_footer_offset +
+                               sizeof(vhd_footer_t));
+               if (err)
+                       return -errno;
+       }
+
+       return vhd_journal_sync(j);
+}
diff --git a/tools/blktap2/vhd/lib/libvhd.c b/tools/blktap2/vhd/lib/libvhd.c
new file mode 100644 (file)
index 0000000..4ebe012
--- /dev/null
@@ -0,0 +1,3348 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <libgen.h>
+#include <iconv.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <langinfo.h>
+
+#include "libvhd.h"
+#include "relative-path.h"
+
+/* VHD uses an epoch of 12:00AM, Jan 1, 2000. This is the Unix timestamp for
+ * the start of the VHD epoch. */
+#define VHD_EPOCH_START 946684800
+
+/* non-zero once logging through VHDLOG() has been switched on */
+static int libvhd_dbg = 0;
+
+/*
+ * Switch libvhd logging on when level is non-zero.  A zero level leaves
+ * the current setting untouched, so logging cannot be switched off again
+ * through this function.
+ */
+void
+libvhd_set_log_level(int level)
+{
+       if (level == 0)
+               return;
+
+       libvhd_dbg = 1;
+}
+
+/*
+ * Log a printf-style message through syslog() at LOG_INFO, prefixed with
+ * "libvhd::<calling function>: ".  Nothing is logged unless libvhd_dbg
+ * is non-zero.  _f must be a string literal because it is concatenated
+ * with the prefix; the named variadic parameter (_a...) and ##_a are GNU
+ * extensions.
+ */
+#define VHDLOG(_f, _a...)                                              \
+       do {                                                            \
+               if (libvhd_dbg)                                         \
+                       syslog(LOG_INFO, "libvhd::%s: "_f,              \
+                              __func__, ##_a);                         \
+       } while (0)
+
+#define BIT_MASK 0x80
+
+/*
+ * Failure-injection tables, compiled only when ENABLE_FAILURE_TESTING is
+ * defined.  ENV_VAR_FAIL holds one name per failure test and TEST_FAIL
+ * holds one int per failure test; both have NUM_FAIL_TESTS elements.
+ * NOTE(review): the names look like environment variables that switch on
+ * the matching TEST_FAIL entry -- the code that reads them is not in
+ * this part of the file, confirm there.
+ */
+#ifdef ENABLE_FAILURE_TESTING
+const char* ENV_VAR_FAIL[NUM_FAIL_TESTS] = {
+       "VHD_UTIL_TEST_FAIL_REPARENT_BEGIN",
+       "VHD_UTIL_TEST_FAIL_REPARENT_LOCATOR",
+       "VHD_UTIL_TEST_FAIL_REPARENT_END",
+       "VHD_UTIL_TEST_FAIL_RESIZE_BEGIN",
+       "VHD_UTIL_TEST_FAIL_RESIZE_DATA_MOVED",
+       "VHD_UTIL_TEST_FAIL_RESIZE_METADATA_MOVED",
+       "VHD_UTIL_TEST_FAIL_RESIZE_END"
+};
+int TEST_FAIL[NUM_FAIL_TESTS];
+#endif // ENABLE_FAILURE_TESTING
+
+/*
+ * Return 1 if bit nr of the bitmap at addr is set, 0 otherwise.
+ *
+ * Bits are numbered from the most significant bit of each byte: bit 0 is
+ * mask BIT_MASK (0x80) of addr[0], bit 7 is mask 0x01 of addr[0], bit 8
+ * is mask 0x80 of addr[1], and so on -- the same numbering set_bit() and
+ * clear_bit() use.
+ *
+ * The byte is masked rather than shifted left: a plain char may be
+ * signed, and left-shifting a negative promoted value is undefined.
+ */
+static inline int
+test_bit (volatile char *addr, int nr)
+{
+       return (addr[nr >> 3] & (BIT_MASK >> (nr & 7))) != 0;
+}
+
+/*
+ * Set bit nr of the bitmap at addr.  Bits are numbered from the most
+ * significant bit of each byte (bit 0 is mask BIT_MASK of addr[0]).
+ */
+static inline void
+set_bit (volatile char *addr, int nr)
+{
+       const int byte = nr >> 3;
+       const int mask = BIT_MASK >> (nr & 7);
+
+       addr[byte] |= mask;
+}
+
+/*
+ * Clear bit nr of the bitmap at addr.  Bits are numbered from the most
+ * significant bit of each byte (bit 0 is mask BIT_MASK of addr[0]).
+ */
+static inline void
+clear_bit (volatile char *addr, int nr)
+{
+       const int byte = nr >> 3;
+       const int mask = BIT_MASK >> (nr & 7);
+
+       addr[byte] &= ~mask;
+}
+
+/*
+ * Return bit nr (0 or 1) of the bitmap at addr, using the "old" layout:
+ * the bitmap is accessed as an array of host-order 32-bit words and bits
+ * are numbered from the least significant bit of each word.
+ */
+static inline int
+old_test_bit(volatile char *addr, int nr)
+{
+       uint32_t word = ((uint32_t *)addr)[nr >> 5];
+
+       return (word >> (nr & 31)) & 1;
+}
+
+/*
+ * Set bit nr of the bitmap at addr, using the "old" layout: the bitmap
+ * is accessed as an array of host-order 32-bit words and bits are
+ * numbered from the least significant bit of each word.
+ *
+ * The mask is built from an unsigned 1 so that bit 31 can be reached
+ * without overflowing a signed int.
+ */
+static inline void
+old_set_bit(volatile char *addr, int nr)
+{
+       ((uint32_t *)addr)[nr >> 5] |= (1U << (nr & 31));
+}
+
+/*
+ * Clear bit nr of the bitmap at addr, using the "old" layout: the bitmap
+ * is accessed as an array of host-order 32-bit words and bits are
+ * numbered from the least significant bit of each word.
+ *
+ * The mask is built from an unsigned 1 so that bit 31 can be reached
+ * without overflowing a signed int.
+ */
+static inline void
+old_clear_bit(volatile char *addr, int nr)
+{
+       ((uint32_t *)addr)[nr >> 5] &= ~(1U << (nr & 31));
+}
+
+/*
+ * Apply BE32_IN/BE64_IN in place to every multi-byte integer field of
+ * *footer that is listed below.
+ * NOTE(review): the BE*_IN macros are defined outside this part of the
+ * file; presumably they convert a big-endian on-disk value to host byte
+ * order -- confirm against their definition.
+ */
+void
+vhd_footer_in(vhd_footer_t *footer)
+{
+       BE32_IN(&footer->features);
+       BE32_IN(&footer->ff_version);
+       BE64_IN(&footer->data_offset);
+       BE32_IN(&footer->timestamp);
+       BE32_IN(&footer->crtr_ver);
+       BE32_IN(&footer->crtr_os);
+       BE64_IN(&footer->orig_size);
+       BE64_IN(&footer->curr_size);
+       BE32_IN(&footer->geometry);
+       BE32_IN(&footer->type);
+       BE32_IN(&footer->checksum);
+}
+
+/*
+ * Apply BE32_OUT/BE64_OUT in place to every multi-byte integer field of
+ * *footer that is listed below; the counterpart of vhd_footer_in().
+ * NOTE(review): the BE*_OUT macros are defined outside this part of the
+ * file; presumably they convert a host-order value to big-endian for
+ * writing to disk -- confirm against their definition.
+ */
+void
+vhd_footer_out(vhd_footer_t *footer)
+{
+       BE32_OUT(&footer->features);
+       BE32_OUT(&footer->ff_version);
+       BE64_OUT(&footer->data_offset);
+       BE32_OUT(&footer->timestamp);
+       BE32_OUT(&footer->crtr_ver);
+       BE32_OUT(&footer->crtr_os);
+       BE64_OUT(&footer->orig_size);
+       BE64_OUT(&footer->curr_size);
+       BE32_OUT(&footer->geometry);
+       BE32_OUT(&footer->type);
+       BE32_OUT(&footer->checksum);
+}
+
+/*
+ * Apply BE32_IN/BE64_IN in place to the multi-byte integer fields of
+ * *header and of every parent locator slot in header->loc.
+ * NOTE(review): the BE*_IN macros are defined outside this part of the
+ * file; presumably they convert a big-endian on-disk value to host byte
+ * order -- confirm against their definition.
+ */
+void
+vhd_header_in(vhd_header_t *header)
+{
+       int idx;
+       const int slots = sizeof(header->loc) / sizeof(vhd_parent_locator_t);
+
+       BE64_IN(&header->data_offset);
+       BE64_IN(&header->table_offset);
+       BE32_IN(&header->hdr_ver);
+       BE32_IN(&header->max_bat_size);
+       BE32_IN(&header->block_size);
+       BE32_IN(&header->checksum);
+       BE32_IN(&header->prt_ts);
+
+       idx = 0;
+       while (idx < slots) {
+               BE32_IN(&header->loc[idx].code);
+               BE32_IN(&header->loc[idx].data_space);
+               BE32_IN(&header->loc[idx].data_len);
+               BE64_IN(&header->loc[idx].data_offset);
+               idx++;
+       }
+}
+
+/*
+ * Apply BE32_OUT/BE64_OUT in place to the multi-byte integer fields of
+ * *header and of every parent locator slot in header->loc; the
+ * counterpart of vhd_header_in().
+ * NOTE(review): the BE*_OUT macros are defined outside this part of the
+ * file; presumably they convert a host-order value to big-endian for
+ * writing to disk -- confirm against their definition.
+ */
+void
+vhd_header_out(vhd_header_t *header)
+{
+       int idx;
+       const int slots = sizeof(header->loc) / sizeof(vhd_parent_locator_t);
+
+       BE64_OUT(&header->data_offset);
+       BE64_OUT(&header->table_offset);
+       BE32_OUT(&header->hdr_ver);
+       BE32_OUT(&header->max_bat_size);
+       BE32_OUT(&header->block_size);
+       BE32_OUT(&header->checksum);
+       BE32_OUT(&header->prt_ts);
+
+       idx = 0;
+       while (idx < slots) {
+               BE32_OUT(&header->loc[idx].code);
+               BE32_OUT(&header->loc[idx].data_space);
+               BE32_OUT(&header->loc[idx].data_len);
+               BE64_OUT(&header->loc[idx].data_offset);
+               idx++;
+       }
+}
+
+/*
+ * Apply BE32_IN/BE64_IN in place to the integer fields of
+ * batmap->header that are listed below; batmap->map is not touched.
+ * NOTE(review): the BE*_IN macros are defined outside this part of the
+ * file; presumably they convert a big-endian on-disk value to host byte
+ * order -- confirm against their definition.
+ */
+void
+vhd_batmap_header_in(vhd_batmap_t *batmap)
+{
+       BE64_IN(&batmap->header.batmap_offset);
+       BE32_IN(&batmap->header.batmap_size);
+       BE32_IN(&batmap->header.batmap_version);
+       BE32_IN(&batmap->header.checksum);
+}
+
+/*
+ * Apply BE32_OUT/BE64_OUT in place to the integer fields of
+ * batmap->header that are listed below; the counterpart of
+ * vhd_batmap_header_in().  batmap->map is not touched.
+ * NOTE(review): the BE*_OUT macros are defined outside this part of the
+ * file; presumably they convert a host-order value to big-endian for
+ * writing to disk -- confirm against their definition.
+ */
+void
+vhd_batmap_header_out(vhd_batmap_t *batmap)
+{
+       BE64_OUT(&batmap->header.batmap_offset);
+       BE32_OUT(&batmap->header.batmap_size);
+       BE32_OUT(&batmap->header.batmap_version);
+       BE32_OUT(&batmap->header.checksum);
+}
+
+/*
+ * Apply BE32_IN in place to each of the bat->entries elements of
+ * bat->bat.
+ * NOTE(review): BE32_IN is defined outside this part of the file;
+ * presumably it converts a big-endian on-disk value to host byte order
+ * -- confirm against its definition.
+ */
+void
+vhd_bat_in(vhd_bat_t *bat)
+{
+       int idx = 0;
+
+       while (idx < bat->entries) {
+               BE32_IN(&bat->bat[idx]);
+               idx++;
+       }
+}
+
+/*
+ * Apply BE32_OUT in place to each of the bat->entries elements of
+ * bat->bat; the counterpart of vhd_bat_in().
+ * NOTE(review): BE32_OUT is defined outside this part of the file;
+ * presumably it converts a host-order value to big-endian for writing to
+ * disk -- confirm against its definition.
+ */
+void
+vhd_bat_out(vhd_bat_t *bat)
+{
+       int idx = 0;
+
+       while (idx < bat->entries) {
+               BE32_OUT(&bat->bat[idx]);
+               idx++;
+       }
+}
+
+/*
+ * Compute the checksum of *footer: the one's complement of the sum of
+ * all bytes of the structure, taken with the checksum field set to zero.
+ * The stored checksum is put back before returning, so *footer is
+ * unchanged on return.
+ */
+uint32_t
+vhd_checksum_footer(vhd_footer_t *footer)
+{
+       size_t pos;
+       uint32_t sum = 0;
+       const uint32_t saved = footer->checksum;
+       const unsigned char *bytes = (const unsigned char *)footer;
+
+       /* the checksum field itself must not contribute to the sum */
+       footer->checksum = 0;
+
+       for (pos = 0; pos < sizeof(vhd_footer_t); pos++)
+               sum += (uint32_t)bytes[pos];
+
+       footer->checksum = saved;
+       return ~sum;
+}
+
+/*
+ * Validate a footer that is already in host byte order.
+ *
+ * The cookie must be either HD_COOKIE or VHD_POISON_COOKIE and the
+ * stored checksum must match vhd_checksum_footer().  One exception is
+ * tolerated: for a hidden footer created by a "tap" application with
+ * creator version 0.1 or 1.1, a checksum that matches when computed
+ * with the hidden flag cleared is accepted as well.
+ *
+ * Returns 0 when the footer is valid and -EINVAL otherwise (the reason
+ * is logged through VHDLOG).  *footer is unchanged on return.
+ */
+int
+vhd_validate_footer(vhd_footer_t *footer)
+{
+       int csize;
+       uint32_t checksum;
+
+       csize = sizeof(footer->cookie);
+       if (memcmp(footer->cookie, HD_COOKIE, csize) != 0 &&
+           memcmp(footer->cookie, VHD_POISON_COOKIE, csize) != 0) {
+               /*
+                * the cookie field need not be NUL-terminated: copy
+                * exactly its bytes, never anything behind the field
+                */
+               char buf[sizeof(footer->cookie) + 1];
+               memcpy(buf, footer->cookie, csize);
+               buf[csize] = '\0';
+               VHDLOG("invalid footer cookie: %s\n", buf);
+               return -EINVAL;
+       }
+
+       checksum = vhd_checksum_footer(footer);
+       if (checksum != footer->checksum) {
+               /*
+                * early td-util did not re-calculate
+                * checksum when marking vhds 'hidden'
+                */
+               if (footer->hidden &&
+                   !strncmp(footer->crtr_app, "tap", 3) &&
+                   (footer->crtr_ver == VHD_VERSION(0, 1) ||
+                    footer->crtr_ver == VHD_VERSION(1, 1))) {
+                       char tmp = footer->hidden;
+                       footer->hidden = 0;
+                       checksum = vhd_checksum_footer(footer);
+                       footer->hidden = tmp;
+
+                       if (checksum == footer->checksum)
+                               return 0;
+               }
+
+               VHDLOG("invalid footer checksum: "
+                      "footer = 0x%08x, calculated = 0x%08x\n",
+                      footer->checksum, checksum);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Compute the checksum of a header: the bitwise complement of the sum
+ * of every byte of the structure, taken while the checksum field is
+ * zero.  The field's previous value is restored before returning.
+ */
+uint32_t
+vhd_checksum_header(vhd_header_t *header)
+{
+       uint32_t sum = 0;
+       uint32_t saved = header->checksum;
+       unsigned char *cur = (unsigned char *)header;
+       unsigned char *end = cur + sizeof(vhd_header_t);
+
+       /* the checksum field itself must not contribute to the sum */
+       header->checksum = 0;
+
+       for (; cur != end; cur++)
+               sum += (uint32_t)*cur;
+
+       header->checksum = saved;
+       return ~sum;
+}
+
+/*
+ * Validate an in-memory header.
+ *
+ * Checks, in order: the cookie (DD_COOKIE), the header version
+ * (0x00010000), the data_offset field (all ones), the platform code of
+ * every parent locator, and finally the checksum.
+ *
+ * Returns 0 if the header is valid, -EINVAL otherwise.
+ */
+int
+vhd_validate_header(vhd_header_t *header)
+{
+       int i, n;
+       uint32_t checksum;
+
+       if (memcmp(header->cookie, DD_COOKIE, 8) != 0) {
+               char buf[9];
+               /*
+                * only the 8 cookie bytes compared above are read; the
+                * field carries no NUL terminator of its own
+                */
+               snprintf(buf, sizeof(buf), "%.8s", header->cookie);
+               VHDLOG("invalid header cookie: %s\n", buf);
+               return -EINVAL;
+       }
+
+       if (header->hdr_ver != 0x00010000) {
+               VHDLOG("invalid header version 0x%08x\n", header->hdr_ver);
+               return -EINVAL;
+       }
+
+       if (header->data_offset != 0xFFFFFFFFFFFFFFFF) {
+               VHDLOG("invalid header data_offset 0x%016"PRIx64"\n",
+                      header->data_offset);
+               return -EINVAL;
+       }
+
+       n = sizeof(header->loc) / sizeof(vhd_parent_locator_t);
+       for (i = 0; i < n; i++)
+               if (vhd_validate_platform_code(header->loc[i].code))
+                       return -EINVAL;
+
+       checksum = vhd_checksum_header(header);
+       if (checksum != header->checksum) {
+               VHDLOG("invalid header checksum: "
+                      "header = 0x%08x, calculated = 0x%08x\n",
+                      header->checksum, checksum);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * A bat is considered valid (0) when its entry array pointer is
+ * non-NULL; otherwise -EINVAL is returned.
+ */
+static inline int
+vhd_validate_bat(vhd_bat_t *bat)
+{
+       return bat->bat ? 0 : -EINVAL;
+}
+
+/*
+ * Compute the checksum of the batmap bitmap: the bitwise complement of
+ * the sum of its bytes.  Batmap version 1.1 adds each byte as a plain
+ * char, every other version adds it as an unsigned char.
+ * NOTE(review): the plain-char variant is presumably kept so that
+ * checksums written by older tools still verify -- confirm.
+ */
+uint32_t
+vhd_checksum_batmap(vhd_batmap_t *batmap)
+{
+       int pos, nbytes, plain_char;
+       char *bytes = batmap->map;
+       uint32_t sum = 0;
+
+       nbytes     = vhd_sectors_to_bytes(batmap->header.batmap_size);
+       plain_char = (batmap->header.batmap_version ==
+                     VHD_BATMAP_VERSION(1, 1));
+
+       for (pos = 0; pos < nbytes; pos++)
+               sum += plain_char ? (uint32_t)bytes[pos]
+                                 : (uint32_t)(unsigned char)bytes[pos];
+
+       return ~sum;
+}
+
+/*
+ * Validate a batmap header: the cookie must equal VHD_BATMAP_COOKIE
+ * and the version must not exceed VHD_BATMAP_CURRENT_VERSION.
+ * Returns 0 when both hold, -EINVAL otherwise.
+ */
+int
+vhd_validate_batmap_header(vhd_batmap_t *batmap)
+{
+       int cookie_differs;
+
+       cookie_differs = memcmp(batmap->header.cookie, VHD_BATMAP_COOKIE, 8);
+       if (cookie_differs != 0)
+               return -EINVAL;
+
+       return (batmap->header.batmap_version > VHD_BATMAP_CURRENT_VERSION) ?
+               -EINVAL : 0;
+}
+
+/*
+ * Validate a loaded batmap: the bitmap pointer must be non-NULL and
+ * the computed checksum must equal the one stored in the header.
+ * Returns 0 on success, -EINVAL otherwise.
+ */
+int
+vhd_validate_batmap(vhd_batmap_t *batmap)
+{
+       if (batmap->map == NULL)
+               return -EINVAL;
+
+       if (vhd_checksum_batmap(batmap) != batmap->header.checksum)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Store in *_off the file offset of the batmap header: the start of
+ * the bat (header.table_offset) plus the padded size of a bat holding
+ * header.max_bat_size 32-bit entries.  Always returns 0.
+ */
+int
+vhd_batmap_header_offset(vhd_context_t *ctx, off_t *_off)
+{
+       off_t pos;
+       size_t bat_bytes;
+
+       bat_bytes = ctx->header.max_bat_size * sizeof(uint32_t);
+
+       pos  = ctx->header.table_offset;
+       pos += vhd_bytes_padded(bat_bytes);
+
+       *_off = pos;
+       return 0;
+}
+
+/*
+ * Return 0 if code is one of the known parent locator platform codes;
+ * otherwise log the offending value and return -EINVAL.
+ */
+int
+vhd_validate_platform_code(uint32_t code)
+{
+       switch (code) {
+       case PLAT_CODE_NONE:
+       case PLAT_CODE_WI2R:
+       case PLAT_CODE_WI2K:
+       case PLAT_CODE_W2RU:
+       case PLAT_CODE_W2KU:
+       case PLAT_CODE_MAC:
+       case PLAT_CODE_MACX:
+               break;
+       default:
+               VHDLOG("invalid parent locator code %u\n", code);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Number of parent locator slots in the header's loc array.
+ */
+int
+vhd_parent_locator_count(vhd_context_t *ctx)
+{
+       int slots = sizeof(ctx->header.loc) / sizeof(vhd_parent_locator_t);
+
+       return slots;
+}
+
+/*
+ * Report the 'hidden' flag of a vhd in *hidden.
+ *
+ * For dynamic images created by tapdisk with creator version 0.1 or
+ * 1.1 the flag is taken from the footer copy read at offset 0 of the
+ * file; for everything else it is taken from the cached footer.
+ *
+ * Returns 0 on success or the error from reading the footer copy, in
+ * which case *hidden is left at 0.
+ */
+int
+vhd_hidden(vhd_context_t *ctx, int *hidden)
+{
+       int err;
+       vhd_footer_t backup;
+
+       *hidden = 0;
+
+       if (!vhd_type_dynamic(ctx) || !vhd_creator_tapdisk(ctx) ||
+           (ctx->footer.crtr_ver != VHD_VERSION(0, 1) &&
+            ctx->footer.crtr_ver != VHD_VERSION(1, 1))) {
+               *hidden = ctx->footer.hidden;
+               return 0;
+       }
+
+       err = vhd_read_footer_at(ctx, &backup, 0);
+       if (err) {
+               VHDLOG("error reading backup footer of %s: %d\n",
+                      ctx->file, err);
+               return err;
+       }
+
+       *hidden = backup.hidden;
+       return 0;
+}
+
+/*
+ * Count the images in the chain starting at ctx and store the result
+ * in *depth.
+ *
+ * Every image visited counts as one; a differencing image whose parent
+ * is raw (vhd_parent_raw) adds one more for that parent and ends the
+ * walk.  Parents are opened read-only into a local context, which is
+ * closed again before moving on; ctx itself is never closed here.
+ *
+ * Returns 0 on success.  On failure the error from locating or opening
+ * a parent is returned and *depth is left at 0.
+ */
+int
+vhd_chain_depth(vhd_context_t *ctx, int *depth)
+{
+       char *file;
+       int err, cnt;
+       vhd_context_t vhd, *cur;
+
+       err    = 0;
+       cnt    = 0;
+       *depth = 0;
+       file   = NULL;
+       cur    = ctx;
+
+       for (;;) {
+               cnt++;
+
+               /* anything but a differencing image ends the chain */
+               if (cur->footer.type != HD_TYPE_DIFF)
+                       break;
+
+               if (vhd_parent_raw(cur)) {
+                       cnt++;
+                       break;
+               }
+
+               err = vhd_parent_locator_get(cur, &file);
+               if (err) {
+                       file = NULL;
+                       break;
+               }
+
+               /*
+                * the parent path has been obtained, so the previous
+                * local context can be released before 'vhd' is reused
+                */
+               if (cur != ctx) {
+                       vhd_close(cur);
+                       cur = NULL;
+               }
+
+               err = vhd_open(&vhd, file, VHD_OPEN_RDONLY);
+               if (err)
+                       break;
+
+               cur = &vhd;
+               free(file);
+               file = NULL;
+       }
+
+       /* common exit: drop whatever path / local context is still held */
+       free(file);
+       if (cur && cur != ctx)
+               vhd_close(cur);
+
+       if (!err)
+               *depth = cnt;
+
+       return err;
+}
+
+/*
+ * Test the batmap bit for 'block'.  Returns 0 when the image has no
+ * batmap, when no bitmap is loaded, or when block lies beyond the
+ * number of bits covered by header.batmap_size sectors.
+ */
+int
+vhd_batmap_test(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block)
+{
+       if (vhd_has_batmap(ctx) && batmap->map &&
+           block < (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3)))
+               return test_bit(batmap->map, block);
+
+       return 0;
+}
+
+/*
+ * Set the batmap bit for 'block'.  Does nothing when the image has no
+ * batmap, when no bitmap is loaded, or when block lies beyond the
+ * number of bits covered by header.batmap_size sectors.
+ */
+void
+vhd_batmap_set(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block)
+{
+       if (vhd_has_batmap(ctx) && batmap->map &&
+           block < (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3)))
+               set_bit(batmap->map, block);
+}
+
+/*
+ * Clear the batmap bit for 'block'.  Does nothing when the image has
+ * no batmap, when no bitmap is loaded, or when block lies beyond the
+ * number of bits covered by header.batmap_size sectors.
+ */
+void
+vhd_batmap_clear(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block)
+{
+       if (vhd_has_batmap(ctx) && batmap->map &&
+           block < (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3)))
+               clear_bit(batmap->map, block);
+}
+
+/*
+ * Test bit 'block' of a sector bitmap.  Images created by tapdisk with
+ * creator version 0x00000001 go through old_test_bit; all others use
+ * test_bit.
+ */
+int
+vhd_bitmap_test(vhd_context_t *ctx, char *map, uint32_t block)
+{
+       int legacy;
+
+       legacy = (vhd_creator_tapdisk(ctx) &&
+                 ctx->footer.crtr_ver == 0x00000001);
+       if (legacy)
+               return old_test_bit(map, block);
+
+       return test_bit(map, block);
+}
+
+/*
+ * Set bit 'block' of a sector bitmap.  Images created by tapdisk with
+ * creator version 0x00000001 go through old_set_bit; all others use
+ * set_bit.
+ */
+void
+vhd_bitmap_set(vhd_context_t *ctx, char *map, uint32_t block)
+{
+       /*
+        * a void function must not 'return <expression>;' -- invoke the
+        * helpers as plain statements instead
+        */
+       if (vhd_creator_tapdisk(ctx) &&
+           ctx->footer.crtr_ver == 0x00000001) {
+               old_set_bit(map, block);
+               return;
+       }
+
+       set_bit(map, block);
+}
+
+/*
+ * Clear bit 'block' of a sector bitmap.  Images created by tapdisk
+ * with creator version 0x00000001 go through old_clear_bit; all others
+ * use clear_bit.
+ */
+void
+vhd_bitmap_clear(vhd_context_t *ctx, char *map, uint32_t block)
+{
+       /*
+        * a void function must not 'return <expression>;' -- invoke the
+        * helpers as plain statements instead
+        */
+       if (vhd_creator_tapdisk(ctx) &&
+           ctx->footer.crtr_ver == 0x00000001) {
+               old_clear_bit(map, block);
+               return;
+       }
+
+       clear_bit(map, block);
+}
+
+/*
+ * Store in *end the absolute offset of the first byte of the file
+ * which is not vhd metadata.
+ *
+ * For non-dynamic images *end is 0.  Otherwise it is the largest end
+ * offset among: the header, the (padded) bat, the batmap header and
+ * bitmap when the image has a batmap, and the data of every parent
+ * locator whose code is not PLAT_CODE_NONE.
+ *
+ * Returns 0 on success, or the error from loading the batmap or from
+ * vhd_batmap_header_offset (*end is 0 in that case).
+ */
+int
+vhd_end_of_headers(vhd_context_t *ctx, off_t *end)
+{
+       int err, i, n;
+       uint32_t bat_bytes;
+       off_t eom, bat_end;
+       vhd_parent_locator_t *loc;
+
+       *end = 0;
+
+       if (!vhd_type_dynamic(ctx))
+               return 0;
+
+       /* the header starts at footer.data_offset */
+       eom       = ctx->footer.data_offset + sizeof(vhd_header_t);
+
+       bat_bytes = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+       bat_end   = ctx->header.table_offset + bat_bytes;
+
+       eom       = MAX(eom, bat_end);
+
+       if (vhd_has_batmap(ctx)) {
+               off_t hdr_end, hdr_secs, map_end, map_secs;
+
+               /* needed for batmap.header.batmap_size / batmap_offset */
+               err = vhd_get_batmap(ctx);
+               if (err)
+                       return err;
+
+               hdr_secs = secs_round_up_no_zero(sizeof(vhd_batmap_header_t));
+               err      = vhd_batmap_header_offset(ctx, &hdr_end);
+               if (err)
+                       return err;
+
+               hdr_end += vhd_sectors_to_bytes(hdr_secs);
+               eom      = MAX(eom, hdr_end);
+
+               map_secs = ctx->batmap.header.batmap_size;
+               map_end  = (ctx->batmap.header.batmap_offset +
+                           vhd_sectors_to_bytes(map_secs));
+               eom      = MAX(eom, map_end);
+       }
+
+       /* parent locators */
+       n = sizeof(ctx->header.loc) / sizeof(vhd_parent_locator_t);
+
+       for (i = 0; i < n; i++) {
+               off_t loc_end;
+
+               loc = &ctx->header.loc[i];
+               /* unused slot: carries no data */
+               if (loc->code == PLAT_CODE_NONE)
+                       continue;
+
+               loc_end = loc->data_offset + vhd_parent_locator_size(loc);
+               eom     = MAX(eom, loc_end);
+       }
+
+       *end = eom;
+       return 0;
+}
+
+/*
+ * Store in *end the byte offset just past the last data of the image.
+ *
+ * For non-dynamic images this is the file size minus the size of the
+ * footer.  For dynamic images it is the larger of the end of the
+ * metadata (vhd_end_of_headers) and the end of the furthest allocated
+ * block found in the bat.
+ *
+ * Returns 0 on success or a negative error code; *end is not written
+ * on failure.
+ */
+int
+vhd_end_of_data(vhd_context_t *ctx, off_t *end)
+{
+       int i, err;
+       off_t max;
+       uint64_t blk;
+
+       if (!vhd_type_dynamic(ctx)) {
+               err = vhd_seek(ctx, 0, SEEK_END);
+               if (err)
+                       return err;
+
+               max = vhd_position(ctx);
+               if (max == (off_t)-1)
+                       return -errno;
+
+               *end = max - sizeof(vhd_footer_t);
+               return 0;
+       }
+
+       err = vhd_end_of_headers(ctx, &max);
+       if (err)
+               return err;
+
+       err = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       /* bat entries are compared in sectors, so convert the byte offset */
+       max >>= VHD_SECTOR_SHIFT;
+
+       for (i = 0; i < ctx->bat.entries; i++) {
+               blk = ctx->bat.bat[i];
+
+               if (blk != DD_BLK_UNUSED) {
+                       /* end of block = start + data sectors + bitmap sectors */
+                       blk += ctx->spb + ctx->bm_secs;
+                       max  = MAX(blk, max);
+               }
+       }
+
+       *end = vhd_sectors_to_bytes(max);
+       return 0;
+}
+
+/*
+ * Convert a time_t into a vhd timestamp: the offset of 'time' from
+ * VHD_EPOCH_START, truncated to 32 bits.
+ */
+uint32_t inline
+vhd_time(time_t time)
+{
+       return (uint32_t)(time - VHD_EPOCH_START);
+}
+
+/*
+ * Stringify the VHD timestamp for printing.
+ * As with ctime_r, target must be >=26 bytes.
+ *
+ * Returns the length of the string written to target.  If the
+ * timestamp cannot be converted, target is set to the empty string and
+ * 0 is returned.
+ */
+size_t
+vhd_time_to_string(uint32_t timestamp, char *target)
+{
+       char *cr;
+       time_t unix_timestamp;
+
+       unix_timestamp = (time_t)timestamp + VHD_EPOCH_START;
+
+       /* ctime_r leaves target untouched on failure: never scan it then */
+       if (ctime_r(&unix_timestamp, target) == NULL) {
+               target[0] = '\0';
+               return 0;
+       }
+
+       /* handle mad ctime_r newline appending. */
+       if ((cr = strchr(target, '\n')) != NULL)
+               *cr = '\0';
+
+       return (strlen(target));
+}
+
+/*
+ * Compute the encoded cylinder/head/sector geometry for a disk of
+ * 'size' bytes (algorithm nabbed from vhd specs).
+ *
+ * The sector count is capped at 65535 * 16 * 255 before the geometry
+ * is derived.  The result is packed with GEOM_ENCODE.
+ */
+uint32_t
+vhd_chs(uint64_t size)
+{
+       uint64_t total;
+       uint32_t secs, cylinders, heads, spt, cth;
+
+       /*
+        * keep the sector count in 64 bits until it has been capped:
+        * narrowing it first would make very large sizes wrap around
+        * to a small geometry instead of hitting the cap
+        */
+       total = secs_round_up_no_zero(size);
+
+       if (total > 65535 * 16 * 255)
+               total = 65535 * 16 * 255;
+
+       secs = (uint32_t)total;
+
+       if (secs >= 65535 * 16 * 63) {
+               spt   = 255;
+               cth   = secs / spt;
+               heads = 16;
+       } else {
+               spt   = 17;
+               cth   = secs / spt;
+               heads = (cth + 1023) / 1024;
+
+               if (heads < 4)
+                       heads = 4;
+
+               if (cth >= (heads * 1024) || heads > 16) {
+                       spt   = 31;
+                       cth   = secs / spt;
+                       heads = 16;
+               }
+
+               if (cth >= heads * 1024) {
+                       spt   = 63;
+                       cth   = secs / spt;
+                       heads = 16;
+               }
+       }
+
+       cylinders = cth / heads;
+
+       return GEOM_ENCODE(cylinders, heads, spt);
+}
+
+/*
+ * Make sure ctx->footer holds a valid footer: if the cached copy does
+ * not validate, it is re-read from the file.
+ */
+int
+vhd_get_footer(vhd_context_t *ctx)
+{
+       int err;
+
+       err = vhd_validate_footer(&ctx->footer);
+       if (err)
+               err = vhd_read_footer(ctx, &ctx->footer);
+
+       return err;
+}
+
+/*
+ * Make sure ctx->header holds a valid header: if the cached copy does
+ * not validate, it is re-read from the file.  Returns -EINVAL for
+ * images that are not dynamic.
+ */
+int
+vhd_get_header(vhd_context_t *ctx)
+{
+       int err;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       err = vhd_validate_header(&ctx->header);
+       if (err)
+               err = vhd_read_header(ctx, &ctx->header);
+
+       return err;
+}
+
+/*
+ * Make sure ctx->bat is loaded: if the cached bat does not validate,
+ * it is released and read again from the file.  Returns -EINVAL for
+ * images that are not dynamic.
+ */
+int
+vhd_get_bat(vhd_context_t *ctx)
+{
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       if (vhd_validate_bat(&ctx->bat)) {
+               vhd_put_bat(ctx);
+               return vhd_read_bat(ctx, &ctx->bat);
+       }
+
+       return 0;
+}
+
+/*
+ * Make sure ctx->batmap is loaded: if the cached batmap does not
+ * validate, it is released and read again from the file.  Returns
+ * -EINVAL for images without a batmap.
+ */
+int
+vhd_get_batmap(vhd_context_t *ctx)
+{
+       if (!vhd_has_batmap(ctx))
+               return -EINVAL;
+
+       if (vhd_validate_batmap(&ctx->batmap)) {
+               vhd_put_batmap(ctx);
+               return vhd_read_batmap(ctx, &ctx->batmap);
+       }
+
+       return 0;
+}
+
+/*
+ * Discard the cached footer by zeroing it.
+ */
+void
+vhd_put_footer(vhd_context_t *ctx)
+{
+       vhd_footer_t *cached = &ctx->footer;
+
+       memset(cached, 0, sizeof(vhd_footer_t));
+}
+
+/*
+ * Discard the cached header by zeroing it.
+ */
+void
+vhd_put_header(vhd_context_t *ctx)
+{
+       vhd_header_t *cached = &ctx->header;
+
+       memset(cached, 0, sizeof(vhd_header_t));
+}
+
+/*
+ * Release the cached bat of a dynamic image: free the entry array and
+ * zero the descriptor.  Does nothing for other image types.
+ */
+void
+vhd_put_bat(vhd_context_t *ctx)
+{
+       if (vhd_type_dynamic(ctx)) {
+               free(ctx->bat.bat);
+               memset(&ctx->bat, 0, sizeof(vhd_bat_t));
+       }
+}
+
+/*
+ * Release the cached batmap: free the bitmap and zero the descriptor.
+ * Does nothing unless the image is dynamic and has a batmap.
+ */
+void
+vhd_put_batmap(vhd_context_t *ctx)
+{
+       if (!vhd_type_dynamic(ctx) || !vhd_has_batmap(ctx))
+               return;
+
+       free(ctx->batmap.map);
+       memset(&ctx->batmap, 0, sizeof(vhd_batmap_t));
+}
+
+/*
+ * look for 511 byte footer at end of file
+ *
+ * Seeks to 511 bytes before the end of the file, reads whatever is
+ * there into a zeroed footer-sized buffer and validates the result.
+ * Returns 0 if a valid footer was found, a negative error otherwise;
+ * failures are logged.
+ */
+int
+vhd_read_short_footer(vhd_context_t *ctx, vhd_footer_t *footer)
+{
+       int err;
+       char *buf;
+       off_t eof;
+
+       buf = NULL;
+
+       err = vhd_seek(ctx, 0, SEEK_END);
+       if (err)
+               goto out;
+
+       eof = vhd_position(ctx);
+       if (eof == (off_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       err = vhd_seek(ctx, eof - 511, SEEK_SET);
+       if (err)
+               goto out;
+
+       err = posix_memalign((void **)&buf,
+                            VHD_SECTOR_SIZE, sizeof(vhd_footer_t));
+       if (err) {
+               /* posix_memalign returns a positive errno value */
+               buf = NULL;
+               err = -err;
+               goto out;
+       }
+
+       /* zero-fill so the bytes a short read does not supply are defined */
+       memset(buf, 0, sizeof(vhd_footer_t));
+
+       /*
+        * expecting short read here
+        * (the result of vhd_read is deliberately ignored; the footer
+        * validation below decides whether what was read is usable)
+        */
+       vhd_read(ctx, buf, sizeof(vhd_footer_t));
+
+       memcpy(footer, buf, sizeof(vhd_footer_t));
+
+       vhd_footer_in(footer);
+       err = vhd_validate_footer(footer);
+
+out:
+       if (err)
+               VHDLOG("%s: failed reading short footer: %d\n",
+                      ctx->file, err);
+       free(buf);
+       return err;
+}
+
+/*
+ * Read a footer from byte offset 'off' of the file into *footer,
+ * convert it with vhd_footer_in and validate it.
+ *
+ * Returns 0 on success or a negative error code (-EINVAL when the
+ * footer does not validate); failures are logged.
+ */
+int
+vhd_read_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off_t off)
+{
+       int err;
+       char *buf;
+
+       buf = NULL;
+
+       err = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto out;
+
+       err = posix_memalign((void **)&buf,
+                            VHD_SECTOR_SIZE, sizeof(vhd_footer_t));
+       if (err) {
+               /* posix_memalign returns a positive errno value */
+               buf = NULL;
+               err = -err;
+               goto out;
+       }
+
+       err = vhd_read(ctx, buf, sizeof(vhd_footer_t));
+       if (err)
+               goto out;
+
+       memcpy(footer, buf, sizeof(vhd_footer_t));
+
+       vhd_footer_in(footer);
+       err = vhd_validate_footer(footer);
+
+out:
+       /* off_t need not be 64 bits wide: cast to match PRIx64 */
+       if (err)
+               VHDLOG("%s: reading footer at 0x%08"PRIx64" failed: %d\n",
+                      ctx->file, (uint64_t)off, err);
+       free(buf);
+       return err;
+}
+
+/*
+ * Locate and read the footer of the image into *footer.
+ *
+ * Tries, in order: the last 512 bytes of the file, a 511 byte footer
+ * at the end of the file, and -- unless the image was opened with
+ * VHD_OPEN_STRICT -- the footer copy at offset 0.  The next location
+ * is only tried when the previous attempt returned -EINVAL.
+ */
+int
+vhd_read_footer(vhd_context_t *ctx, vhd_footer_t *footer)
+{
+       int err;
+       off_t eof;
+
+       err = vhd_seek(ctx, 0, SEEK_END);
+       if (err)
+               return err;
+
+       eof = vhd_position(ctx);
+       if (eof == (off_t)-1)
+               return -errno;
+
+       err = vhd_read_footer_at(ctx, footer, eof - 512);
+
+       if (err == -EINVAL)
+               err = vhd_read_short_footer(ctx, footer);
+
+       if (err == -EINVAL && !(ctx->oflags & VHD_OPEN_STRICT))
+               err = vhd_read_footer_at(ctx, footer, 0);
+
+       return err;
+}
+
+/*
+ * Read a header from byte offset 'off' of the file into *header,
+ * convert it with vhd_header_in and validate it.
+ *
+ * Returns 0 on success or a negative error code (-EINVAL when the
+ * image is not dynamic or the header does not validate); failures are
+ * logged.
+ */
+int
+vhd_read_header_at(vhd_context_t *ctx, vhd_header_t *header, off_t off)
+{
+       int err;
+       char *buf;
+
+       buf = NULL;
+
+       if (!vhd_type_dynamic(ctx)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       err = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto out;
+
+       err = posix_memalign((void **)&buf,
+                            VHD_SECTOR_SIZE, sizeof(vhd_header_t));
+       if (err) {
+               /* posix_memalign returns a positive errno value */
+               buf = NULL;
+               err = -err;
+               goto out;
+       }
+
+       err = vhd_read(ctx, buf, sizeof(vhd_header_t));
+       if (err)
+               goto out;
+
+       memcpy(header, buf, sizeof(vhd_header_t));
+
+       vhd_header_in(header);
+       err = vhd_validate_header(header);
+
+out:
+       /* off_t need not be 64 bits wide: cast to match PRIx64 */
+       if (err)
+               VHDLOG("%s: reading header at 0x%08"PRIx64" failed: %d\n",
+                      ctx->file, (uint64_t)off, err);
+       free(buf);
+       return err;
+}
+
+/*
+ * Read the header of a dynamic image from the offset recorded in the
+ * footer (footer.data_offset).  Returns -EINVAL, after logging, for
+ * images that are not dynamic.
+ */
+int
+vhd_read_header(vhd_context_t *ctx, vhd_header_t *header)
+{
+       off_t off;
+
+       if (!vhd_type_dynamic(ctx)) {
+               VHDLOG("%s is not dynamic!\n", ctx->file);
+               return -EINVAL;
+       }
+
+       off = ctx->footer.data_offset;
+       return vhd_read_header_at(ctx, header, off);
+}
+
+/*
+ * Read the bat of a dynamic image from header.table_offset into a
+ * freshly allocated, sector-aligned buffer and describe it in *bat.
+ *
+ * On success the buffer is owned by *bat and 0 is returned.  On
+ * failure the buffer is freed, *bat is zeroed, the error is logged and
+ * a negative error code is returned.
+ */
+int
+vhd_read_bat(vhd_context_t *ctx, vhd_bat_t *bat)
+{
+       int err;
+       off_t table_off;
+       size_t table_bytes;
+       char *table = NULL;
+
+       if (!vhd_type_dynamic(ctx)) {
+               err = -EINVAL;
+               goto fail;
+       }
+
+       table_off   = ctx->header.table_offset;
+       table_bytes = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+
+       err = posix_memalign((void **)&table, VHD_SECTOR_SIZE, table_bytes);
+       if (err) {
+               /* posix_memalign returns a positive errno value */
+               table = NULL;
+               err   = -err;
+               goto fail;
+       }
+
+       err = vhd_seek(ctx, table_off, SEEK_SET);
+       if (!err)
+               err = vhd_read(ctx, table, table_bytes);
+       if (err)
+               goto fail;
+
+       bat->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+       bat->entries = ctx->header.max_bat_size;
+       bat->bat     = (uint32_t *)table;
+
+       vhd_bat_in(bat);
+
+       return 0;
+
+fail:
+       free(table);
+       memset(bat, 0, sizeof(vhd_bat_t));
+       VHDLOG("%s: failed to read bat: %d\n", ctx->file, err);
+       return err;
+}
+
+/*
+ * Read the batmap header from the offset computed by
+ * vhd_batmap_header_offset into batmap->header and convert it with
+ * vhd_batmap_header_in.
+ *
+ * Returns 0 on success.  On failure batmap->header is zeroed, the
+ * error is logged and a negative error code is returned.
+ */
+static int
+vhd_read_batmap_header(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+       int err;
+       off_t hdr_off;
+       size_t hdr_bytes;
+       char *raw = NULL;
+
+       err = vhd_batmap_header_offset(ctx, &hdr_off);
+       if (!err)
+               err = vhd_seek(ctx, hdr_off, SEEK_SET);
+       if (err)
+               goto fail;
+
+       hdr_bytes = vhd_bytes_padded(sizeof(vhd_batmap_header_t));
+       err = posix_memalign((void **)&raw, VHD_SECTOR_SIZE, hdr_bytes);
+       if (err) {
+               /* posix_memalign returns a positive errno value */
+               raw = NULL;
+               err = -err;
+               goto fail;
+       }
+
+       err = vhd_read(ctx, raw, hdr_bytes);
+       if (err)
+               goto fail;
+
+       memcpy(&batmap->header, raw, sizeof(vhd_batmap_header_t));
+       free(raw);
+       raw = NULL;
+
+       vhd_batmap_header_in(batmap);
+
+       return 0;
+
+fail:
+       free(raw);
+       memset(&batmap->header, 0, sizeof(vhd_batmap_header_t));
+       VHDLOG("%s: failed to read batmap header: %d\n", ctx->file, err);
+       return err;
+}
+
+/*
+ * Read the batmap bitmap (header.batmap_size sectors starting at
+ * header.batmap_offset) into a freshly allocated, sector-aligned
+ * buffer and hand it to batmap->map.
+ *
+ * Returns 0 on success.  On failure the buffer is freed, batmap->map
+ * is set to NULL, the error is logged and a negative error code is
+ * returned.
+ */
+static int
+vhd_read_batmap_map(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+       int err;
+       char *bits;
+       off_t map_off;
+       size_t nbytes;
+
+       nbytes = vhd_sectors_to_bytes(batmap->header.batmap_size);
+
+       err = posix_memalign((void **)&bits, VHD_SECTOR_SIZE, nbytes);
+       if (err) {
+               /* posix_memalign returns a positive errno value */
+               bits = NULL;
+               err  = -err;
+               goto fail;
+       }
+
+       map_off = batmap->header.batmap_offset;
+
+       err = vhd_seek(ctx, map_off, SEEK_SET);
+       if (!err)
+               err = vhd_read(ctx, bits, nbytes);
+       if (err)
+               goto fail;
+
+       batmap->map = bits;
+       return 0;
+
+fail:
+       free(bits);
+       batmap->map = NULL;
+       VHDLOG("%s: failed to read batmap: %d\n", ctx->file, err);
+       return err;
+}
+
+/*
+ * Load the batmap of an image into *batmap: read and validate the
+ * header, read the bitmap, then validate the bitmap checksum.
+ *
+ * Returns -EINVAL when the image has no batmap, the first error hit
+ * otherwise, or 0 on success.  When the final validation fails the
+ * bitmap is freed and *batmap is zeroed.
+ */
+int
+vhd_read_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+       int err;
+
+       if (!vhd_has_batmap(ctx))
+               return -EINVAL;
+
+       memset(batmap, 0, sizeof(vhd_batmap_t));
+
+       err = vhd_read_batmap_header(ctx, batmap);
+       if (!err)
+               err = vhd_validate_batmap_header(batmap);
+       if (!err)
+               err = vhd_read_batmap_map(ctx, batmap);
+       if (err)
+               return err;
+
+       err = vhd_validate_batmap(batmap);
+       if (err) {
+               free(batmap->map);
+               memset(batmap, 0, sizeof(vhd_batmap_t));
+       }
+
+       return err;
+}
+
+/*
+ * Tell whether the image carries a batmap (1) or not (0).
+ *
+ * Only dynamic images created by tapdisk can have one: creator
+ * versions up to 0.1 never do, versions from 1.2 on always do.  For
+ * anything in between the batmap header decides; if the cached header
+ * does not validate it is read from the file into ctx->batmap first.
+ */
+int
+vhd_has_batmap(vhd_context_t *ctx)
+{
+       if (!vhd_type_dynamic(ctx) || !vhd_creator_tapdisk(ctx))
+               return 0;
+
+       if (ctx->footer.crtr_ver <= VHD_VERSION(0, 1))
+               return 0;
+
+       if (ctx->footer.crtr_ver >= VHD_VERSION(1, 2))
+               return 1;
+
+       /*
+        * VHDs of version 1.1 probably have a batmap, but may not
+        * if they were updated from version 0.1 via vhd-update.
+        */
+       if (vhd_validate_batmap_header(&ctx->batmap) == 0)
+               return 1;
+
+       if (vhd_read_batmap_header(ctx, &ctx->batmap) != 0)
+               return 0;
+
+       return (vhd_validate_batmap_header(&ctx->batmap) == 0);
+}
+
+/*
+ * Is this a block device (with a fixed size)? This affects whether the
+ * file can be truncated and where the footer is written for VHDs.
+ *
+ * Sets *is_block to 1 for a block device and 0 otherwise.  Returns 0
+ * on success or -errno when the file cannot be stat'ed (in which case
+ * *is_block is left untouched).
+ */
+int
+vhd_test_file_fixed(const char *file, int *is_block)
+{
+       struct stat st;
+
+       if (stat(file, &st) == -1)
+               return -errno;
+
+       *is_block = S_ISBLK(st.st_mode) ? 1 : 0;
+       return 0;
+}
+
+/*
+ * Resolve the parent path recorded in a vhd to a readable file.
+ *
+ * An absolute 'parent' that is readable is returned as is (copied).
+ * Otherwise 'parent' is tried relative to the directory containing
+ * ctx->file, and the resolved real path is returned.
+ *
+ * On success 0 is returned and *_location holds a malloc'ed string the
+ * caller must free.  On failure *_location is NULL and a negative
+ * error code is returned.
+ */
+int
+vhd_find_parent(vhd_context_t *ctx, const char *parent, char **_location)
+{
+       int err;
+       char *location, *cpath, *cdir, *path;
+
+       err        = 0;
+       path       = NULL;
+       cpath      = NULL;
+       location   = NULL;
+       *_location = NULL;
+
+       if (!parent)
+               return -EINVAL;
+
+       if (parent[0] == '/') {
+               if (!access(parent, R_OK)) {
+                       path = strdup(parent);
+                       if (!path)
+                               return -ENOMEM;
+                       *_location = path;
+                       return 0;
+               }
+       }
+
+       /* check parent path relative to child's directory */
+       cpath = realpath(ctx->file, NULL);
+       if (!cpath) {
+               err = -errno;
+               goto out;
+       }
+
+       cdir = dirname(cpath);
+       if (asprintf(&location, "%s/%s", cdir, parent) == -1) {
+               /* errno is not a reliable indicator after asprintf */
+               err = -ENOMEM;
+               location = NULL;
+               goto out;
+       }
+
+       if (access(location, R_OK)) {
+               err = -errno;
+               goto out;
+       }
+
+       path = realpath(location, NULL);
+       if (!path) {
+               err = -errno;
+               goto out;
+       }
+
+       *_location = path;
+
+       /* the temporaries are released on success as well as on failure */
+out:
+       free(location);
+       free(cpath);
+       return err;
+}
+
+/*
+ * Encode a parent path as a MACX locator: "file://" followed by the
+ * name, converted from the current locale's codeset to UTF-8.
+ *
+ * On success 0 is returned, *out holds a malloc'ed buffer (not NUL
+ * terminated) the caller must free and *outlen its length in bytes.
+ * On failure a negative error code is returned and *out is NULL.
+ */
+static int
+vhd_macx_encode_location(char *name, char **out, int *outlen)
+{
+       iconv_t cd;
+       int len, err;
+       size_t ibl, obl, used;
+       char *uri, *uri_utf8, *uri_utf8p, *ret;
+       const char *urip;
+       char *codeset;
+
+       err     = 0;
+       ret     = NULL;
+       *out    = NULL;
+       *outlen = 0;
+       cd      = (iconv_t)-1;
+       len     = strlen(name) + strlen("file://");
+
+       ibl     = len;
+       obl     = len * 2;
+
+       urip = uri = malloc(ibl + 1);
+       uri_utf8 = uri_utf8p = malloc(obl);
+
+       /* leave through 'out' so that a lone successful malloc is freed */
+       if (!uri || !uri_utf8) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       codeset = nl_langinfo(CODESET);
+       cd = iconv_open("UTF-8", codeset);
+       if (cd == (iconv_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       snprintf(uri, ibl+1, "file://%s", name);
+
+       if (iconv(cd,
+#ifdef __linux__
+           (char **)
+#endif
+           &urip, &ibl, &uri_utf8p, &obl) == (size_t)-1 ||
+           ibl) {
+               err = (errno ? -errno : -EIO);
+               goto out;
+       }
+
+       /*
+        * iconv decrements obl by the number of bytes it produced; the
+        * converted text may be longer than the input, so report what
+        * was really written rather than the input length
+        */
+       used = ((size_t)len * 2) - obl;
+
+       ret = malloc(used);
+       if (!ret) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       memcpy(ret, uri_utf8, used);
+       *outlen = used;
+       *out    = ret;
+
+ out:
+       free(uri);
+       free(uri_utf8);
+       if (cd != (iconv_t)-1)
+               iconv_close(cd);
+
+       return err;
+}
+
+/*
+ * Encode a parent path as a W2KU/W2RU locator: forward slashes become
+ * backslashes, relative paths are made to start with ".\", and the
+ * result is converted from the current locale's codeset to UTF-16LE.
+ *
+ * On success 0 is returned, *out holds a malloc'ed buffer (not NUL
+ * terminated) the caller must free and *outlen its length in bytes.
+ * On failure a negative error code is returned and *out is NULL.
+ */
+static int
+vhd_w2u_encode_location(char *name, char **out, int *outlen)
+{
+       iconv_t cd;
+       int len, err;
+       size_t ibl, obl, used;
+       char *uri, *uri_utf16, *uri_utf16p, *tmp, *ret;
+       const char *urip;
+       char *codeset;
+
+       err     = 0;
+       ret     = NULL;
+       *out    = NULL;
+       *outlen = 0;
+       cd      = (iconv_t) -1;
+
+       /*
+        * MICROSOFT_COMPAT
+        * relative paths must start with ".\"
+        */
+       if (name[0] != '/') {
+               tmp = strstr(name, "./");
+               if (tmp == name)
+                       tmp += strlen("./");
+               else
+                       tmp = name;
+
+               err = asprintf(&uri, ".\\%s", tmp);
+       } else
+               err = asprintf(&uri, "%s", name);
+
+       if (err == -1)
+               return -ENOMEM;
+
+       tmp = uri;
+       while (*tmp != '\0') {
+               if (*tmp == '/')
+                       *tmp = '\\';
+               tmp++;
+       }
+
+       len  = strlen(uri);
+       ibl  = len;
+       obl  = len * 2;
+       urip = uri;
+
+       uri_utf16 = uri_utf16p = malloc(obl);
+       if (!uri_utf16) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       /*
+        * MICROSOFT_COMPAT
+        * little endian unicode here
+        */
+       codeset = nl_langinfo(CODESET);
+       cd = iconv_open("UTF-16LE", codeset);
+       if (cd == (iconv_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       if (iconv(cd,
+#ifdef __linux__
+           (char **)
+#endif
+           &urip, &ibl, &uri_utf16p, &obl) == (size_t)-1 ||
+           ibl) {
+               err = (errno ? -errno : -EIO);
+               goto out;
+       }
+
+       /*
+        * iconv decrements obl by the number of bytes it produced.
+        * Multibyte input yields fewer than two output bytes per input
+        * byte, so only the part really written may be handed out; the
+        * rest of the buffer is uninitialised.
+        */
+       used = ((size_t)len * 2) - obl;
+
+       ret = malloc(used);
+       if (!ret) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       memcpy(ret, uri_utf16, used);
+       *outlen = used;
+       *out    = ret;
+       err     = 0;
+
+ out:
+       free(uri);
+       free(uri_utf16);
+       if (cd != (iconv_t)-1)
+               iconv_close(cd);
+
+       return err;
+}
+
+/*
+ * Decode a MACX locator: convert 'len' bytes of UTF-8 at 'in' to the
+ * current locale's codeset, using 'out' as scratch space, and strip
+ * the leading "file://".
+ *
+ * 'out' must provide len + 1 bytes, since a terminating NUL is written
+ * after the converted text.  Returns a malloc'ed copy of the path, or
+ * NULL on any failure.
+ */
+static char *
+vhd_macx_decode_location(const char *in, char *out, int len)
+{
+       iconv_t cd;
+       char *name;
+       size_t ibl, obl;
+       char *codeset;
+
+       name = out;
+       ibl  = obl = len;
+
+       codeset = nl_langinfo(CODESET);
+       cd = iconv_open(codeset, "UTF-8");
+       if (cd == (iconv_t)-1)
+               return NULL;
+
+       if (iconv(cd,
+#ifdef __linux__
+               (char **)
+#endif
+               &in, &ibl, &out, &obl) == (size_t)-1 || ibl) {
+               /* do not leak the conversion descriptor on failure */
+               iconv_close(cd);
+               return NULL;
+       }
+
+       iconv_close(cd);
+       *out = '\0';
+
+       if (strstr(name, "file://") != name)
+               return NULL;
+
+       name += strlen("file://");
+
+       return strdup(name);
+}
+
+/*
+ * Decode a W2KU/W2RU style locator: convert 'len' bytes at 'in' from
+ * the encoding named by utf_type to the current locale's codeset,
+ * using 'out' as scratch space, turn backslashes into forward slashes
+ * and drop a leading "C:" / "c:".
+ *
+ * 'out' must provide len + 1 bytes, since a terminating NUL is written
+ * after the converted text.  Returns a malloc'ed copy of the path, or
+ * NULL on any failure.
+ */
+static char *
+vhd_w2u_decode_location(const char *in, char *out, int len, char *utf_type)
+{
+       iconv_t cd;
+       char *name, *tmp;
+       size_t ibl, obl;
+       char *codeset;
+
+       tmp = name = out;
+       ibl = obl  = len;
+
+       codeset = nl_langinfo(CODESET);
+       cd = iconv_open(codeset, utf_type);
+       if (cd == (iconv_t)-1)
+               return NULL;
+
+       if (iconv(cd,
+#ifdef __linux__
+               (char **)
+#endif
+               &in, &ibl, &out, &obl) == (size_t)-1 || ibl) {
+               /* do not leak the conversion descriptor on failure */
+               iconv_close(cd);
+               return NULL;
+       }
+
+       iconv_close(cd);
+       *out = '\0';
+
+       /* TODO: spaces */
+       while (tmp != out) {
+               if (*tmp == '\\')
+                       *tmp = '/';
+               tmp++;
+       }
+
+       if (strstr(name, "C:") == name || strstr(name, "c:") == name)
+               name += strlen("c:");
+
+       return strdup(name);
+}
+
+/*
+ * Decode the parent name stored in the header (header->prt_name) into
+ * a malloc'ed string returned through *buf.
+ *
+ * Images created by tapdisk with creator version 0.1 are decoded as
+ * UTF_16, all others as UTF_16BE.  Returns 0 on success or -EINVAL
+ * when the name cannot be decoded (*buf is NULL then).
+ */
+int
+vhd_header_decode_parent(vhd_context_t *ctx, vhd_header_t *header, char **buf)
+{
+       /*
+        * one byte more than the length handed to the decoder: it
+        * appends a NUL after the converted text, which lands just past
+        * the 512 bytes when the output fills them completely
+        */
+       char *code, out[512 + 1];
+
+       if (vhd_creator_tapdisk(ctx) &&
+           ctx->footer.crtr_ver == VHD_VERSION(0, 1))
+               code = UTF_16;
+       else
+               code = UTF_16BE;
+
+       *buf = vhd_w2u_decode_location(header->prt_name, out, 512, code);
+       return (*buf == NULL ? -EINVAL : 0);
+}
+
+/*
+ * Read and decode one parent locator of a differencing image.
+ *
+ * Only MACX, W2KU and W2RU locators are supported.  The locator data
+ * is read from loc->data_offset and loc->data_len bytes of it are
+ * decoded into a path.
+ *
+ * On success 0 is returned and *parent holds a malloc'ed string the
+ * caller must free.  On failure *parent is NULL, the locator is logged
+ * and a negative error code is returned.
+ */
+int
+vhd_parent_locator_read(vhd_context_t *ctx,
+                       vhd_parent_locator_t *loc, char **parent)
+{
+       int err, size;
+       char *raw, *out, *name;
+
+       raw     = NULL;
+       out     = NULL;
+       name    = NULL;
+       *parent = NULL;
+
+       if (ctx->footer.type != HD_TYPE_DIFF) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       switch (loc->code) {
+       case PLAT_CODE_MACX:
+       case PLAT_CODE_W2KU:
+       case PLAT_CODE_W2RU:
+               break;
+       default:
+               err = -EINVAL;
+               goto out;
+       }
+
+       err = vhd_seek(ctx, loc->data_offset, SEEK_SET);
+       if (err)
+               goto out;
+
+       size = vhd_parent_locator_size(loc);
+       if (size <= 0) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * data_len comes straight from the file: the decoders below
+        * consume data_len bytes of 'raw', which only holds 'size'
+        */
+       if ((uint64_t)loc->data_len > (uint64_t)size) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       err = posix_memalign((void **)&raw, VHD_SECTOR_SIZE, size);
+       if (err) {
+               /* posix_memalign returns a positive errno value */
+               raw = NULL;
+               err = -err;
+               goto out;
+       }
+
+       err = vhd_read(ctx, raw, size);
+       if (err)
+               goto out;
+
+       /* + 1: the decoders NUL terminate the converted text */
+       out = malloc(loc->data_len + 1);
+       if (!out) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       switch (loc->code) {
+       case PLAT_CODE_MACX:
+               name = vhd_macx_decode_location(raw, out, loc->data_len);
+               break;
+       case PLAT_CODE_W2KU:
+       case PLAT_CODE_W2RU:
+               name = vhd_w2u_decode_location(raw, out,
+                                              loc->data_len, UTF_16LE);
+               break;
+       }
+
+       if (!name) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       err     = 0;
+       *parent = name;
+
+out:
+       free(raw);
+       free(out);
+
+       if (err) {
+               VHDLOG("%s: error reading parent locator: %d\n",
+                      ctx->file, err);
+               VHDLOG("%s: locator: code %u, space 0x%x, len 0x%x, "
+                      "off 0x%"PRIx64"\n", ctx->file, loc->code, loc->data_space,
+                      loc->data_len, loc->data_offset);
+       }
+
+       return err;
+}
+
+/*
+ * Find the parent of a differencing image.
+ *
+ * Walks the header's parent locators in order; the first one that can
+ * be read and whose path resolves through vhd_find_parent wins.
+ *
+ * On success 0 is returned and *parent holds a malloc'ed path the
+ * caller must free.  Otherwise *parent is NULL and the error of the
+ * last locator tried is returned (-EINVAL for non-differencing
+ * images).
+ */
+int
+vhd_parent_locator_get(vhd_context_t *ctx, char **parent)
+{
+       int idx, count, err;
+       char *name, *resolved;
+       vhd_parent_locator_t *loc;
+
+       err     = 0;
+       *parent = NULL;
+
+       if (ctx->footer.type != HD_TYPE_DIFF)
+               return -EINVAL;
+
+       count = vhd_parent_locator_count(ctx);
+       for (idx = 0; idx < count; idx++) {
+               loc = &ctx->header.loc[idx];
+
+               err = vhd_parent_locator_read(ctx, loc, &name);
+               if (err)
+                       continue;
+
+               err = vhd_find_parent(ctx, name, &resolved);
+               if (err)
+                       VHDLOG("%s: couldn't find parent %s (%d)\n",
+                              ctx->file, name, err);
+               free(name);
+
+               if (err)
+                       continue;
+
+               *parent = resolved;
+               return 0;
+       }
+
+       return err;
+}
+
+/*
+ * Encode the path of @parent as a parent locator of platform @code and
+ * write it to the image at byte offset @off.
+ *
+ * @loc is zeroed up front and only filled in when everything succeeded.
+ * @max_bytes, when non-zero, caps the padded on-disk size of the encoded
+ * locator; exceeding it yields -ENAMETOOLONG.
+ *
+ * Returns 0 on success or a negative errno value.  -EINVAL is returned
+ * for non-differencing images, unsupported platform codes, and parents
+ * that are neither regular files nor block devices.
+ */
+int
+vhd_parent_locator_write_at(vhd_context_t *ctx,
+                           const char *parent, off_t off, uint32_t code,
+                           size_t max_bytes, vhd_parent_locator_t *loc)
+{
+       struct stat stats;
+       int err, len, size;
+       char *absolute_path, *relative_path, *encoded, *block;
+
+       memset(loc, 0, sizeof(vhd_parent_locator_t));
+
+       if (ctx->footer.type != HD_TYPE_DIFF)
+               return -EINVAL;
+
+       absolute_path = NULL;
+       relative_path = NULL;
+       encoded       = NULL;
+       block         = NULL;
+       size          = 0;
+       len           = 0;
+
+       /* only these three platform codes can be encoded below */
+       switch (code) {
+       case PLAT_CODE_MACX:
+       case PLAT_CODE_W2KU:
+       case PLAT_CODE_W2RU:
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       absolute_path = realpath(parent, NULL);
+       if (!absolute_path) {
+               err = -errno;
+               goto out;
+       }
+
+       err = stat(absolute_path, &stats);
+       if (err) {
+               err = -errno;
+               goto out;
+       }
+
+       if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       /* the locator stores the parent path relative to this image */
+       relative_path = relative_path_to(ctx->file, absolute_path, &err);
+       if (!relative_path || err) {
+               err = (err ? err : -EINVAL);
+               goto out;
+       }
+
+       switch (code) {
+       case PLAT_CODE_MACX:
+               err = vhd_macx_encode_location(relative_path, &encoded, &len);
+               break;
+       case PLAT_CODE_W2KU:
+       case PLAT_CODE_W2RU:
+               err = vhd_w2u_encode_location(relative_path, &encoded, &len);
+               break;
+       default:
+               err = -EINVAL;
+       }
+
+       if (err)
+               goto out;
+
+       err = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto out;
+
+       size = vhd_bytes_padded(len);
+
+       /* max_bytes == 0 means "no limit" */
+       if (max_bytes && size > max_bytes) {
+               err = -ENAMETOOLONG;
+               goto out;
+       }
+
+       err  = posix_memalign((void **)&block, VHD_SECTOR_SIZE, size);
+       if (err) {
+               block = NULL;
+               err   = -err;
+               goto out;
+       }
+
+       /* zero-fill so the padding after the encoded path is deterministic */
+       memset(block, 0, size);
+       memcpy(block, encoded, len);
+
+       err = vhd_write(ctx, block, size);
+       if (err)
+               goto out;
+
+       err = 0;
+
+out:
+       free(absolute_path);
+       free(relative_path);
+       free(encoded);
+       free(block);
+
+       if (!err) {
+               loc->res         = 0;
+               loc->code        = code;
+               loc->data_len    = len;
+               /*
+                * write number of bytes ('size') instead of number of sectors
+                * into loc->data_space to be compatible with MSFT, even though
+                * this goes against the specs
+                */
+               loc->data_space  = size; 
+               loc->data_offset = off;
+       }
+
+       return err;
+}
+
+/*
+ * Compute the offset of a footer occupying the final
+ * sizeof(vhd_footer_t) bytes of the file: seek to the end and step back
+ * by one footer.
+ *
+ * Returns 0 and stores the offset in *off, or the negative errno value
+ * reported by vhd_seek().  (The previous code returned the raw, positive
+ * errno, which disagreed with every other error path in this file and
+ * could be stale by the time it was read.)
+ */
+static int
+vhd_footer_offset_at_eof(vhd_context_t *ctx, off_t *off)
+{
+       int err;
+
+       err = vhd_seek(ctx, 0, SEEK_END);
+       if (err)
+               return err;
+
+       *off = vhd_position(ctx) - sizeof(vhd_footer_t);
+       return 0;
+}
+
+/*
+ * Read the sector bitmap of BAT entry @block into a freshly allocated,
+ * sector-aligned buffer.
+ *
+ * On success returns 0 and hands the buffer to the caller through
+ * *bufp; it must be released with free().  On failure *bufp is NULL and
+ * a negative errno value is returned: -EINVAL for non-dynamic images or
+ * unallocated blocks, -ERANGE when @block is outside the BAT.
+ */
+int
+vhd_read_bitmap(vhd_context_t *ctx, uint32_t block, char **bufp)
+{
+       int rc;
+       off_t pos;
+       size_t len;
+       uint64_t sector;
+       char *bitmap = NULL;
+
+       *bufp = NULL;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       rc = vhd_get_bat(ctx);
+       if (rc)
+               return rc;
+
+       if (block >= ctx->bat.entries)
+               return -ERANGE;
+
+       sector = ctx->bat.bat[block];
+       if (sector == DD_BLK_UNUSED)
+               return -EINVAL;
+
+       /* the bitmap sits at the start of the block: one bit per sector */
+       pos = vhd_sectors_to_bytes(sector);
+       len = vhd_bytes_padded(ctx->spb >> 3);
+
+       rc = vhd_seek(ctx, pos, SEEK_SET);
+       if (rc)
+               return rc;
+
+       rc = posix_memalign((void **)&bitmap, VHD_SECTOR_SIZE, len);
+       if (rc)
+               return -rc;
+
+       rc = vhd_read(ctx, bitmap, len);
+       if (rc) {
+               free(bitmap);
+               return rc;
+       }
+
+       *bufp = bitmap;
+       return 0;
+}
+
+/*
+ * Read the data area of BAT entry @block into a freshly allocated,
+ * sector-aligned buffer of ctx->spb sectors.
+ *
+ * If the block extends past the offset reported by
+ * vhd_footer_offset_at_eof(), only the bytes before that offset are read
+ * and the remainder of the buffer is zero-filled.
+ *
+ * On success returns 0 and stores the buffer in *bufp (caller frees it).
+ * On failure *bufp is NULL.  -EINVAL is returned for non-dynamic images
+ * and unallocated blocks, -ERANGE when @block is outside the BAT.
+ */
+int
+vhd_read_block(vhd_context_t *ctx, uint32_t block, char **bufp)
+{
+       int err;
+       char *buf;
+       size_t size;
+       uint64_t blk;
+       off_t end, off;
+
+       buf   = NULL;
+       *bufp = NULL;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       err = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       if (block >= ctx->bat.entries)
+               return -ERANGE;
+
+       blk  = ctx->bat.bat[block];
+       if (blk == DD_BLK_UNUSED)
+               return -EINVAL;
+
+       /* data follows the block's sector bitmap (bm_secs sectors) */
+       off  = vhd_sectors_to_bytes(blk + ctx->bm_secs);
+       size = vhd_sectors_to_bytes(ctx->spb);
+
+       err  = vhd_footer_offset_at_eof(ctx, &end);
+       if (err)
+               return err;
+
+       err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       if (err) {
+               err = -err;
+               goto fail;
+       }
+
+       /*
+        * Block is cut short by the footer: read what exists, zero the rest.
+        * NOTE(review): the buffer holds spb sectors while the memset length
+        * uses header.block_size; vhd_open() derives spb from block_size, so
+        * these presumably agree -- confirm.
+        * NOTE(review): if end < off, 'end - off' is negative and wraps when
+        * stored in size_t; nothing here guards against that.
+        */
+       if (end < off + ctx->header.block_size) {
+               size = end - off;
+               memset(buf + size, 0, ctx->header.block_size - size);
+       }
+
+       err  = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto fail;
+
+       err  = vhd_read(ctx, buf, size);
+       if (err)
+               goto fail;
+
+       *bufp = buf;
+       return 0;
+
+fail:
+       free(buf);
+       return err;
+}
+
+/*
+ * Write a copy of @footer to the image at byte offset @off.
+ *
+ * The footer is copied into a sector-aligned scratch buffer, its
+ * checksum is recomputed and the result validated; only then is it
+ * passed through vhd_footer_out() and written.  The caller's @footer is
+ * never modified.
+ *
+ * Returns 0 on success or a negative errno value (which is also logged).
+ */
+int
+vhd_write_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off_t off)
+{
+       int rc;
+       vhd_footer_t *copy = NULL;
+
+       rc = posix_memalign((void **)&copy, VHD_SECTOR_SIZE, sizeof(*copy));
+       if (rc) {
+               copy = NULL;
+               rc   = -rc;
+               goto done;
+       }
+
+       memcpy(copy, footer, sizeof(*copy));
+       copy->checksum = vhd_checksum_footer(copy);
+
+       rc = vhd_validate_footer(copy);
+       if (!rc)
+               rc = vhd_seek(ctx, off, SEEK_SET);
+       if (!rc) {
+               /* convert only after validation and a successful seek */
+               vhd_footer_out(copy);
+               rc = vhd_write(ctx, copy, sizeof(*copy));
+       }
+
+done:
+       if (rc)
+               VHDLOG("%s: failed writing footer at 0x%08"PRIx64": %d\n",
+                      ctx->file, off, rc);
+       free(copy);
+       return rc;
+}
+
+/*
+ * Write @footer to its primary location and, for dynamic images, to the
+ * backup copy at offset 0 as well.
+ *
+ * The primary location comes from vhd_footer_offset_at_eof() when
+ * ctx->is_block is set and from vhd_end_of_data() otherwise.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int
+vhd_write_footer(vhd_context_t *ctx, vhd_footer_t *footer)
+{
+       int rc;
+       off_t tail;
+
+       rc = ctx->is_block ? vhd_footer_offset_at_eof(ctx, &tail)
+                          : vhd_end_of_data(ctx, &tail);
+       if (rc)
+               return rc;
+
+       rc = vhd_write_footer_at(ctx, footer, tail);
+       if (rc || !vhd_type_dynamic(ctx))
+               return rc;
+
+       /* dynamic images carry a second footer copy at offset 0 */
+       return vhd_write_footer_at(ctx, footer, 0);
+}
+
+/*
+ * Write a copy of @header to the image at byte offset @off.
+ *
+ * Only valid for dynamic images.  The header is copied into a
+ * sector-aligned scratch buffer, its checksum recomputed and validated,
+ * passed through vhd_header_out() and then written.  The caller's
+ * @header is left untouched.
+ *
+ * Returns 0 on success or a negative errno value (which is also logged).
+ */
+int
+vhd_write_header_at(vhd_context_t *ctx, vhd_header_t *header, off_t off)
+{
+       int rc;
+       vhd_header_t *copy = NULL;
+
+       if (!vhd_type_dynamic(ctx)) {
+               rc = -EINVAL;
+               goto done;
+       }
+
+       rc = posix_memalign((void **)&copy, VHD_SECTOR_SIZE, sizeof(*copy));
+       if (rc) {
+               copy = NULL;
+               rc   = -rc;
+               goto done;
+       }
+
+       memcpy(copy, header, sizeof(*copy));
+       copy->checksum = vhd_checksum_header(copy);
+
+       rc = vhd_validate_header(copy);
+       if (rc)
+               goto done;
+
+       vhd_header_out(copy);
+
+       rc = vhd_seek(ctx, off, SEEK_SET);
+       if (!rc)
+               rc = vhd_write(ctx, copy, sizeof(*copy));
+
+done:
+       if (rc)
+               VHDLOG("%s: failed writing header at 0x%08"PRIx64": %d\n",
+                      ctx->file, off, rc);
+       free(copy);
+       return rc;
+}
+
+/*
+ * Write @header at the offset recorded in the footer's data_offset
+ * field.  Returns -EINVAL for non-dynamic images, otherwise the result
+ * of vhd_write_header_at().
+ */
+int
+vhd_write_header(vhd_context_t *ctx, vhd_header_t *header)
+{
+       if (vhd_type_dynamic(ctx))
+               return vhd_write_header_at(ctx, header,
+                                          ctx->footer.data_offset);
+
+       return -EINVAL;
+}
+
+/*
+ * Write the block allocation table @bat to the image at
+ * ctx->header.table_offset.
+ *
+ * Both ctx->bat and @bat must pass vhd_validate_bat().  The table is
+ * copied into a sector-aligned scratch buffer and converted with
+ * vhd_bat_out() before being written, so @bat itself is not modified.
+ * NOTE(review): the copy reads the sector-padded size from bat->bat, so
+ * that array is presumably allocated with the same padding -- confirm.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_write_bat(vhd_context_t *ctx, vhd_bat_t *bat)
+{
+       int rc;
+       size_t bytes;
+       vhd_bat_t copy;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       rc = vhd_validate_bat(&ctx->bat);
+       if (!rc)
+               rc = vhd_validate_bat(bat);
+       if (rc)
+               return rc;
+
+       memset(&copy, 0, sizeof(copy));
+       bytes = vhd_bytes_padded(bat->entries * sizeof(uint32_t));
+
+       rc = vhd_seek(ctx, ctx->header.table_offset, SEEK_SET);
+       if (rc)
+               return rc;
+
+       rc = posix_memalign((void **)&copy.bat, VHD_SECTOR_SIZE, bytes);
+       if (rc)
+               return -rc;
+
+       memcpy(copy.bat, bat->bat, bytes);
+       copy.spb     = bat->spb;
+       copy.entries = bat->entries;
+       vhd_bat_out(&copy);
+
+       rc = vhd_write(ctx, copy.bat, bytes);
+       free(copy.bat);
+
+       return rc;
+}
+
+/*
+ * Write @batmap (map first, then its header) to the image.
+ *
+ * The header checksum is recomputed over a local copy, so the caller's
+ * @batmap is not modified.  The map is written at
+ * header.batmap_offset, the header at the offset reported by
+ * vhd_batmap_header_offset(), each through a sector-aligned scratch
+ * buffer.
+ *
+ * Returns 0 on success or a negative errno value.  Earlier versions
+ * returned 0 unconditionally, so callers could never observe a failed
+ * batmap write; the error is now propagated as well as logged.
+ */
+int
+vhd_write_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+       int err;
+       off_t off;
+       vhd_batmap_t b;
+       char *buf, *map;
+       size_t size, map_size;
+
+       buf      = NULL;
+       map      = NULL;
+
+       if (!vhd_has_batmap(ctx)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       b.header = batmap->header;
+       b.map    = batmap->map;
+
+       b.header.checksum = vhd_checksum_batmap(&b);
+       err = vhd_validate_batmap(&b);
+       if (err)
+               goto out;
+
+       off      = b.header.batmap_offset;
+       map_size = vhd_sectors_to_bytes(b.header.batmap_size);
+
+       err  = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto out;
+
+       err  = posix_memalign((void **)&map, VHD_SECTOR_SIZE, map_size);
+       if (err) {
+               map = NULL;
+               err = -err;
+               goto out;
+       }
+
+       memcpy(map, b.map, map_size);
+
+       err  = vhd_write(ctx, map, map_size);
+       if (err)
+               goto out;
+
+       err  = vhd_batmap_header_offset(ctx, &off);
+       if (err)
+               goto out;
+
+       size = vhd_bytes_padded(sizeof(vhd_batmap_header_t));
+
+       err  = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto out;
+
+       err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       if (err) {
+               err = -err;
+               buf = NULL;
+               goto out;
+       }
+
+       /* convert the local header copy, then pad it out to a full sector */
+       vhd_batmap_header_out(&b);
+       memset(buf, 0, size);
+       memcpy(buf, &b.header, sizeof(vhd_batmap_header_t));
+
+       err  = vhd_write(ctx, buf, size);
+
+out:
+       if (err)
+               VHDLOG("%s: failed writing batmap: %d\n", ctx->file, err);
+       free(buf);
+       free(map);
+       return err;
+}
+
+/*
+ * Write @bitmap as the sector bitmap of BAT entry @block.
+ *
+ * @bitmap must be aligned to VHD_SECTOR_SIZE; ctx->bm_secs sectors are
+ * written from it.
+ *
+ * Returns 0 on success, -EINVAL for non-dynamic images, misaligned
+ * buffers or unallocated blocks, -ERANGE when @block is outside the
+ * BAT, or the error from validation / seek / write.
+ */
+int
+vhd_write_bitmap(vhd_context_t *ctx, uint32_t block, char *bitmap)
+{
+       int rc;
+       off_t pos;
+       size_t len;
+       uint64_t sector;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       rc = vhd_validate_bat(&ctx->bat);
+       if (rc)
+               return rc;
+
+       if (block >= ctx->bat.entries)
+               return -ERANGE;
+
+       if ((unsigned long)bitmap & (VHD_SECTOR_SIZE - 1))
+               return -EINVAL;
+
+       sector = ctx->bat.bat[block];
+       if (sector == DD_BLK_UNUSED)
+               return -EINVAL;
+
+       pos = vhd_sectors_to_bytes(sector);
+       len = vhd_sectors_to_bytes(ctx->bm_secs);
+
+       rc = vhd_seek(ctx, pos, SEEK_SET);
+       if (!rc)
+               rc = vhd_write(ctx, bitmap, len);
+
+       return rc;
+}
+
+/*
+ * Write @data as the data area of BAT entry @block.
+ *
+ * @data must be aligned to VHD_SECTOR_SIZE; ctx->spb sectors are written
+ * starting right after the block's sector bitmap.
+ *
+ * Returns 0 on success, -EINVAL for non-dynamic images, misaligned
+ * buffers or unallocated blocks, -ERANGE when @block is outside the
+ * BAT, or the error from validation / seek / write.
+ */
+int
+vhd_write_block(vhd_context_t *ctx, uint32_t block, char *data)
+{
+       int rc;
+       off_t pos;
+       size_t len;
+       uint64_t sector;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       rc = vhd_validate_bat(&ctx->bat);
+       if (rc)
+               return rc;
+
+       if (block >= ctx->bat.entries)
+               return -ERANGE;
+
+       if ((unsigned long)data & (VHD_SECTOR_SIZE -1))
+               return -EINVAL;
+
+       sector = ctx->bat.bat[block];
+       if (sector == DD_BLK_UNUSED)
+               return -EINVAL;
+
+       /* skip the bitmap sectors that precede the block's data */
+       pos = vhd_sectors_to_bytes(sector + ctx->bm_secs);
+       len = vhd_sectors_to_bytes(ctx->spb);
+
+       rc = vhd_seek(ctx, pos, SEEK_SET);
+       if (!rc)
+               rc = vhd_write(ctx, data, len);
+
+       return rc;
+}
+
+/*
+ * Duplicate @name into *dup.
+ *
+ * Returns 0 on success, -ENAMETOOLONG when @name is MAX_NAME_LEN bytes
+ * or longer, or -ENOMEM when strdup() fails.  *dup is NULL on failure.
+ */
+static inline int
+namedup(char **dup, const char *name)
+{
+       char *copy = NULL;
+       int rc     = -ENAMETOOLONG;
+
+       if (strnlen(name, MAX_NAME_LEN) < MAX_NAME_LEN) {
+               copy = strdup(name);
+               rc   = (copy ? 0 : -ENOMEM);
+       }
+
+       *dup = copy;
+       return rc;
+}
+
+/*
+ * Reposition the image's file descriptor; thin wrapper around lseek().
+ *
+ * Returns 0 on success or the negative errno value of the failed
+ * lseek().  errno is captured before logging so that the value returned
+ * is the one lseek() produced, not whatever the logging call left
+ * behind.
+ */
+int
+vhd_seek(vhd_context_t *ctx, off_t offset, int whence)
+{
+       int err;
+
+       if (lseek(ctx->fd, offset, whence) == (off_t)-1) {
+               err = -errno;
+               VHDLOG("%s: seek(0x%08"PRIx64", %d) failed: %d\n",
+                      ctx->file, offset, whence, err);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Return the current offset of the image's file descriptor, as reported
+ * by lseek(fd, 0, SEEK_CUR).  A failed lseek() is passed through as
+ * (off_t)-1.
+ */
+off_t
+vhd_position(vhd_context_t *ctx)
+{
+       return lseek(ctx->fd, 0, SEEK_CUR);
+}
+
+/*
+ * Read exactly @size bytes from the image's current position into @buf.
+ *
+ * Returns 0 only if read() delivered all @size bytes.  A failed read
+ * yields its negative errno value; a short read (no errno set) yields
+ * -EIO.
+ *
+ * The result of read() is kept in an ssize_t so that -1 is represented
+ * faithfully and matches the %zd conversion used when logging, and errno
+ * is captured before the logging call can disturb it.
+ */
+int
+vhd_read(vhd_context_t *ctx, void *buf, size_t size)
+{
+       int saved;
+       ssize_t ret;
+
+       errno = 0;
+
+       ret = read(ctx->fd, buf, size);
+       if (ret >= 0 && (size_t)ret == size)
+               return 0;
+
+       saved = errno;
+       VHDLOG("%s: read of %zu returned %zd, errno: %d\n",
+              ctx->file, size, ret, -saved);
+
+       return (saved ? -saved : -EIO);
+}
+
+/*
+ * Write exactly @size bytes from @buf at the image's current position.
+ *
+ * Returns 0 only if write() accepted all @size bytes.  A failed write
+ * yields its negative errno value; a short write (no errno set) yields
+ * -EIO.
+ *
+ * The result of write() is kept in an ssize_t so that -1 is represented
+ * faithfully and matches the %zd conversion used when logging, and errno
+ * is captured before the logging call can disturb it.
+ */
+int
+vhd_write(vhd_context_t *ctx, void *buf, size_t size)
+{
+       int saved;
+       ssize_t ret;
+
+       errno = 0;
+
+       ret = write(ctx->fd, buf, size);
+       if (ret >= 0 && (size_t)ret == size)
+               return 0;
+
+       saved = errno;
+       VHDLOG("%s: write of %zu returned %zd, errno: %d\n",
+              ctx->file, size, ret, -saved);
+
+       return (saved ? -saved : -EIO);
+}
+
+/*
+ * Translate virtual sector @sector of a dynamic image into a physical
+ * sector number, stored in *offset.  If the containing block is not
+ * allocated, *offset is set to DD_BLK_UNUSED.
+ *
+ * Returns 0 on success, the error from vhd_get_bat(), or -ERANGE when
+ * @sector lies beyond the last BAT entry (previously the BAT was indexed
+ * without any bounds check).
+ *
+ * NOTE(review): for non-dynamic images the function returns @sector
+ * itself as the return value and leaves *offset untouched; that is kept
+ * as-is because callers may rely on it -- confirm the intent.
+ */
+int
+vhd_offset(vhd_context_t *ctx, uint32_t sector, uint32_t *offset)
+{
+       int err;
+       uint32_t block;
+
+       if (!vhd_type_dynamic(ctx))
+               return sector;
+
+       err = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       block = sector / ctx->spb;
+       if (block >= ctx->bat.entries)
+               return -ERANGE;
+
+       if (ctx->bat.bat[block] == DD_BLK_UNUSED)
+               *offset = DD_BLK_UNUSED;
+       else
+               *offset = ctx->bat.bat[block] +
+                       ctx->bm_secs + (sector % ctx->spb);
+
+       return 0;
+}
+
+/*
+ * Fast-path metadata load used by vhd_open() with VHD_OPEN_FAST.
+ *
+ * A single read of sizeof(vhd_footer_t) + sizeof(vhd_header_t) bytes is
+ * issued at the descriptor's current position.  The first part is taken
+ * as the footer; for dynamic images the bytes right after it are used as
+ * the header when footer.data_offset says the header directly follows
+ * the footer, otherwise the header is fetched with vhd_read_header().
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_open_fast(vhd_context_t *ctx)
+{
+       int err;
+       char *buf;
+       size_t size;
+
+       size = sizeof(vhd_footer_t) + sizeof(vhd_header_t);
+       err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       if (err) {
+               VHDLOG("failed allocating %s: %d\n", ctx->file, -err);
+               return -err;
+       }
+
+       err = vhd_read(ctx, buf, size);
+       if (err) {
+               VHDLOG("failed reading %s: %d\n", ctx->file, err);
+               goto out;
+       }
+
+       memcpy(&ctx->footer, buf, sizeof(vhd_footer_t));
+       vhd_footer_in(&ctx->footer);
+       err = vhd_validate_footer(&ctx->footer);
+       if (err)
+               goto out;
+
+       if (vhd_type_dynamic(ctx)) {
+               /* header not adjacent to the footer: read it on its own */
+               if (ctx->footer.data_offset != sizeof(vhd_footer_t))
+                       err = vhd_read_header(ctx, &ctx->header);
+               else {
+                       memcpy(&ctx->header,
+                              buf + sizeof(vhd_footer_t),
+                              sizeof(vhd_header_t));
+                       vhd_header_in(&ctx->header);
+                       err = vhd_validate_header(&ctx->header);
+               }
+
+               if (err)
+                       goto out;
+
+               /* sectors per block, and sectors needed for a 1-bit-per-sector bitmap */
+               ctx->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+               ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3);
+       }
+
+out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Open the VHD image @file and populate @ctx.
+ *
+ * @ctx is zeroed first.  The file is opened with O_DIRECT | O_LARGEFILE
+ * plus the access mode selected by VHD_OPEN_RDONLY / VHD_OPEN_RDWR.
+ * With VHD_OPEN_FAST the metadata is loaded by vhd_open_fast();
+ * otherwise the footer is read, images flagged as disabled are rejected
+ * with -EINVAL unless VHD_OPEN_IGNORE_DISABLED is given, and for dynamic
+ * images the header is read as well.
+ *
+ * Returns 0 on success.  On failure a negative errno value is returned,
+ * the descriptor is closed, and @ctx is zeroed again.
+ */
+int
+vhd_open(vhd_context_t *ctx, const char *file, int flags)
+{
+       int err, oflags;
+
+       /* strict opens never take the fast path */
+       if (flags & VHD_OPEN_STRICT)
+               vhd_flag_clear(flags, VHD_OPEN_FAST);
+
+       memset(ctx, 0, sizeof(vhd_context_t));
+       ctx->fd     = -1;
+       ctx->oflags = flags;
+
+       err = namedup(&ctx->file, file);
+       if (err)
+               return err;
+
+       oflags = O_DIRECT | O_LARGEFILE;
+       if (flags & VHD_OPEN_RDONLY)
+               oflags |= O_RDONLY;
+       if (flags & VHD_OPEN_RDWR)
+               oflags |= O_RDWR;
+
+       ctx->fd = open(ctx->file, oflags, 0644);
+       if (ctx->fd == -1) {
+               err = -errno;
+               VHDLOG("failed to open %s: %d\n", ctx->file, err);
+               goto fail;
+       }
+
+       err = vhd_test_file_fixed(ctx->file, &ctx->is_block);
+       if (err)
+               goto fail;
+
+       if (flags & VHD_OPEN_FAST) {
+               err = vhd_open_fast(ctx);
+               if (err)
+                       goto fail;
+
+               return 0;
+       }
+
+       err = vhd_read_footer(ctx, &ctx->footer);
+       if (err)
+               goto fail;
+
+       if (!(flags & VHD_OPEN_IGNORE_DISABLED) && vhd_disabled(ctx)) {
+               err = -EINVAL;
+               goto fail;
+       }
+
+       if (vhd_type_dynamic(ctx)) {
+               err = vhd_read_header(ctx, &ctx->header);
+               if (err)
+                       goto fail;
+
+               /* sectors per block, and sectors needed for a 1-bit-per-sector bitmap */
+               ctx->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+               ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3);
+       }
+
+       return 0;
+
+fail:
+       if (ctx->fd != -1)
+               close(ctx->fd);
+       free(ctx->file);
+       memset(ctx, 0, sizeof(vhd_context_t));
+       return err;
+}
+
+/*
+ * Release everything held by @ctx and zero it.
+ *
+ * The descriptor is closed only when ctx->file is set.  A context that
+ * was zeroed (e.g. by a failed vhd_open()) has file == NULL and fd == 0,
+ * so this guard presumably exists to avoid closing descriptor 0 --
+ * NOTE(review): confirm.  A context with a valid fd but no file name is
+ * therefore NOT closed here.
+ */
+void
+vhd_close(vhd_context_t *ctx)
+{
+       if (ctx->file)
+               close(ctx->fd);
+       free(ctx->file);
+       free(ctx->bat.bat);
+       free(ctx->batmap.map);
+       memset(ctx, 0, sizeof(vhd_context_t));
+}
+
+/*
+ * Fill ctx->footer for a brand-new image of the given @type and @size
+ * (bytes): cookie, version fields, creation timestamp, geometry, a
+ * freshly generated UUID and creator application "tap".  data_offset is
+ * initialised to all ones.
+ */
+static inline void
+vhd_initialize_footer(vhd_context_t *ctx, int type, uint64_t size)
+{
+       vhd_footer_t *ftr = &ctx->footer;
+
+       memset(ftr, 0, sizeof(vhd_footer_t));
+       memcpy(ftr->cookie, HD_COOKIE, sizeof(ftr->cookie));
+
+       ftr->features    = HD_RESERVED;
+       ftr->ff_version  = HD_FF_VERSION;
+       ftr->timestamp   = vhd_time(time(NULL));
+       ftr->crtr_ver    = VHD_CURRENT_VERSION;
+       ftr->crtr_os     = 0x00000000;
+       ftr->orig_size   = size;
+       ftr->curr_size   = size;
+       ftr->geometry    = vhd_chs(size);
+       ftr->type        = type;
+       ftr->saved       = 0;
+       ftr->data_offset = 0xFFFFFFFFFFFFFFFF;
+
+       strcpy(ftr->crtr_app, "tap");
+       vhd_uuid_generate(&ftr->uuid);
+}
+
+/*
+ * Store the base name of @parent_path in ctx->header.prt_name, converted
+ * from the current locale's codeset to big-endian UTF-16.
+ *
+ * Returns 0 on success or a negative errno value: the error from
+ * iconv_open()/iconv(), -ENOMEM if the path cannot be duplicated, or
+ * -EINVAL if the base name is empty or cannot be converted completely.
+ *
+ * A failed iconv_open() now returns immediately; the old code jumped to
+ * the common exit and passed the invalid descriptor to iconv_close().
+ */
+static int
+vhd_initialize_header_parent_name(vhd_context_t *ctx, const char *parent_path)
+{
+       int err;
+       iconv_t cd;
+       size_t ibl, obl;
+       char *ppath, *dst;
+       const char *pname;
+       char *codeset;
+
+       err   = 0;
+       pname = NULL;
+       ppath = NULL;
+
+       /*
+        * MICROSOFT_COMPAT
+        * big endian unicode here
+        */
+       codeset = nl_langinfo(CODESET);
+       cd = iconv_open(UTF_16BE, codeset);
+       if (cd == (iconv_t)-1)
+               return -errno;
+
+       /* basename() may modify its argument, so work on a private copy */
+       ppath = strdup(parent_path);
+       if (!ppath) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       pname = basename(ppath);
+       if (!strcmp(pname, "")) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       ibl = strlen(pname);
+       obl = sizeof(ctx->header.prt_name);
+       dst = ctx->header.prt_name;
+
+       memset(dst, 0, obl);
+
+       /* leftover input (ibl != 0) means the name did not fit or convert */
+       if (iconv(cd,
+#ifdef __linux__
+               (char **)
+#endif
+               &pname, &ibl, &dst, &obl) == (size_t)-1 || ibl)
+               err = (errno ? -errno : -EINVAL);
+
+out:
+       iconv_close(cd);
+       free(ppath);
+       return err;
+}
+
+/*
+ * Return the size in bytes of the file @name, determined by seeking to
+ * its end, or a negative errno value if it cannot be opened or seeked.
+ *
+ * errno is captured right after the failing call: previously it was
+ * re-read after logging (which may change it), and a failed lseek() was
+ * returned as a bare -1.
+ */
+static off_t
+get_file_size(const char *name)
+{
+       int fd, saved;
+       off_t end;
+
+       fd = open(name, O_LARGEFILE | O_RDONLY);
+       if (fd == -1) {
+               saved = errno;
+               VHDLOG("unable to open '%s': %d\n", name, saved);
+               return -saved;
+       }
+
+       end = lseek(fd, 0, SEEK_END);
+       if (end == (off_t)-1)
+               end = -errno;
+
+       close(fd);
+       return end;
+}
+
+/*
+ * Fill ctx->header for a new dynamic or differencing image.
+ *
+ * For HD_TYPE_DYNAMIC only the generic fields are set.  For
+ * differencing images the parent at @parent_path is examined as well:
+ * its mtime becomes prt_ts, and -- unless @raw is set -- its UUID is
+ * copied into prt_uuid.  When @size is 0 the virtual size is inherited
+ * from the parent (file size for raw parents, footer.curr_size
+ * otherwise) and the footer's size/geometry fields and max_bat_size are
+ * recomputed accordingly.
+ *
+ * Returns 0 on success or a negative errno value.  A failure to
+ * determine the size of a raw parent is now reported instead of being
+ * converted into a huge unsigned size.
+ */
+static int
+vhd_initialize_header(vhd_context_t *ctx, const char *parent_path,
+               uint64_t size, int raw)
+{
+       int err;
+       off_t fsize;
+       struct stat stats;
+       vhd_context_t parent;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       memset(&ctx->header, 0, sizeof(vhd_header_t));
+       memcpy(ctx->header.cookie, DD_COOKIE, sizeof(ctx->header.cookie));
+       ctx->header.data_offset  = (uint64_t)-1;
+       ctx->header.table_offset = VHD_SECTOR_SIZE * 3; /* 1 ftr + 2 hdr */
+       ctx->header.hdr_ver      = DD_VERSION;
+       ctx->header.block_size   = VHD_BLOCK_SIZE;
+       ctx->header.prt_ts       = 0;
+       ctx->header.res1         = 0;
+       ctx->header.max_bat_size = (ctx->footer.curr_size +
+                                   VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+
+       ctx->footer.data_offset  = VHD_SECTOR_SIZE;
+
+       /* plain dynamic disks have no parent to describe */
+       if (ctx->footer.type == HD_TYPE_DYNAMIC)
+               return 0;
+
+       err = stat(parent_path, &stats);
+       if (err == -1)
+               return -errno;
+
+       if (raw) {
+               ctx->header.prt_ts = vhd_time(stats.st_mtime);
+               if (!size) {
+                       fsize = get_file_size(parent_path);
+                       if (fsize < 0)
+                               return (int)fsize;
+                       size = fsize;
+               }
+       }
+       else {
+               err = vhd_open(&parent, parent_path, VHD_OPEN_RDONLY);
+               if (err)
+                       return err;
+
+               ctx->header.prt_ts = vhd_time(stats.st_mtime);
+               vhd_uuid_copy(&ctx->header.prt_uuid, &parent.footer.uuid);
+               if (!size)
+                       size = parent.footer.curr_size;
+               vhd_close(&parent);
+       }
+       ctx->footer.orig_size    = size;
+       ctx->footer.curr_size    = size;
+       ctx->footer.geometry     = vhd_chs(size);
+       ctx->header.max_bat_size =
+               (size + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+
+       return vhd_initialize_header_parent_name(ctx, parent_path);
+}
+
+/*
+ * Write the three parent locators (MACX, W2KU, W2RU -- in that order)
+ * for @parent back to back, starting at the first sector boundary after
+ * the batmap, and record them in ctx->header.loc[0..2].
+ *
+ * Returns 0 on success, -EINVAL for non-differencing images, or the
+ * error from vhd_parent_locator_write_at().
+ */
+static int
+vhd_write_parent_locators(vhd_context_t *ctx, const char *parent)
+{
+       const uint32_t codes[3] = {
+               PLAT_CODE_MACX, PLAT_CODE_W2KU, PLAT_CODE_W2RU
+       };
+       int idx, rc;
+       off_t pos;
+
+       if (ctx->footer.type != HD_TYPE_DIFF)
+               return -EINVAL;
+
+       pos = ctx->batmap.header.batmap_offset +
+               vhd_sectors_to_bytes(ctx->batmap.header.batmap_size);
+       if (pos & (VHD_SECTOR_SIZE - 1))
+               pos = vhd_bytes_padded(pos);
+
+       for (idx = 0; idx < 3; idx++) {
+               vhd_parent_locator_t *slot = ctx->header.loc + idx;
+
+               rc = vhd_parent_locator_write_at(ctx, parent, pos,
+                                                codes[idx], 0, slot);
+               if (rc)
+                       return rc;
+
+               /* next locator starts right after this one */
+               pos += vhd_parent_locator_size(slot);
+       }
+
+       return 0;
+}
+
+/*
+ * Re-parent the differencing image @child onto @parent_path.
+ *
+ * The header's parent UUID (cleared when @raw is set, otherwise copied
+ * from the new parent's footer), parent name and parent timestamp are
+ * updated, every existing MACX/W2KU/W2RU locator is rewritten in place
+ * (bounded by the space it already occupies), and finally the header is
+ * written back.
+ *
+ * Returns 0 on success or a negative errno value.  Two fixes relative
+ * to the earlier code: errno from realpath() is captured before logging
+ * can disturb it, and a failure to encode the parent name is no longer
+ * ignored.
+ */
+int
+vhd_change_parent(vhd_context_t *child, char *parent_path, int raw)
+{
+       int i, err;
+       char *ppath;
+       struct stat stats;
+       vhd_context_t parent;
+
+       ppath = realpath(parent_path, NULL);
+       if (!ppath) {
+               err = -errno;
+               VHDLOG("error resolving parent path %s for %s: %d\n",
+                      parent_path, child->file, -err);
+               return err;
+       }
+
+       err = stat(ppath, &stats);
+       if (err == -1) {
+               err = -errno;
+               goto out;
+       }
+
+       if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       if (raw) {
+               vhd_uuid_clear(&child->header.prt_uuid);
+       } else {
+               err = vhd_open(&parent, ppath, VHD_OPEN_RDONLY);
+               if (err) {
+                       VHDLOG("error opening parent %s for %s: %d\n",
+                              ppath, child->file, err);
+                       goto out;
+               }
+               vhd_uuid_copy(&child->header.prt_uuid, &parent.footer.uuid);
+               vhd_close(&parent);
+       }
+
+       err = vhd_initialize_header_parent_name(child, ppath);
+       if (err) {
+               VHDLOG("error setting parent name %s for %s: %d\n",
+                      ppath, child->file, err);
+               goto out;
+       }
+       child->header.prt_ts = vhd_time(stats.st_mtime);
+
+       for (i = 0; i < vhd_parent_locator_count(child); i++) {
+               vhd_parent_locator_t *loc = child->header.loc + i;
+               size_t max = vhd_parent_locator_size(loc);
+
+               /* leave locators of other platform codes untouched */
+               switch (loc->code) {
+               case PLAT_CODE_MACX:
+               case PLAT_CODE_W2KU:
+               case PLAT_CODE_W2RU:
+                       break;
+               default:
+                       continue;
+               }
+
+               err = vhd_parent_locator_write_at(child, ppath,
+                                                 loc->data_offset,
+                                                 loc->code, max, loc);
+               if (err) {
+                       VHDLOG("error writing parent locator %d for %s: %d\n",
+                              i, child->file, err);
+                       goto out;
+               }
+       }
+
+       TEST_FAIL_AT(FAIL_REPARENT_LOCATOR);
+
+       err = vhd_write_header(child, &child->header);
+       if (err) {
+               VHDLOG("error writing header for %s: %d\n", child->file, err);
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       free(ppath);
+       return err;
+}
+
+/*
+ * Build an all-zero batmap for a new dynamic image and write it out.
+ *
+ * The batmap header is initialised in ctx->batmap.header (one bit per
+ * BAT entry, rounded up to whole sectors), a zeroed sector-aligned map
+ * is allocated into ctx->batmap.map, and both are written with
+ * vhd_write_batmap().
+ *
+ * Returns -EINVAL for non-dynamic images, a negative errno value on
+ * allocation or offset lookup failure, otherwise the result of
+ * vhd_write_batmap().
+ */
+static int
+vhd_create_batmap(vhd_context_t *ctx)
+{
+       int rc, nbytes;
+       off_t hdr_off;
+       vhd_batmap_header_t *hdr;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       hdr    = &ctx->batmap.header;
+       nbytes = (ctx->header.max_bat_size + 7) >> 3;
+
+       memset(hdr, 0, sizeof(vhd_batmap_header_t));
+       memcpy(hdr->cookie, VHD_BATMAP_COOKIE, sizeof(hdr->cookie));
+
+       rc = vhd_batmap_header_offset(ctx, &hdr_off);
+       if (rc)
+               return rc;
+
+       /* the map itself starts right after the sector-padded header */
+       hdr->batmap_offset  = hdr_off +
+               vhd_bytes_padded(sizeof(vhd_batmap_header_t));
+       hdr->batmap_size    = secs_round_up_no_zero(nbytes);
+       hdr->batmap_version = VHD_BATMAP_CURRENT_VERSION;
+
+       nbytes = vhd_sectors_to_bytes(hdr->batmap_size);
+
+       rc = posix_memalign((void **)&ctx->batmap.map,
+                           VHD_SECTOR_SIZE, nbytes);
+       if (rc) {
+               ctx->batmap.map = NULL;
+               return -rc;
+       }
+
+       memset(ctx->batmap.map, 0, nbytes);
+
+       return vhd_write_batmap(ctx, &ctx->batmap);
+}
+
+/*
+ * Allocate a BAT with header.max_bat_size entries in ctx->bat, mark
+ * every entry DD_BLK_UNUSED and write it to the image.
+ *
+ * Returns -EINVAL for non-dynamic images, a negative errno value on
+ * allocation or seek failure, otherwise the result of vhd_write_bat().
+ * posix_memalign() reports failure as a positive errno value; it is
+ * negated here to match the convention used throughout this file (the
+ * old code returned it unchanged).
+ */
+static int
+vhd_create_bat(vhd_context_t *ctx)
+{
+       int err;
+       uint32_t i;
+       size_t size;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       size = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+       err  = posix_memalign((void **)&ctx->bat.bat, VHD_SECTOR_SIZE, size);
+       if (err) {
+               ctx->bat.bat = NULL;
+               return -err;
+       }
+
+       /* zero the sector padding, then mark the real entries unused */
+       memset(ctx->bat.bat, 0, size);
+       for (i = 0; i < ctx->header.max_bat_size; i++)
+               ctx->bat.bat[i] = DD_BLK_UNUSED;
+
+       err = vhd_seek(ctx, ctx->header.table_offset, SEEK_SET);
+       if (err)
+               return err;
+
+       ctx->bat.entries = ctx->header.max_bat_size;
+       ctx->bat.spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+
+       return vhd_write_bat(ctx, &ctx->bat);
+}
+
+/*
+ * Fill the data area of a new fixed-size image by writing
+ * footer.curr_size >> VHD_BLOCK_SHIFT blocks from a read-only anonymous
+ * mapping, starting at offset 0.
+ *
+ * Returns 0 on success, -EINVAL for non-fixed images, or a negative
+ * errno value from seek / mmap / write.
+ */
+static int
+vhd_initialize_fixed_disk(vhd_context_t *ctx)
+{
+       int i, rc;
+       char *zeroes;
+
+       if (ctx->footer.type != HD_TYPE_FIXED)
+               return -EINVAL;
+
+       rc = vhd_seek(ctx, 0, SEEK_SET);
+       if (rc)
+               return rc;
+
+       zeroes = mmap(0, VHD_BLOCK_SIZE, PROT_READ,
+                     MAP_SHARED | MAP_ANON, -1, 0);
+       if (zeroes == MAP_FAILED)
+               return -errno;
+
+       /* rc is 0 here; it only changes if one of the writes fails */
+       for (i = 0; i < ctx->footer.curr_size >> VHD_BLOCK_SHIFT; i++) {
+               rc = vhd_write(ctx, zeroes, VHD_BLOCK_SIZE);
+               if (rc)
+                       break;
+       }
+
+       munmap(zeroes, VHD_BLOCK_SIZE);
+       return rc;
+}
+
+/*
+ * Report the physical size of the image: the end-of-data offset from
+ * vhd_end_of_data() plus one footer.  Returns 0 on success or the error
+ * from vhd_end_of_data().
+ */
+int
+vhd_get_phys_size(vhd_context_t *ctx, off_t *size)
+{
+       int rc = vhd_end_of_data(ctx, size);
+
+       if (!rc)
+               *size += sizeof(vhd_footer_t);
+
+       return rc;
+}
+
+/*
+ * Set the physical size of the image to @size bytes by writing the
+ * footer so that it ends exactly at @size.
+ *
+ * Returns -EINVAL if @size is smaller than the current physical size
+ * (the footer would land on top of existing data), the error from
+ * vhd_get_phys_size(), or the result of vhd_write_footer_at().
+ */
+int
+vhd_set_phys_size(vhd_context_t *ctx, off_t size)
+{
+       int rc;
+       off_t current;
+
+       rc = vhd_get_phys_size(ctx, &current);
+       if (rc)
+               return rc;
+
+       if (size >= current)
+               return vhd_write_footer_at(ctx, &ctx->footer,
+                                          size - sizeof(vhd_footer_t));
+
+       /* shrinking below the current physical size would lose data */
+       VHDLOG("ERROR: new size (%"PRIu64") < phys size (%"PRIu64")\n",
+              size, current);
+       return -EINVAL;
+}
+
+/*
+ * Create a new VHD image at @name.
+ *
+ * @type selects fixed, dynamic or differencing; differencing images
+ * require @parent.  @bytes is rounded up to a whole number of VHD
+ * blocks.  For fixed images the data area is written out; for the other
+ * types the footer copy, header, batmap, BAT and (for differencing
+ * images) the parent locators are written.  The footer is written last,
+ * at the end of the file (or in the final footer-sized slot of a block
+ * device).
+ *
+ * Returns 0 on success or a negative errno value.  On failure the file
+ * is unlinked unless it is known to be a block device.
+ *
+ * Fixes relative to the earlier code:
+ *  - vhd_close() zeroes the context, so ctx.is_block was always 0 when
+ *    tested afterwards and block device nodes were unlinked on error;
+ *    the flag is now sampled before closing.
+ *  - vhd_close() closes the descriptor only when ctx.file is set, so the
+ *    descriptor leaked when strdup() failed; it is now closed explicitly.
+ */
+static int
+__vhd_create(const char *name, const char *parent, uint64_t bytes, int type,
+               vhd_flag_creat_t flags)
+{
+       int err, is_block;
+       off_t off;
+       vhd_context_t ctx;
+       uint64_t size, blks;
+
+       switch (type) {
+       case HD_TYPE_DIFF:
+               if (!parent)
+                       return -EINVAL;
+               /* fallthrough */
+       case HD_TYPE_FIXED:
+       case HD_TYPE_DYNAMIC:
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (strnlen(name, VHD_MAX_NAME_LEN - 1) == VHD_MAX_NAME_LEN - 1)
+               return -ENAMETOOLONG;
+
+       memset(&ctx, 0, sizeof(vhd_context_t));
+       blks   = (bytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+       size   = blks << VHD_BLOCK_SHIFT;
+
+       ctx.fd = open(name, O_WRONLY | O_CREAT |
+                     O_TRUNC | O_LARGEFILE | O_DIRECT, 0644);
+       if (ctx.fd == -1)
+               return -errno;
+
+       ctx.file = strdup(name);
+       if (!ctx.file) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       err = vhd_test_file_fixed(ctx.file, &ctx.is_block);
+       if (err)
+               goto out;
+
+       vhd_initialize_footer(&ctx, type, size);
+
+       if (type == HD_TYPE_FIXED) {
+               err = vhd_initialize_fixed_disk(&ctx);
+               if (err)
+                       goto out;
+       } else {
+               int raw = vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW);
+               err = vhd_initialize_header(&ctx, parent, size, raw);
+               if (err)
+                       goto out;
+
+               err = vhd_write_footer_at(&ctx, &ctx.footer, 0);
+               if (err)
+                       goto out;
+
+               err = vhd_write_header_at(&ctx, &ctx.header, VHD_SECTOR_SIZE);
+               if (err)
+                       goto out;
+
+               err = vhd_create_batmap(&ctx);
+               if (err)
+                       goto out;
+
+               err = vhd_create_bat(&ctx);
+               if (err)
+                       goto out;
+
+               if (type == HD_TYPE_DIFF) {
+                       err = vhd_write_parent_locators(&ctx, parent);
+                       if (err)
+                               goto out;
+               }
+
+               /* write header again since it may have changed */
+               err = vhd_write_header_at(&ctx, &ctx.header, VHD_SECTOR_SIZE);
+               if (err)
+                       goto out;
+       }
+
+       err = vhd_seek(&ctx, 0, SEEK_END);
+       if (err)
+               goto out;
+
+       off = vhd_position(&ctx);
+       if (off == (off_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       /* a block device cannot grow: use its last footer-sized slot */
+       if (ctx.is_block)
+               off -= sizeof(vhd_footer_t);
+
+       err = vhd_write_footer_at(&ctx, &ctx.footer, off);
+       if (err)
+               goto out;
+
+       err = 0;
+
+out:
+       /* sample before vhd_close() wipes the context */
+       is_block = ctx.is_block;
+       /* vhd_close() skips close() when ctx.file is NULL */
+       if (!ctx.file)
+               close(ctx.fd);
+       vhd_close(&ctx);
+       if (err && !is_block)
+               unlink(name);
+       return err;
+}
+
+/*
+ * Create a VHD image of the given @type that has no parent.
+ * Thin wrapper around __vhd_create() with a NULL parent.
+ */
+int
+vhd_create(const char *name, uint64_t bytes, int type, vhd_flag_creat_t flags)
+{
+       int rc;
+
+       rc = __vhd_create(name, NULL, bytes, type, flags);
+
+       return rc;
+}
+
+/*
+ * Create a differencing (HD_TYPE_DIFF) VHD image @name on top of @parent.
+ * Thin wrapper around __vhd_create().
+ */
+int
+vhd_snapshot(const char *name, uint64_t bytes, const char *parent,
+               vhd_flag_creat_t flags)
+{
+       int rc;
+
+       rc = __vhd_create(name, parent, bytes, HD_TYPE_DIFF, flags);
+
+       return rc;
+}
+
+/*
+ * Read @secs sectors starting at sector @sec straight from the image file:
+ * seek to the byte offset of @sec, then read into @buf.
+ * Returns the first non-zero result of vhd_seek() / vhd_read(), else 0.
+ */
+static int
+__vhd_io_fixed_read(vhd_context_t *ctx,
+                   char *buf, uint64_t sec, uint32_t secs)
+{
+       int rc;
+
+       rc = vhd_seek(ctx, vhd_sectors_to_bytes(sec), SEEK_SET);
+       if (!rc)
+               rc = vhd_read(ctx, buf, vhd_sectors_to_bytes(secs));
+
+       return rc;
+}
+
+/*
+ * Copy up to @secs sectors from @src to @dst, one sector at a time.
+ *
+ * Sector i is copied only if bit (@map_off + i) of @map is still clear and,
+ * when @ctx is non-NULL, bit (@bitmap_off + i) of @bitmap is set according to
+ * vhd_bitmap_test().  Every copied sector gets its bit set in @map.
+ * @src and @dst advance by one sector per iteration whether or not the
+ * sector was copied.
+ */
+static void
+__vhd_io_dynamic_copy_data(vhd_context_t *ctx,
+                          char *map, int map_off,
+                          char *bitmap, int bitmap_off,
+                          char *dst, char *src, int secs)
+{
+       int n;
+
+       for (n = 0; n < secs;
+            n++, src += VHD_SECTOR_SIZE, dst += VHD_SECTOR_SIZE) {
+               /* already filled in by an earlier link of the chain */
+               if (test_bit(map, map_off + n))
+                       continue;
+
+               /* with a VHD context, skip sectors its bitmap does not hold */
+               if (ctx && !vhd_bitmap_test(ctx, bitmap, bitmap_off + n))
+                       continue;
+
+               memcpy(dst, src, VHD_SECTOR_SIZE);
+               set_bit(map, map_off + n);
+       }
+}
+
+/*
+ * Read @secs sectors starting at virtual sector @sector from the single VHD
+ * @ctx (no parent is consulted) into @buf.
+ *
+ * The request is processed block by block.  For an allocated block the
+ * sector bitmap and the data are read and handed to
+ * __vhd_io_dynamic_copy_data(), which copies only sectors whose bit in @map
+ * is still clear and sets the bit of every sector it copies.  Bit 0 of @map
+ * corresponds to @sector.  Blocks marked DD_BLK_UNUSED in the BAT are
+ * skipped without touching @buf or @map.
+ *
+ * Returns 0 on success, or the error from vhd_read_bitmap() /
+ * vhd_read_block().
+ */
+static int
+__vhd_io_dynamic_read_link(vhd_context_t *ctx, char *map,
+                          char *buf, uint64_t sector, uint32_t secs)
+{
+       off_t off;
+       uint32_t blk, sec;
+       int err, cnt, map_off;
+       char *bitmap, *data, *src;
+
+       map_off = 0;
+
+       do {
+               blk    = sector / ctx->spb;
+               sec    = sector % ctx->spb;
+               off    = ctx->bat.bat[blk];
+               data   = NULL;
+               bitmap = NULL;
+
+               /*
+                * One pass never crosses a block boundary.  This must also
+                * hold for unallocated blocks: a request starting mid-block
+                * may only skip the rest of that block, otherwise the start
+                * of the following block would be stepped over unread.
+                */
+               cnt = MIN(secs, ctx->spb - sec);
+
+               if (off == DD_BLK_UNUSED)
+                       goto next;
+
+               err = vhd_read_bitmap(ctx, blk, &bitmap);
+               if (err)
+                       return err;
+
+               err = vhd_read_block(ctx, blk, &data);
+               if (err) {
+                       free(bitmap);
+                       return err;
+               }
+
+               src = data + vhd_sectors_to_bytes(sec);
+
+               __vhd_io_dynamic_copy_data(ctx,
+                                          map, map_off,
+                                          bitmap, sec,
+                                          buf, src, cnt);
+
+       next:
+               /* both pointers are NULL when the block was skipped */
+               free(data);
+               free(bitmap);
+
+               secs    -= cnt;
+               sector  += cnt;
+               map_off += cnt;
+               buf     += vhd_sectors_to_bytes(cnt);
+
+       } while (secs);
+
+       return 0;
+}
+
+/*
+ * Fill the still-missing sectors of a request from the raw (non-VHD) image
+ * @filename.
+ *
+ * Reads @secs sectors starting at sector @sec of the file into a
+ * sector-aligned bounce buffer, then copies to @buf every sector whose bit
+ * in @map is clear, setting the bit as it goes.
+ *
+ * Returns 0 on success or a negative errno value on failure (-EIO for a
+ * short read that did not set errno).
+ */
+static int
+__raw_read_link(char *filename,
+               char *map, char *buf, uint64_t sec, uint32_t secs)
+{
+       int fd, err, saved;
+       off_t off;
+       uint64_t size;
+       char *data;
+
+       fd = open(filename, O_RDONLY | O_DIRECT | O_LARGEFILE);
+       if (fd == -1) {
+               /* latch errno first: logging may overwrite it */
+               err = -errno;
+               VHDLOG("%s: failed to open: %d\n", filename, err);
+               return err;
+       }
+
+       off = lseek(fd, vhd_sectors_to_bytes(sec), SEEK_SET);
+       if (off == (off_t)-1) {
+               err = -errno;
+               VHDLOG("%s: seek(0x%08"PRIx64") failed: %d\n",
+                      filename, vhd_sectors_to_bytes(sec), err);
+               goto close;
+       }
+
+       size = vhd_sectors_to_bytes(secs);
+       err = posix_memalign((void **)&data, VHD_SECTOR_SIZE, size);
+       if (err) {
+               /*
+                * posix_memalign() reports failure as a positive error
+                * number; negate it to match the other error paths
+                */
+               err = -err;
+               goto close;
+       }
+
+       /* clear errno so that a short read is told apart from a failure */
+       errno = 0;
+       err = read(fd, data, size);
+       if (err < 0 || (uint64_t)err != size) {
+               saved = errno;
+               VHDLOG("%s: reading of %"PRIu64" returned %d, errno: %d\n",
+                               filename, size, err, -saved);
+               free(data);
+               err = saved ? -saved : -EIO;
+               goto close;
+       }
+       __vhd_io_dynamic_copy_data(NULL, map, 0, NULL, 0, buf, data, secs);
+       free(data);
+       err = 0;
+
+close:
+       close(fd);
+       return err;
+}
+
+/*
+ * Read @secs sectors starting at sector @sec of the dynamic/differencing VHD
+ * @ctx into @buf, following the chain of parents as far as needed.
+ *
+ * @buf is zeroed first.  A bitmap (@map, one bit per requested sector)
+ * records which sectors have been filled so far.  Each image in the chain
+ * gets a chance to fill the sectors that are still missing; the walk stops
+ * when every sector is filled, when the current image is not of type
+ * HD_TYPE_DIFF, or after reading from a raw parent.
+ *
+ * Returns 0 on success or the error of the step that failed.
+ */
+static int
+__vhd_io_dynamic_read(vhd_context_t *ctx,
+                     char *buf, uint64_t sec, uint32_t secs)
+{
+       int err;
+       uint32_t i, done;
+       char *map, *next;
+       vhd_context_t parent, *vhd;
+
+       err  = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       vhd  = ctx;
+       next = NULL;
+       /* bit i of map tracks sector (sec + i) of the request */
+       map  = calloc(1, secs << (VHD_SECTOR_SHIFT - 3));
+       if (!map)
+               return -ENOMEM;
+
+       /* sectors that no image in the chain provides stay zero */
+       memset(buf, 0, vhd_sectors_to_bytes(secs));
+
+       for (;;) {
+               err = __vhd_io_dynamic_read_link(vhd, map, buf, sec, secs);
+               if (err)
+                       goto close;
+
+               /* count how many of the requested sectors are filled by now */
+               for (done = 0, i = 0; i < secs; i++)
+                       if (test_bit(map, i))
+                               done++;
+
+               if (done == secs) {
+                       err = 0;
+                       goto close;
+               }
+
+               if (vhd->footer.type == HD_TYPE_DIFF) {
+                       err = vhd_parent_locator_get(vhd, &next);
+                       if (err)
+                               goto close;
+                       /* a raw parent ends the chain: read it and stop */
+                       if (vhd_parent_raw(vhd)) {
+                               err = __raw_read_link(next, map, buf, sec,
+                                               secs);
+                               goto close;
+                       }
+               } else {
+                       /* no parent left to consult */
+                       err = 0;
+                       goto close;
+               }
+
+               /* done with this image; never close the caller's context */
+               if (vhd != ctx)
+                       vhd_close(vhd);
+               vhd = &parent;
+
+               /* on failure 'parent' is not open, so skip vhd_close() */
+               err = vhd_open(vhd, next, VHD_OPEN_RDONLY);
+               if (err)
+                       goto out;
+
+               err = vhd_get_bat(vhd);
+               if (err)
+                       goto close;
+
+               free(next);
+               next = NULL;
+       }
+
+close:
+       if (vhd != ctx)
+               vhd_close(vhd);
+out:
+       free(map);
+       free(next);
+       return err;
+}
+
+/*
+ * Read @secs sectors starting at sector @sec of the VHD @ctx into @buf.
+ *
+ * Returns -ERANGE if the request extends past footer.curr_size; otherwise
+ * the result of the fixed or the dynamic read path, chosen by
+ * vhd_type_dynamic().
+ */
+int
+vhd_io_read(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs)
+{
+       if (vhd_sectors_to_bytes(sec + secs) > ctx->footer.curr_size)
+               return -ERANGE;
+
+       return vhd_type_dynamic(ctx)
+               ? __vhd_io_dynamic_read(ctx, buf, sec, secs)
+               : __vhd_io_fixed_read(ctx, buf, sec, secs);
+}
+
+/*
+ * Write @secs sectors from @buf straight to the image file, starting at
+ * sector @sec: seek to the byte offset of @sec, then write.
+ * Returns the first non-zero result of vhd_seek() / vhd_write(), else 0.
+ */
+static int
+__vhd_io_fixed_write(vhd_context_t *ctx,
+                    char *buf, uint64_t sec, uint32_t secs)
+{
+       int rc;
+
+       rc = vhd_seek(ctx, vhd_sectors_to_bytes(sec), SEEK_SET);
+       if (!rc)
+               rc = vhd_write(ctx, buf, vhd_sectors_to_bytes(secs));
+
+       return rc;
+}
+
+/*
+ * Allocate backing storage for BAT entry @block at the end of the data
+ * region reported by vhd_end_of_data().
+ *
+ * Padding sectors are inserted before the new block where needed so that
+ * the block's data (which follows its bm_secs bitmap sectors) begins on a
+ * page boundary.  Padding, bitmap and data are written from a read-only
+ * anonymous mapping, then the BAT entry is set to the first sector after
+ * the padding and the BAT is written out.
+ *
+ * Returns 0 on success, -errno if the mapping fails, or the error of the
+ * vhd_*() call that failed.
+ */
+static int
+__vhd_io_allocate_block(vhd_context_t *ctx, uint32_t block)
+{
+       char *zeros;
+       size_t len;
+       off_t start, first_sec;
+       int err, pad, secs_per_page;
+
+       secs_per_page = getpagesize() >> VHD_SECTOR_SHIFT;
+
+       err = vhd_end_of_data(ctx, &first_sec);
+       if (err)
+               return err;
+
+       /* byte offset to write at, and the same position in sectors */
+       start       = first_sec;
+       first_sec >>= VHD_SECTOR_SHIFT;
+
+       /* data region of segment should begin on page boundary */
+       pad = 0;
+       if ((first_sec + ctx->bm_secs) % secs_per_page) {
+               pad = (secs_per_page -
+                      ((first_sec + ctx->bm_secs) % secs_per_page));
+               first_sec += pad;
+       }
+
+       err = vhd_seek(ctx, start, SEEK_SET);
+       if (err)
+               return err;
+
+       len   = vhd_sectors_to_bytes(ctx->spb + ctx->bm_secs + pad);
+       zeros = mmap(0, len, PROT_READ, MAP_SHARED | MAP_ANON, -1, 0);
+       if (zeros == MAP_FAILED)
+               return -errno;
+
+       err = vhd_write(ctx, zeros, len);
+       if (!err) {
+               ctx->bat.bat[block] = first_sec;
+               err = vhd_write_bat(ctx, &ctx->bat);
+       }
+
+       munmap(zeros, len);
+       return err;
+}
+
+/*
+ * Write @secs sectors from @buf to the dynamic/differencing VHD @ctx,
+ * starting at virtual sector @sector.
+ *
+ * The request is processed block by block.  Unallocated blocks are
+ * allocated first.  After the data of a block is written, the block's
+ * sector bitmap is updated, unless the batmap already marks the block;
+ * once every bit of the bitmap is set the block is marked in the batmap.
+ *
+ * Returns -ERANGE if the request extends past footer.curr_size, otherwise 0
+ * or the error of the step that failed.  Failures that jump to 'fail' still
+ * rewrite the footer; failures that return directly do not.
+ */
+static int
+__vhd_io_dynamic_write(vhd_context_t *ctx,
+                      char *buf, uint64_t sector, uint32_t secs)
+{
+       char *map;
+       off_t off;
+       uint32_t blk, sec;
+       int i, err, cnt, ret;
+
+       if (vhd_sectors_to_bytes(sector + secs) > ctx->footer.curr_size)
+               return -ERANGE;
+
+       err = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       if (vhd_has_batmap(ctx)) {
+               err = vhd_get_batmap(ctx);
+               if (err)
+                       return err;
+       }
+
+       do {
+               blk = sector / ctx->spb;
+               sec = sector % ctx->spb;
+
+               off = ctx->bat.bat[blk];
+               if (off == DD_BLK_UNUSED) {
+                       err = __vhd_io_allocate_block(ctx, blk);
+                       if (err)
+                               return err;
+
+                       /* the allocation stored the new location in the BAT */
+                       off = ctx->bat.bat[blk];
+               }
+
+               /* skip the block's bitmap sectors, then index into its data */
+               off += ctx->bm_secs + sec;
+               err  = vhd_seek(ctx, vhd_sectors_to_bytes(off), SEEK_SET);
+               if (err)
+                       return err;
+
+               /* never write across a block boundary in one pass */
+               cnt = MIN(secs, ctx->spb - sec);
+               err = vhd_write(ctx, buf, vhd_sectors_to_bytes(cnt));
+               if (err)
+                       return err;
+
+               /* block already marked in the batmap: bitmap is left alone */
+               if (vhd_has_batmap(ctx) &&
+                   vhd_batmap_test(ctx, &ctx->batmap, blk))
+                       goto next;
+
+               err = vhd_read_bitmap(ctx, blk, &map);
+               if (err)
+                       return err;
+
+               for (i = 0; i < cnt; i++)
+                       vhd_bitmap_set(ctx, map, sec + i);
+
+               err = vhd_write_bitmap(ctx, blk, map);
+               if (err)
+                       goto fail;
+
+               if (vhd_has_batmap(ctx)) {
+                       /* mark the block in the batmap only once all bits are set */
+                       for (i = 0; i < ctx->spb; i++)
+                               if (!vhd_bitmap_test(ctx, map, i)) {
+                                       free(map);
+                                       goto next;
+                               }
+
+                       vhd_batmap_set(ctx, &ctx->batmap, blk);
+                       err = vhd_write_batmap(ctx, &ctx->batmap);
+                       if (err)
+                               goto fail;
+               }
+
+               free(map);
+               map = NULL;
+
+       next:
+               secs   -= cnt;
+               sector += cnt;
+               buf    += vhd_sectors_to_bytes(cnt);
+       } while (secs);
+
+       err = 0;
+
+out:
+       /* the footer is rewritten on success and on 'fail'; first error wins */
+       ret = vhd_write_footer(ctx, &ctx->footer);
+       return (err ? err : ret);
+
+fail:
+       free(map);
+       goto out;
+}
+
+/*
+ * Write @secs sectors from @buf to the VHD @ctx, starting at sector @sec.
+ *
+ * Returns -ERANGE if the request extends past footer.curr_size; otherwise
+ * the result of the fixed or the dynamic write path, chosen by
+ * vhd_type_dynamic().
+ */
+int
+vhd_io_write(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs)
+{
+       if (vhd_sectors_to_bytes(sec + secs) > ctx->footer.curr_size)
+               return -ERANGE;
+
+       return vhd_type_dynamic(ctx)
+               ? __vhd_io_dynamic_write(ctx, buf, sec, secs)
+               : __vhd_io_fixed_write(ctx, buf, sec, secs);
+}
diff --git a/tools/blktap2/vhd/lib/relative-path.c b/tools/blktap2/vhd/lib/relative-path.c
new file mode 100644 (file)
index 0000000..8b7cb71
--- /dev/null
@@ -0,0 +1,299 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "relative-path.h"
+
+/*
+ * free @ptr and reset it to NULL, so that freeing the same variable again
+ * is harmless; @ptr is expanded twice and must be a plain lvalue
+ */
+#define sfree(ptr)         \
+do {                       \
+       free(ptr);         \
+       ptr = NULL;        \
+} while (0)
+
+/*
+ * count the DELIMITER characters in @path
+ * returns 0 if @path is NULL
+ */
+static int
+count_nodes(char *path)
+{
+       int total = 0;
+       char *p;
+
+       if (path == NULL)
+               return 0;
+
+       p = path;
+       while (*p != '\0') {
+               if (*p == DELIMITER)
+                       total++;
+               p++;
+       }
+
+       return total;
+}
+
+/*
+ * return a copy of the text between the start of *@path and the first
+ * DELIMITER in it, or NULL if *@path contains no DELIMITER
+ * on success *@path is moved to that DELIMITER
+ * @err is set to 0, or to -errno on failure (a NULL return with @err == 0
+ * just means there is no further delimited node)
+ * copy should be freed
+ */
+static char *
+next_node(char **path, int *err)
+{
+       char *start, *end, *node;
+       int len;
+
+       if (!path || !*path) {
+               *err = -EINVAL;
+               return NULL;
+       }
+
+       *err  = 0;
+       start = *path;
+
+       /* locate the first delimiter, if there is one */
+       end = start;
+       while (*end != '\0' && *end != DELIMITER)
+               end++;
+
+       if (*end == '\0')
+               return NULL;
+
+       /* room for the node text plus the terminating NUL */
+       len  = end - start + 1;
+       node = malloc(len);
+       if (!node) {
+               *err = -ENOMEM;
+               return NULL;
+       }
+
+       /* snprintf truncates at len - 1 characters, i.e. just before 'end' */
+       if (snprintf(node, len, "%s", start) < 0) {
+               free(node);
+               *err = -EINVAL;
+               return NULL;
+       }
+
+       *path = end;
+       return node;
+}
+
+/*
+ * count the number of leading nodes that @to and @from have in common
+ * returns number of common nodes, or -errno on failure
+ */
+static int
+count_common_nodes(char *to, char *from)
+{
+       int err = 0, common = 0;
+       char *to_node = NULL, *from_node = NULL;
+
+       if (!to || !from)
+               return -EINVAL;
+
+       for (;;) {
+               to_node = next_node(&to, &err);
+               if (err || !to_node)
+                       break;
+
+               from_node = next_node(&from, &err);
+               if (err || !from_node)
+                       break;
+
+               if (strncmp(to_node, from_node, MAX_NAME_LEN) != 0)
+                       break;
+
+               /* nodes match: step both paths past the delimiter */
+               to++;
+               from++;
+               common++;
+
+               sfree(to_node);
+               sfree(from_node);
+       }
+
+       /* release whatever the loop still held when it stopped */
+       sfree(to_node);
+       sfree(from_node);
+
+       return err ? err : common;
+}
+
+/*
+ * build a path made of @count repetitions of '../'
+ * returns './' if @count is zero, or NULL on error (allocation failure, or
+ * the result would be MAX_NAME_LEN characters or longer)
+ * result should be freed
+ */
+static char *
+up_nodes(int count)
+{
+       const int step = strlen("../");
+       char *result, *cursor;
+       int written, remaining, total;
+
+       if (count == 0)
+               return strdup("./");
+
+       total = step * count;
+       if (total >= MAX_NAME_LEN)
+               return NULL;
+
+       result = malloc(total + 1);
+       if (result == NULL)
+               return NULL;
+
+       cursor = result;
+       for (remaining = count; remaining > 0; remaining--) {
+               written = sprintf(cursor, "../");
+               if (written < 0 || written != step) {
+                       free(result);
+                       return NULL;
+               }
+               cursor += written;
+       }
+
+       return result;
+}
+
+/*
+ * return a pointer to the character following the @offset'th DELIMITER
+ * in @from, or NULL if @from is NULL, @offset is zero, or @from holds
+ * fewer than @offset delimiters
+ */
+static char *
+node_offset(char *from, int offset)
+{
+       char *p;
+       int remaining = offset;
+
+       if (from == NULL || offset == 0)
+               return NULL;
+
+       p = from;
+       while (*p != '\0') {
+               if (*p == DELIMITER && --remaining == 0)
+                       return p + 1;
+               p++;
+       }
+
+       return NULL;
+}
+
+/*
+ * return a relative path from @from to @to
+ * both paths are first resolved with realpath()
+ * on failure NULL is returned and @err is set to a negative errno value;
+ * on success @err is 0
+ * result should be freed
+ */
+char *
+relative_path_to(char *from, char *to, int *err)
+{
+       int from_nodes, common;
+       char *to_absolute, *from_absolute;
+       char *up, *common_target_path, *relative_path;
+
+       *err          = 0;
+       up            = NULL;
+       to_absolute   = NULL;
+       from_absolute = NULL;
+       relative_path = NULL;
+
+       /* reject inputs of MAX_NAME_LEN characters or more */
+       if (strnlen(to, MAX_NAME_LEN)   == MAX_NAME_LEN ||
+           strnlen(from, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               EPRINTF("invalid input; max path length is %d\n",
+                       MAX_NAME_LEN);
+               *err = -ENAMETOOLONG;
+               return NULL;
+       }
+
+       /* realpath(..., NULL) allocates the result; freed at 'out' */
+       to_absolute = realpath(to, NULL);
+       if (!to_absolute) {
+               EPRINTF("failed to get absolute path of %s\n", to);
+               *err = -errno;
+               goto out;
+       }
+
+       from_absolute = realpath(from, NULL);
+       if (!from_absolute) {
+               EPRINTF("failed to get absolute path of %s\n", from);
+               *err = -errno;
+               goto out;
+       }
+
+       /* the resolved paths may be longer than the inputs: check again */
+       if (strnlen(to_absolute, MAX_NAME_LEN)   == MAX_NAME_LEN ||
+           strnlen(from_absolute, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               EPRINTF("invalid input; max path length is %d\n",
+                       MAX_NAME_LEN);
+               *err = -ENAMETOOLONG;
+               goto out;
+       }
+
+       /* count nodes in source path */
+       from_nodes = count_nodes(from_absolute);
+
+       /* count nodes in common (+ 1 skips the first character of each path) */
+       common = count_common_nodes(to_absolute + 1, from_absolute + 1);
+       if (common < 0) {
+               EPRINTF("failed to count common nodes of %s and %s: %d\n",
+                       to_absolute, from_absolute, common);
+               *err = common;
+               goto out;
+       }
+
+       /* move up to common node */
+       up = up_nodes(from_nodes - common - 1);
+       if (!up) {
+               /* NOTE(review): up_nodes() also returns NULL for over-long results */
+               EPRINTF("failed to allocate relative path for %s: %d\n",
+                       from_absolute, -ENOMEM);
+               *err = -ENOMEM;
+               goto out;
+       }
+
+       /* get path from common node to target */
+       common_target_path = node_offset(to_absolute, common + 1);
+       if (!common_target_path) {
+               EPRINTF("failed to find common target path to %s: %d\n",
+                       to_absolute, -EINVAL);
+               *err = -EINVAL;
+               goto out;
+       }
+
+       /* get relative path */
+       if (asprintf(&relative_path, "%s%s", up, common_target_path) == -1) {
+               EPRINTF("failed to construct final path %s%s: %d\n",
+                       up, common_target_path, -ENOMEM);
+               relative_path = NULL;
+               *err = -ENOMEM;
+               goto out;
+       }
+
+out:
+       /* common_target_path points into to_absolute and is not freed itself */
+       sfree(up);
+       sfree(to_absolute);
+       sfree(from_absolute);
+
+       return relative_path;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-check.c b/tools/blktap2/vhd/lib/vhd-util-check.c
new file mode 100644 (file)
index 0000000..40565ac
--- /dev/null
@@ -0,0 +1,980 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <inttypes.h>
+#include <sys/stat.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+// allow the VHD timestamp to be at most this many seconds into the future to 
+// account for time skew with NFS servers
+#define TIMESTAMP_MAX_SLACK 1800
+
+/*
+ * Check whether the @size bytes at @buf are all zero.
+ *
+ * Returns 0 if every byte is zero (or @size is 0) and 1 if any byte is
+ * non-zero.  Callers use the result as a boolean, so the position of the
+ * offending byte is deliberately not returned: returning the index would
+ * report a non-zero first byte (index 0) as "all zero".
+ */
+static int
+vhd_util_check_zeros(void *buf, size_t size)
+{
+       size_t i;
+       const char *p;
+
+       p = buf;
+       for (i = 0; i < size; i++)
+               if (p[i])
+                       return 1;
+
+       return 0;
+}
+
+/*
+ * Return 1 if every 32-bit word of @footer equals 0xc7c7c7c7, else 0.
+ * The caller uses the result as the "opened" indication for the footer.
+ */
+static int
+vhd_util_check_footer_opened(vhd_footer_t *footer)
+{
+       uint32_t *word, *end;
+
+       word = (uint32_t *)footer;
+       end  = word + sizeof(*footer) / sizeof(uint32_t);
+
+       while (word < end) {
+               if (*word != 0xc7c7c7c7)
+                       return 0;
+               word++;
+       }
+
+       return 1;
+}
+
+/*
+ * Validate the fields of @footer.
+ *
+ * Returns NULL if the footer passes every check, otherwise a static string
+ * describing the first check that failed.  The checks run in a fixed order,
+ * which determines the message reported when several fields are bad.
+ */
+static char *
+vhd_util_check_validate_footer(vhd_footer_t *footer)
+{
+       int size;
+       uint32_t checksum, now;
+
+       size = sizeof(footer->cookie);
+       if (memcmp(footer->cookie, HD_COOKIE, size))
+               return "invalid cookie";
+
+       checksum = vhd_checksum_footer(footer);
+       if (checksum != footer->checksum) {
+               /*
+                * For hidden footers created by "tap" version 0.1 or 1.1,
+                * retry the checksum with the 'hidden' field cleared and
+                * accept the footer if that matches.
+                * NOTE(review): presumably those versions set 'hidden'
+                * without updating the checksum -- confirm.
+                */
+               if (footer->hidden &&
+                   !strncmp(footer->crtr_app, "tap", 3) &&
+                   (footer->crtr_ver == VHD_VERSION(0, 1) ||
+                    footer->crtr_ver == VHD_VERSION(1, 1))) {
+                       char tmp = footer->hidden;
+                       footer->hidden = 0;
+                       checksum = vhd_checksum_footer(footer);
+                       footer->hidden = tmp;
+
+                       if (checksum == footer->checksum)
+                               goto ok;
+               }
+
+               return "invalid checksum";
+       }
+
+ok:
+       /* HD_RESERVED must be set; only HD_TEMPORARY may accompany it */
+       if (!(footer->features & HD_RESERVED))
+               return "invalid 'reserved' feature";
+
+       if (footer->features & ~(HD_TEMPORARY | HD_RESERVED))
+               return "invalid extra features";
+
+       if (footer->ff_version != HD_FF_VERSION)
+               return "invalid file format version";
+
+       /* types other than dynamic/diff must have an all-ones data offset */
+       if (footer->type != HD_TYPE_DYNAMIC &&
+           footer->type != HD_TYPE_DIFF    &&
+           footer->data_offset != ~(0ULL))
+               return "invalid data offset";
+
+       /* tolerate timestamps up to TIMESTAMP_MAX_SLACK seconds ahead */
+       now = vhd_time(time(NULL));
+       if (footer->timestamp > now + TIMESTAMP_MAX_SLACK)
+               return "creation time in future";
+
+       if (!strncmp(footer->crtr_app, "tap", 3) &&
+           footer->crtr_ver > VHD_CURRENT_VERSION)
+               return "unsupported tap creator version";
+
+       if (vhd_chs(footer->curr_size) < footer->geometry)
+               return "geometry too large";
+
+       if (footer->type != HD_TYPE_FIXED   &&
+           footer->type != HD_TYPE_DYNAMIC &&
+           footer->type != HD_TYPE_DIFF)
+               return "invalid type";
+
+       /* 'saved' and 'hidden' may only be 0 or 1 */
+       if (footer->saved && footer->saved != 1)
+               return "invalid 'saved' state";
+
+       if (footer->hidden && footer->hidden != 1)
+               return "invalid 'hidden' state";
+
+       if (vhd_util_check_zeros(footer->reserved,
+                                sizeof(footer->reserved)))
+               return "invalid 'reserved' bits";
+
+       return NULL;
+}
+
+/*
+ * Validate the fields of @header; @fd is the open image file and is only
+ * used to find the end of the file (its file offset is moved there).
+ *
+ * Returns NULL if the header passes every check, otherwise a static string
+ * describing the first check that failed.
+ */
+static char *
+vhd_util_check_validate_header(int fd, vhd_header_t *header)
+{
+       off_t eof;
+       int i, cnt, size;
+       uint32_t checksum;
+
+       size = sizeof(header->cookie);
+       if (memcmp(header->cookie, DD_COOKIE, size))
+               return "invalid cookie";
+
+       checksum = vhd_checksum_header(header);
+       if (checksum != header->checksum)
+               return "invalid checksum";
+
+       if (header->hdr_ver != 0x00010000)
+               return "invalid header version";
+
+       if (header->data_offset != ~(0ULL))
+               return "invalid data offset";
+
+       eof = lseek(fd, 0, SEEK_END);
+       if (eof == (off_t)-1)
+               return "error finding eof";
+
+       /*
+        * the table must start at a non-zero multiple of 512 and must end
+        * before the last sizeof(vhd_footer_t) bytes of the file
+        * NOTE(review): 'eof - sizeof(...)' mixes off_t and size_t; verify
+        * the behaviour for files shorter than a footer
+        */
+       if (header->table_offset <= 0  ||
+           header->table_offset % 512 ||
+           (header->table_offset +
+            (header->max_bat_size * sizeof(uint32_t)) >
+            eof - sizeof(vhd_footer_t)))
+               return "invalid table offset";
+
+       /* block size must be a power of two: exactly one bit set */
+       for (cnt = 0, i = 0; i < sizeof(header->block_size) * 8; i++)
+               if ((header->block_size >> i) & 1)
+                       cnt++;
+
+       if (cnt != 1)
+               return "invalid block size";
+
+       if (header->res1)
+               return "invalid reserved bits";
+
+       if (vhd_util_check_zeros(header->res2, sizeof(header->res2)))
+               return "invalid reserved bits";
+
+       return NULL;
+}
+
+/*
+ * Validate the parent-related fields of @vhd's header.
+ *
+ * For a differencing disk (footer type HD_TYPE_DIFF) the parent timestamp
+ * must not lie more than TIMESTAMP_MAX_SLACK seconds in the future and the
+ * parent name must decode.  For every other type all parent fields must be
+ * empty.
+ *
+ * Returns NULL on success, otherwise a static string describing the first
+ * check that failed.
+ */
+static char *
+vhd_util_check_validate_differencing_header(vhd_context_t *vhd)
+{
+       char *parent;
+       uint32_t now;
+       vhd_header_t *header = &vhd->header;
+
+       if (vhd->footer.type != HD_TYPE_DIFF) {
+               /* not a differencing disk: no parent information allowed */
+               if (vhd_util_check_zeros(header->prt_name,
+                                        sizeof(header->prt_name)))
+                       return "invalid non-null parent name";
+
+               if (vhd_util_check_zeros(header->loc, sizeof(header->loc)))
+                       return "invalid non-null parent locators";
+
+               if (!vhd_uuid_is_nil(&header->prt_uuid))
+                       return "invalid non-null parent uuid";
+
+               if (header->prt_ts)
+                       return "invalid non-zero parent timestamp";
+
+               return NULL;
+       }
+
+       now = vhd_time(time(NULL));
+       if (header->prt_ts > now + TIMESTAMP_MAX_SLACK)
+               return "parent creation time in future";
+
+       if (vhd_header_decode_parent(vhd, header, &parent))
+               return "invalid parent name";
+
+       /* only decodability is being checked; the name itself is not needed */
+       free(parent);
+       return NULL;
+}
+
+/*
+ * Validate the header of @batmap, which belongs to the open image @vhd.
+ *
+ * Returns NULL if the batmap header passes every check, otherwise a static
+ * string describing the first check that failed.  Moves the file offset of
+ * vhd->fd to the end of the file.
+ */
+static char *
+vhd_util_check_validate_batmap(vhd_context_t *vhd, vhd_batmap_t *batmap)
+{
+       int size;
+       off_t eof;
+       uint32_t checksum;
+
+       size = sizeof(batmap->header.cookie);
+       if (memcmp(batmap->header.cookie, VHD_BATMAP_COOKIE, size))
+               return "invalid cookie";
+
+       if (batmap->header.batmap_version > VHD_BATMAP_CURRENT_VERSION)
+               return "unsupported batmap version";
+
+       checksum = vhd_checksum_batmap(batmap);
+       if (checksum != batmap->header.checksum)
+               return "invalid checksum";
+
+       if (!batmap->header.batmap_size)
+               return "invalid size zero";
+
+       eof = lseek(vhd->fd, 0, SEEK_END);
+       if (eof == (off_t)-1)
+               return "error finding eof";
+
+       /* the batmap must start at a non-zero multiple of 512 */
+       if (!batmap->header.batmap_offset ||
+           batmap->header.batmap_offset % 512)
+               return "invalid batmap offset";
+
+       /* ... and must end before the last sizeof(vhd_footer_t) bytes */
+       if ((batmap->header.batmap_offset +
+            vhd_sectors_to_bytes(batmap->header.batmap_size)) >
+           eof - sizeof(vhd_footer_t))
+               return "invalid batmap size";
+
+       return NULL;
+}
+
+/*
+ * Validate the parent locator @loc of the open image @vhd.
+ *
+ * A locator with platform code PLAT_CODE_NONE must be entirely zero; any
+ * other locator needs non-zero data offset, space and length, and its data
+ * must end before the last sizeof(vhd_footer_t) bytes of the file.
+ *
+ * Returns NULL if the locator passes every check, otherwise a static string
+ * describing the first check that failed.
+ */
+static char *
+vhd_util_check_validate_parent_locator(vhd_context_t *vhd,
+                                      vhd_parent_locator_t *loc)
+{
+       off_t eof;
+
+       if (vhd_validate_platform_code(loc->code))
+               return "invalid platform code";
+
+       /* unused locator slot: nothing else may be set */
+       if (loc->code == PLAT_CODE_NONE) {
+               if (vhd_util_check_zeros(loc, sizeof(*loc)))
+                       return "non-zero locator";
+
+               return NULL;
+       }
+
+       if (!loc->data_offset)
+               return "invalid data offset";
+
+       if (!loc->data_space)
+               return "invalid data space";
+
+       if (!loc->data_len)
+               return "invalid data length";
+
+       eof = lseek(vhd->fd, 0, SEEK_END);
+       if (eof == (off_t)-1)
+               return "error finding eof";
+
+       if (loc->data_offset + vhd_parent_locator_size(loc) >
+           eof - sizeof(vhd_footer_t))
+               return "invalid size";
+
+       if (loc->res)
+               return "invalid reserved bits";
+
+       return NULL;
+}
+
+/*
+ * Check that the parent image at @ppath is the one @vhd's header refers to.
+ *
+ * Nothing is checked when vhd_parent_raw() reports a raw parent.  Otherwise
+ * the parent is opened read-only and its footer uuid is compared with the
+ * parent uuid stored in @vhd's header.
+ *
+ * Returns NULL on success, otherwise a static string describing the
+ * problem.
+ */
+static const char *
+vhd_util_check_validate_parent(vhd_context_t *vhd, const char *ppath)
+{
+       const char *msg;
+       vhd_context_t parent;
+
+       if (vhd_parent_raw(vhd))
+               return NULL;
+
+       if (vhd_open(&parent, ppath,
+                    VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED))
+               return "error opening parent";
+
+       msg = vhd_uuid_compare(&vhd->header.prt_uuid, &parent.footer.uuid)
+               ? "invalid parent uuid"
+               : NULL;
+
+       vhd_close(&parent);
+       return msg;
+}
+
+/*
+ * Validate the footer(s) of the image open on @fd.
+ *
+ * The primary footer is read from the last 512 (or, for short footers, 511)
+ * bytes of the file and checked with vhd_util_check_validate_footer().  For
+ * HD_TYPE_FIXED images the function then returns 0 without writing @footer.
+ * Otherwise the backup footer is read from offset 0, validated and compared
+ * with the primary; on success the accepted footer is copied to @footer.
+ *
+ * @ignore: if non-zero and vhd_util_check_footer_opened() flags the primary
+ *          footer, an invalid or mismatching primary is tolerated and the
+ *          backup footer is used instead.
+ *
+ * Returns 0 on success or a negative errno value after printing a message.
+ */
+static int
+vhd_util_check_footer(int fd, vhd_footer_t *footer, int ignore)
+{
+       size_t size;
+       int err, opened;
+       char *msg, *buf = NULL;
+       off_t eof, off;
+       vhd_footer_t primary, backup;
+
+       memset(&primary, 0, sizeof(primary));
+       memset(&backup, 0, sizeof(backup));
+
+       err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, sizeof(primary));
+       if (err) {
+               printf("error allocating buffer: %d\n", err);
+               return -err;
+       }
+
+       memset(buf, 0, sizeof(primary));
+
+       eof = lseek(fd, 0, SEEK_END);
+       if (eof == (off_t)-1) {
+               err = -errno;
+               printf("error calculating end of file: %d\n", err);
+               goto out;
+       }
+
+       /* a file size that is not sector aligned implies a 511-byte footer */
+       size = ((eof % 512) ? 511 : 512);
+       eof  = lseek(fd, eof - size, SEEK_SET);
+       if (eof == (off_t)-1) {
+               err = -errno;
+               printf("error calculating end of file: %d\n", err);
+               goto out;
+       }
+
+       /* clear errno so a short read reports -EIO rather than a stale code */
+       errno = 0;
+       err = read(fd, buf, 512);
+       if (err != size) {
+               err = (errno ? -errno : -EIO);
+               printf("error reading primary footer: %d\n", err);
+               goto out;
+       }
+
+       memcpy(&primary, buf, sizeof(primary));
+       /* the opened test is done on the raw footer, before vhd_footer_in() */
+       opened = vhd_util_check_footer_opened(&primary);
+       vhd_footer_in(&primary);
+
+       msg = vhd_util_check_validate_footer(&primary);
+       if (msg) {
+               if (opened && ignore)
+                       goto check_backup;
+
+               err = -EINVAL;
+               printf("primary footer invalid: %s\n", msg);
+               goto out;
+       }
+
+       if (primary.type == HD_TYPE_FIXED) {
+               err = 0;
+               goto out;
+       }
+
+check_backup:
+       off = lseek(fd, 0, SEEK_SET);
+       if (off == (off_t)-1) {
+               err = -errno;
+               printf("error seeking to backup footer: %d\n", err);
+               goto out;
+       }
+
+       size = 512;
+       memset(buf, 0, sizeof(primary));
+
+       errno = 0;
+       err = read(fd, buf, size);
+       if (err != size) {
+               err = (errno ? -errno : -EIO);
+               printf("error reading backup footer: %d\n", err);
+               goto out;
+       }
+
+       memcpy(&backup, buf, sizeof(backup));
+       vhd_footer_in(&backup);
+
+       msg = vhd_util_check_validate_footer(&backup);
+       if (msg) {
+               err = -EINVAL;
+               printf("backup footer invalid: %s\n", msg);
+               goto out;
+       }
+
+       if (memcmp(&primary, &backup, sizeof(primary))) {
+               if (opened && ignore) {
+                       memcpy(&primary, &backup, sizeof(primary));
+                       goto ok;
+               }
+
+               /*
+                * For these "tap" creator versions, accept a backup footer
+                * that differs from the primary only by its hidden field.
+                */
+               if (backup.hidden &&
+                   !strncmp(backup.crtr_app, "tap", 3) &&
+                   (backup.crtr_ver == VHD_VERSION(0, 1) ||
+                    backup.crtr_ver == VHD_VERSION(1, 1))) {
+                       /* memcmp() returns int; a char could truncate to 0 */
+                       int cmp;
+                       char tmp = backup.hidden;
+
+                       backup.hidden = 0;
+                       cmp = memcmp(&primary, &backup, sizeof(primary));
+                       backup.hidden = tmp;
+                       if (!cmp)
+                               goto ok;
+               }
+
+               err = -EINVAL;
+               printf("primary and backup footers do not match\n");
+               goto out;
+       }
+
+ok:
+       err = 0;
+       memcpy(footer, &primary, sizeof(primary));
+
+out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Read the header stored at footer->data_offset of the image open on @fd
+ * and validate it with vhd_util_check_validate_header().
+ *
+ * Returns 0 if the header is valid, otherwise a negative errno value after
+ * printing a message.
+ */
+static int
+vhd_util_check_header(int fd, vhd_footer_t *footer)
+{
+       int err;
+       off_t off;
+       char *msg, *buf = NULL;
+       vhd_header_t header;
+
+       err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, sizeof(header));
+       if (err) {
+               printf("error allocating header: %d\n", err);
+               /* posix_memalign() returns a positive errno; negate it so
+                * every failure path of this function returns a negative */
+               return -err;
+       }
+
+       off = footer->data_offset;
+       off = lseek(fd, off, SEEK_SET);
+       if (off == (off_t)-1) {
+               err = -errno;
+               printf("error seeking to header: %d\n", err);
+               goto out;
+       }
+
+       /* clear errno so a short read reports -EIO rather than a stale code */
+       errno = 0;
+       err = read(fd, buf, sizeof(header));
+       if (err != sizeof(header)) {
+               err = (errno ? -errno : -EIO);
+               printf("error reading header: %d\n", err);
+               goto out;
+       }
+
+       memcpy(&header, buf, sizeof(header));
+       vhd_header_in(&header);
+
+       msg = vhd_util_check_validate_header(fd, &header);
+       if (msg) {
+               err = -EINVAL;
+               printf("header is invalid: %s\n", msg);
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Run vhd_util_check_validate_differencing_header() on @vhd and report the
+ * outcome: 0 when no problem is returned, -EINVAL (with a message) otherwise.
+ */
+static int
+vhd_util_check_differencing_header(vhd_context_t *vhd)
+{
+       char *reason = vhd_util_check_validate_differencing_header(vhd);
+
+       if (!reason)
+               return 0;
+
+       printf("differencing header is invalid: %s\n", reason);
+       return -EINVAL;
+}
+
+/*
+ * Sanity-check the block allocation table of @vhd.
+ *
+ * Every allocated BAT entry must start at or after the end of the headers
+ * (vhd_end_of_headers()), must end before the footer at the end of the file,
+ * and must not overlap any other allocated block.  Offsets are compared in
+ * sectors; a block occupies vhd->spb + vhd->bm_secs sectors.
+ *
+ * Returns 0 if the BAT is consistent, otherwise an error code after printing
+ * a diagnostic.
+ */
+static int
+vhd_util_check_bat(vhd_context_t *vhd)
+{
+       off_t eof, eoh;
+       int i, j, err, block_size;
+
+       err = vhd_seek(vhd, 0, SEEK_END);
+       if (err) {
+               printf("error calculating eof: %d\n", err);
+               return err;
+       }
+
+       eof = vhd_position(vhd);
+       if (eof == (off_t)-1) {
+               /* capture errno before printf() has a chance to change it */
+               err = -errno;
+               printf("error calculating eof: %d\n", err);
+               return err;
+       }
+
+       /* adjust eof for vhds with short footers */
+       if (eof % 512) {
+               if (eof % 512 != 511) {
+                       printf("invalid file size: 0x%"PRIx64"\n",
+                              (uint64_t)eof);
+                       return -EINVAL;
+               }
+
+               eof++;
+       }
+
+       err = vhd_get_bat(vhd);
+       if (err) {
+               printf("error reading bat: %d\n", err);
+               return err;
+       }
+
+       err = vhd_end_of_headers(vhd, &eoh);
+       if (err) {
+               printf("error calculating end of metadata: %d\n", err);
+               return err;
+       }
+
+       /* convert both limits to sectors, excluding the trailing footer */
+       eof  -= sizeof(vhd_footer_t);
+       eof >>= VHD_SECTOR_SHIFT;
+       eoh >>= VHD_SECTOR_SHIFT;
+       block_size = vhd->spb + vhd->bm_secs;
+
+       for (i = 0; i < vhd->header.max_bat_size; i++) {
+               uint32_t off = vhd->bat.bat[i];
+               int64_t  end;
+
+               if (off == DD_BLK_UNUSED)
+                       continue;
+
+               if (off < eoh) {
+                       printf("block %d (offset 0x%x) clobbers headers\n",
+                              i, off);
+                       return -EINVAL;
+               }
+
+               /*
+                * Compute block ends in 64 bits: a 32-bit sum can wrap for
+                * offsets near UINT32_MAX and hide a corrupt entry.
+                */
+               end = (int64_t)off + block_size;
+               if (end > eof) {
+                       printf("block %d (offset 0x%x) clobbers footer\n",
+                              i, off);
+                       return -EINVAL;
+               }
+
+               for (j = 0; j < vhd->header.max_bat_size; j++) {
+                       uint32_t joff = vhd->bat.bat[j];
+                       int64_t  jend;
+
+                       if (i == j || joff == DD_BLK_UNUSED)
+                               continue;
+
+                       jend = (int64_t)joff + block_size;
+
+                       /* same start, or either edge of i inside block j */
+                       if (off == joff ||
+                           (off > joff && off < jend) ||
+                           (end > joff && end < jend)) {
+                               printf("block %d (offset 0x%x) clobbers "
+                                      "block %d (offset 0x%x)\n",
+                                      i, off, j, joff);
+                               return -EINVAL;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Load the BAT and batmap of @vhd, validate the batmap with
+ * vhd_util_check_validate_batmap(), and make sure no block that
+ * vhd_batmap_test() reports as set is unallocated in the BAT.
+ *
+ * Returns 0 on success, otherwise an error code after printing a message.
+ */
+static int
+vhd_util_check_batmap(vhd_context_t *vhd)
+{
+       int blk, rc;
+       char *reason;
+
+       rc = vhd_get_bat(vhd);
+       if (rc) {
+               printf("error reading bat: %d\n", rc);
+               return rc;
+       }
+
+       rc = vhd_get_batmap(vhd);
+       if (rc) {
+               printf("error reading batmap: %d\n", rc);
+               return rc;
+       }
+
+       reason = vhd_util_check_validate_batmap(vhd, &vhd->batmap);
+       if (reason) {
+               printf("batmap is invalid: %s\n", reason);
+               return -EINVAL;
+       }
+
+       for (blk = 0; blk < vhd->header.max_bat_size; blk++) {
+               /* a set batmap bit requires an allocated BAT entry */
+               if (vhd_batmap_test(vhd, &vhd->batmap, blk) &&
+                   vhd->bat.bat[blk] == DD_BLK_UNUSED) {
+                       printf("batmap shows unallocated block %d full\n",
+                              blk);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Validate all parent locator entries in @vhd's header.
+ *
+ * Each entry is checked with vhd_util_check_validate_parent_locator() and a
+ * platform code may appear at most once.  For MACX, W2RU and W2KU entries
+ * the locator is read, its basename is compared with the parent name decoded
+ * from the header, the path is resolved with vhd_find_parent(), and the
+ * resolved file is checked with vhd_util_check_validate_parent().  At least
+ * one such entry has to succeed.
+ *
+ * Returns 0 on success, otherwise an error code after printing a message.
+ */
+static int
+vhd_util_check_parent_locators(vhd_context_t *vhd)
+{
+       int idx, count, err;
+       int *seen;
+       vhd_parent_locator_t *loc;
+       char *file, *ppath, *location, *pname;
+       const char *msg;
+       int mac = 0, macx = 0, w2ku = 0, w2ru = 0, wi2r = 0, wi2k = 0;
+       int found = 0;
+
+       pname    = NULL;
+       ppath    = NULL;
+       location = NULL;
+
+       err = vhd_header_decode_parent(vhd, &vhd->header, &pname);
+       if (err) {
+               printf("error decoding parent name: %d\n", err);
+               return err;
+       }
+
+       count = sizeof(vhd->header.loc) / sizeof(vhd->header.loc[0]);
+       for (idx = 0; idx < count; idx++) {
+               ppath    = NULL;
+               location = NULL;
+               loc      = &vhd->header.loc[idx];
+
+               msg = vhd_util_check_validate_parent_locator(vhd, loc);
+               if (msg) {
+                       err = -EINVAL;
+                       printf("invalid parent locator %d: %s\n", idx, msg);
+                       goto out;
+               }
+
+               if (loc->code == PLAT_CODE_NONE)
+                       continue;
+
+               /* pick the occurrence counter for this platform code */
+               switch (loc->code) {
+               case PLAT_CODE_MACX:
+                       seen = &macx;
+                       break;
+               case PLAT_CODE_MAC:
+                       seen = &mac;
+                       break;
+               case PLAT_CODE_W2KU:
+                       seen = &w2ku;
+                       break;
+               case PLAT_CODE_W2RU:
+                       seen = &w2ru;
+                       break;
+               case PLAT_CODE_WI2R:
+                       seen = &wi2r;
+                       break;
+               case PLAT_CODE_WI2K:
+                       seen = &wi2k;
+                       break;
+               default:
+                       err = -EINVAL;
+                       printf("invalid  platform code for locator %d\n", idx);
+                       goto out;
+               }
+
+               /* a second entry with the same platform code is an error */
+               if ((*seen)++) {
+                       printf("duplicate platform code in locator %d: 0x%x\n",
+                              idx, loc->code);
+                       err = -EINVAL;
+                       goto out;
+               }
+
+               /* only these three platform codes are resolved to a file */
+               if (loc->code != PLAT_CODE_MACX &&
+                   loc->code != PLAT_CODE_W2RU &&
+                   loc->code != PLAT_CODE_W2KU)
+                       continue;
+
+               err = vhd_parent_locator_read(vhd, loc, &ppath);
+               if (err) {
+                       printf("error reading parent locator %d: %d\n",
+                              idx, err);
+                       goto out;
+               }
+
+               file = basename(ppath);
+               if (strcmp(pname, file)) {
+                       err = -EINVAL;
+                       printf("parent locator %d name (%s) does not match "
+                              "header name (%s)\n", idx, file, pname);
+                       goto out;
+               }
+
+               err = vhd_find_parent(vhd, ppath, &location);
+               if (err) {
+                       printf("error resolving %s: %d\n", ppath, err);
+                       goto out;
+               }
+
+               /* an unreadable file is only reported here for MACX entries */
+               err = access(location, R_OK);
+               if (err && loc->code == PLAT_CODE_MACX) {
+                       err = -errno;
+                       printf("parent locator %d points to missing file %s "
+                               "(resolved to %s)\n", idx, ppath, location);
+                       goto out;
+               }
+
+               msg = vhd_util_check_validate_parent(vhd, location);
+               if (msg) {
+                       err = -EINVAL;
+                       printf("invalid parent %s: %s\n", location, msg);
+                       goto out;
+               }
+
+               found++;
+               free(ppath);
+               free(location);
+               ppath    = NULL;
+               location = NULL;
+       }
+
+       if (!found) {
+               err = -EINVAL;
+               printf("could not find parent %s\n", pname);
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       free(pname);
+       free(ppath);
+       free(location);
+       return err;
+}
+
+/*
+ * Announce that @name looks invalid and invoke vhd_util_read() with the
+ * arguments "read -p -n <name>".
+ */
+static void
+vhd_util_dump_headers(const char *name)
+{
+       char *args[4];
+
+       args[0] = "read";
+       args[1] = "-p";
+       args[2] = "-n";
+       args[3] = (char *)name;
+
+       printf("%s appears invalid; dumping metadata\n", name);
+       vhd_util_read((int)(sizeof(args) / sizeof(args[0])), args);
+}
+
+/*
+ * Run the full set of consistency checks on the image file @name.
+ *
+ * The footer is always checked.  For dynamic and differencing images the
+ * header, differencing header, BAT, batmap (when present) and, for
+ * differencing images, the parent locators are checked as well.  On any
+ * failure the image metadata is dumped via vhd_util_dump_headers().
+ *
+ * @ignore is passed through to vhd_util_check_footer().
+ *
+ * Returns 0 on success, otherwise an error code.
+ */
+static int
+vhd_util_check_vhd(const char *name, int ignore)
+{
+       int fd, err;
+       vhd_context_t vhd;
+       struct stat stats;
+       vhd_footer_t footer;
+
+       fd = -1;
+       /* zeroed so the unconditional vhd_close() below sees a clean context */
+       memset(&vhd, 0, sizeof(vhd));
+       memset(&footer, 0, sizeof(footer));
+
+       err = stat(name, &stats);
+       if (err == -1) {
+               /* save errno first: printf() is allowed to modify it */
+               err = -errno;
+               printf("cannot stat %s: %d\n", name, -err);
+               return err;
+       }
+
+       if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
+               printf("%s is not a regular file or block device\n", name);
+               return -EINVAL;
+       }
+
+       fd = open(name, O_RDONLY | O_DIRECT | O_LARGEFILE);
+       if (fd == -1) {
+               /* save errno first: printf() is allowed to modify it */
+               err = -errno;
+               printf("error opening %s\n", name);
+               return err;
+       }
+
+       err = vhd_util_check_footer(fd, &footer, ignore);
+       if (err)
+               goto out;
+
+       /* nothing beyond the footer is checked for other disk types */
+       if (footer.type != HD_TYPE_DYNAMIC && footer.type != HD_TYPE_DIFF)
+               goto out;
+
+       err = vhd_util_check_header(fd, &footer);
+       if (err)
+               goto out;
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+       if (err)
+               goto out;
+
+       err = vhd_util_check_differencing_header(&vhd);
+       if (err)
+               goto out;
+
+       err = vhd_util_check_bat(&vhd);
+       if (err)
+               goto out;
+
+       if (vhd_has_batmap(&vhd)) {
+               err = vhd_util_check_batmap(&vhd);
+               if (err)
+                       goto out;
+       }
+
+       if (vhd.footer.type == HD_TYPE_DIFF) {
+               err = vhd_util_check_parent_locators(&vhd);
+               if (err)
+                       goto out;
+       }
+
+       err = 0;
+       printf("%s is valid\n", name);
+
+out:
+       if (err)
+               vhd_util_dump_headers(name);
+       if (fd != -1)
+               close(fd);
+       vhd_close(&vhd);
+       return err;
+}
+
+/*
+ * Walk the parent chain starting at @name and run vhd_util_check_vhd() on
+ * every parent.  The walk stops at the first image that is not a
+ * differencing disk or whose parent is raw, or at the first error.
+ *
+ * Returns 0 if every parent checked out, otherwise the first error code.
+ */
+static int
+vhd_util_check_parents(const char *name, int ignore)
+{
+       int rc;
+       vhd_context_t image;
+       char *path, *next;
+
+       /* path aliases @name until the first parent name is obtained */
+       path = (char *)name;
+
+       do {
+               rc = vhd_open(&image, path,
+                             VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+               if (rc)
+                       break;
+
+               if (image.footer.type != HD_TYPE_DIFF ||
+                   vhd_parent_raw(&image)) {
+                       vhd_close(&image);
+                       break;
+               }
+
+               rc = vhd_parent_locator_get(&image, &next);
+               vhd_close(&image);
+               if (rc) {
+                       printf("error getting parent: %d\n", rc);
+                       break;
+               }
+
+               /* release the previous parent name, never the caller's */
+               if (path != name)
+                       free(path);
+               path = next;
+
+               rc = vhd_util_check_vhd(path, ignore);
+       } while (!rc);
+
+       if (rc)
+               printf("error checking parents: %d\n", rc);
+       if (path != name)
+               free(path);
+       return rc;
+}
+
+/*
+ * Entry point of the "check" sub-command.
+ *
+ * Options:
+ *   -n <file>  image to check (required)
+ *   -i         passed as the ignore flag to the footer check
+ *   -p         also check every parent of the image
+ *   -h         print usage
+ *
+ * Returns 0 on success (or for -h), -EINVAL on bad usage, otherwise the
+ * error code of the failing check.
+ */
+int
+vhd_util_check(int argc, char **argv)
+{
+       char *name;
+       int c, err, ignore, parents;
+
+       if (!argc || !argv) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       ignore  = 0;
+       parents = 0;
+       name    = NULL;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:iph")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'i':
+                       ignore = 1;
+                       break;
+               case 'p':
+                       parents = 1;
+                       break;
+               case 'h':
+                       err = 0;
+                       goto usage;
+               default:
+                       err = -EINVAL;
+                       goto usage;
+               }
+       }
+
+       /* -n is mandatory and no positional arguments are accepted */
+       if (!name || optind != argc) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       err = vhd_util_check_vhd(name, ignore);
+       if (err)
+               goto out;
+
+       if (parents)
+               err = vhd_util_check_parents(name, ignore);
+
+out:
+       return err;
+
+usage:
+       printf("options: -n <file> [-i ignore missing primary footers] "
+              "[-p check parents] [-h help]\n");
+       return err;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-coalesce.c b/tools/blktap2/vhd/lib/vhd-util-coalesce.c
new file mode 100644 (file)
index 0000000..63dcf60
--- /dev/null
@@ -0,0 +1,218 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Write @secs sectors from @buf to the raw file @fd, starting at sector @sec.
+ *
+ * Returns 0 when the whole range was written, -errno on a failed seek or
+ * write, and -EIO on a short write that left errno unset.
+ */
+static int
+__raw_io_write(int fd, char* buf, uint64_t sec, uint32_t secs)
+{
+       int err;
+       off_t off;
+       ssize_t ret;    /* write() returns ssize_t; -1 signals failure */
+
+       errno = 0;
+       off = lseek(fd, vhd_sectors_to_bytes(sec), SEEK_SET);
+       if (off == (off_t)-1) {
+               /* save errno first: printf() is allowed to modify it */
+               err = -errno;
+               printf("raw parent: seek(0x%08"PRIx64") failed: %d\n",
+                      vhd_sectors_to_bytes(sec), err);
+               return err;
+       }
+
+       ret = write(fd, buf, vhd_sectors_to_bytes(secs));
+       if (ret >= 0 && (uint64_t)ret == vhd_sectors_to_bytes(secs))
+               return 0;
+
+       err = errno;
+       printf("raw parent: write of 0x%"PRIx64" returned %zd, errno: %d\n",
+              vhd_sectors_to_bytes(secs), ret, -err);
+       return (err ? -err : -EIO);
+}
+
+/*
+ * Write @secs sectors from @buf at sector @sec of the parent.
+ * Use 'parent' if the parent is VHD, and 'parent_fd' if the parent is raw
+ */
+static int
+vhd_util_coalesce_write(vhd_context_t *parent, int parent_fd,
+                       char *buf, uint64_t sec, uint64_t secs)
+{
+       if (parent->file)
+               return vhd_io_write(parent, buf, sec, secs);
+
+       return __raw_io_write(parent_fd, buf, sec, secs);
+}
+
+/*
+ * Copy the data of one block of @vhd into its parent.
+ *
+ * An unallocated block is skipped.  If the batmap marks the block, the whole
+ * block is written; otherwise the block bitmap is read and only runs of
+ * consecutive sectors whose bit is set are written.
+ *
+ * Use 'parent' if the parent is VHD, and 'parent_fd' if the parent is raw
+ *
+ * Returns 0 on success, otherwise an error code.
+ */
+static int
+vhd_util_coalesce_block(vhd_context_t *vhd, vhd_context_t *parent,
+               int parent_fd, uint64_t block)
+{
+       int i, err;
+       char *data = NULL, *bitmap = NULL;
+       uint64_t first, run;
+
+       if (vhd->bat.bat[block] == DD_BLK_UNUSED)
+               return 0;
+
+       /* first sector of this block */
+       first = block * vhd->spb;
+
+       err = posix_memalign((void **)&data, 4096, vhd->header.block_size);
+       if (err)
+               return -err;
+
+       err = vhd_io_read(vhd, data, first, vhd->spb);
+       if (err)
+               goto done;
+
+       if (vhd_has_batmap(vhd) && vhd_batmap_test(vhd, &vhd->batmap, block)) {
+               err = vhd_util_coalesce_write(parent, parent_fd,
+                                             data, first, vhd->spb);
+               goto done;
+       }
+
+       err = vhd_read_bitmap(vhd, block, &bitmap);
+       if (err)
+               goto done;
+
+       i = 0;
+       while (i < vhd->spb) {
+               if (!vhd_bitmap_test(vhd, bitmap, i)) {
+                       i++;
+                       continue;
+               }
+
+               /* measure the run of set bits that starts at sector i */
+               for (run = 0; i + run < vhd->spb; run++)
+                       if (!vhd_bitmap_test(vhd, bitmap, i + run))
+                               break;
+
+               err = vhd_util_coalesce_write(parent, parent_fd,
+                                             data + vhd_sectors_to_bytes(i),
+                                             first + i, run);
+               if (err)
+                       goto done;
+
+               /* the sector right after the run is known to be clear */
+               i += run + 1;
+       }
+
+       err = 0;
+
+done:
+       free(data);
+       free(bitmap);
+       return err;
+}
+
+/*
+ * Entry point of the "coalesce" sub-command: copy every allocated block of
+ * the image given with -n into its parent.
+ *
+ * The parent is opened as a raw file when vhd_parent_raw() says so, and as a
+ * VHD otherwise.
+ *
+ * Returns 0 on success, -EINVAL on bad usage, otherwise an error code.
+ */
+int
+vhd_util_coalesce(int argc, char **argv)
+{
+       int err, c;
+       uint64_t i;
+       char *name, *pname;
+       vhd_context_t vhd, parent;
+       int parent_fd = -1;
+
+       name  = NULL;
+       pname = NULL;
+       /* parent.file stays NULL when the parent is raw; tested at cleanup */
+       parent.file = NULL;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       if (!name || optind != argc)
+               goto usage;
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       err = vhd_parent_locator_get(&vhd, &pname);
+       if (err) {
+               printf("error finding %s parent: %d\n", name, err);
+               vhd_close(&vhd);
+               return err;
+       }
+
+       if (vhd_parent_raw(&vhd)) {
+               parent_fd = open(pname, O_RDWR | O_DIRECT | O_LARGEFILE, 0644);
+               if (parent_fd == -1) {
+                       err = -errno;
+                       printf("failed to open parent %s: %d\n", pname, err);
+                       /* pname was allocated by vhd_parent_locator_get() */
+                       free(pname);
+                       vhd_close(&vhd);
+                       return err;
+               }
+       } else {
+               err = vhd_open(&parent, pname, VHD_OPEN_RDWR);
+               if (err) {
+                       printf("error opening %s: %d\n", pname, err);
+                       free(pname);
+                       vhd_close(&vhd);
+                       return err;
+               }
+       }
+
+       err = vhd_get_bat(&vhd);
+       if (err)
+               goto done;
+
+       if (vhd_has_batmap(&vhd)) {
+               err = vhd_get_batmap(&vhd);
+               if (err)
+                       goto done;
+       }
+
+       for (i = 0; i < vhd.bat.entries; i++) {
+               err = vhd_util_coalesce_block(&vhd, &parent, parent_fd, i);
+               if (err)
+                       goto done;
+       }
+
+       err = 0;
+
+ done:
+       free(pname);
+       vhd_close(&vhd);
+       if (parent.file)
+               vhd_close(&parent);
+       else
+               close(parent_fd);
+       return err;
+
+usage:
+       printf("options: <-n name> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-create.c b/tools/blktap2/vhd/lib/vhd-util-create.c
new file mode 100644 (file)
index 0000000..a9bdf05
--- /dev/null
@@ -0,0 +1,80 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Entry point of the "create" sub-command.
+ *
+ * Options:
+ *   -n <name>  file to create (required)
+ *   -s <size>  size in MB (required)
+ *   -r         create an HD_TYPE_FIXED image instead of HD_TYPE_DYNAMIC
+ *   -h         print usage
+ *
+ * Returns the result of vhd_create(), or -EINVAL on bad usage.
+ */
+int
+vhd_util_create(int argc, char **argv)
+{
+       char *name = NULL;
+       uint64_t size = 0;
+       int opt, type;
+       int sparse = 1;
+       int err = -EINVAL;      /* cleared once a -s option has been seen */
+       vhd_flag_creat_t flags = 0;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "n:s:rh")) != -1) {
+               if (opt == 'n') {
+                       name = optarg;
+               } else if (opt == 's') {
+                       err  = 0;
+                       size = strtoull(optarg, NULL, 10);
+               } else if (opt == 'r') {
+                       sparse = 0;
+               } else {
+                       goto usage;
+               }
+       }
+
+       if (err || !name || optind != argc)
+               goto usage;
+
+       type = (sparse ? HD_TYPE_DYNAMIC : HD_TYPE_FIXED);
+
+       /* the size is given in MB; shift by 20 to convert to bytes */
+       return vhd_create(name, size << 20, type, flags);
+
+usage:
+       printf("options: <-n name> <-s size (MB)> [-r reserve] [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-fill.c b/tools/blktap2/vhd/lib/vhd-util-fill.c
new file mode 100644 (file)
index 0000000..afbfcce
--- /dev/null
@@ -0,0 +1,105 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Entry point of the "fill" sub-command: read every block of the image
+ * given with -n and write the same data straight back to it.
+ *
+ * Returns 0 on success, -EINVAL on bad usage, otherwise an error code.
+ */
+int
+vhd_util_fill(int argc, char **argv)
+{
+       int rc, opt;
+       char *block = NULL, *name = NULL;
+       vhd_context_t vhd;
+       uint64_t n, first, count;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "n:h")) != -1) {
+               /* -n is the only option that does not end in the usage text */
+               if (opt != 'n')
+                       goto usage;
+               name = optarg;
+       }
+
+       if (!name || optind != argc)
+               goto usage;
+
+       rc = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+       if (rc) {
+               printf("error opening %s: %d\n", name, rc);
+               return rc;
+       }
+
+       rc = vhd_get_bat(&vhd);
+       if (rc)
+               goto done;
+
+       rc = posix_memalign((void **)&block, 4096, vhd.header.block_size);
+       if (rc) {
+               rc = -rc;
+               goto done;
+       }
+
+       /* sectors per block */
+       count = vhd.header.block_size >> VHD_SECTOR_SHIFT;
+
+       for (n = 0, first = 0; n < vhd.header.max_bat_size;
+            n++, first += count) {
+               rc = vhd_io_read(&vhd, block, first, count);
+               if (rc)
+                       goto done;
+
+               rc = vhd_io_write(&vhd, block, first, count);
+               if (rc)
+                       goto done;
+       }
+
+       rc = 0;
+
+ done:
+       free(block);
+       vhd_close(&vhd);
+       return rc;
+
+usage:
+       printf("options: <-n name> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-modify.c b/tools/blktap2/vhd/lib/vhd-util-modify.c
new file mode 100644 (file)
index 0000000..b563d6a
--- /dev/null
@@ -0,0 +1,132 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Altering operations:
+ *
+ * 1. Change the parent pointer to another file.
+ * 2. Change the size of the file containing the VHD image. This does NOT 
+ * affect the VHD disk capacity, only the physical size of the file containing 
+ * the VHD. Naturally, it is not possible to set the file size to be less than
+ * what the VHD utilizes.
+ * The operation doesn't actually change the file size, but it writes the 
+ * footer in the right location such that resizing the file (manually, as a 
+ * separate step) will produce the correct results. If the new file size is 
+ * greater than the current file size, the file must first be expanded and then 
+ * altered with this operation. If the new size is smaller than the current 
+ * size, the VHD must first be altered with this operation and then the file 
+ * must be shrunk. Failing to resize the file will result in a corrupted VHD.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Macro defined outside this file; presumably declares the external state
+ * consulted by the TEST_FAIL_AT() fault-injection points -- TODO confirm.
+ */
+TEST_FAIL_EXTERN_VARS;
+
+/*
+ * vhd_util_modify: apply the altering operations described at the top of
+ * this file to the VHD named by -n.
+ *
+ * Options:
+ *   -n <name>        VHD to modify (required)
+ *   -s <NEW_SIZE>    set the physical size (non-negative decimal integer)
+ *   -p <NEW_PARENT>  set the parent
+ *   -m               passed to vhd_change_parent() as its "raw" flag
+ *   -h               usage
+ *
+ * Both operations are attempted when both are requested.  The first error
+ * encountered is preserved unless the reparent itself fails, in which case
+ * the reparent error is returned.
+ *
+ * Returns 0 on success, -EINVAL on bad usage (including -h), otherwise the
+ * error reported by vhd_open()/vhd_set_phys_size()/vhd_change_parent().
+ */
+int
+vhd_util_modify(int argc, char **argv)
+{
+       char *name;
+       vhd_context_t vhd;
+       int err, ret, c, size, parent, parent_raw;
+       off_t newsize = 0;
+       char *newparent = NULL;
+       char *end;
+       long long val;
+
+       name       = NULL;
+       size       = 0;
+       parent     = 0;
+       parent_raw = 0;
+
+       if (!argc || !argv)
+               goto usage;
+
+       /* restart getopt so this sub-command can parse its own argv */
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:s:p:mh")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 's':
+                       size = 1;
+                       errno = 0;
+                       val = strtoll(optarg, &end, 10);
+                       /*
+                        * Reject overflow, empty/non-numeric input and
+                        * trailing garbage: strtoll() alone would quietly
+                        * turn "abc" into a size of 0.  Negative sizes are
+                        * meaningless as well.
+                        */
+                       if (errno || end == optarg || *end != '\0' ||
+                           val < 0) {
+                               fprintf(stderr, "Invalid size '%s'\n", optarg);
+                               goto usage;
+                       }
+                       newsize = val;
+                       break;
+               case 'p':
+                       parent = 1;
+                       newparent = optarg;
+                       break;
+               case 'm':
+                       parent_raw = 1;
+                       break;
+
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       if (!name || optind != argc)
+               goto usage;
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       if (size) {
+               err = vhd_set_phys_size(&vhd, newsize);
+               if (err)
+                       /* off_t is signed; cast to match the PRIu64 format */
+                       printf("failed to set physical size to %"PRIu64":"
+                              " %d\n", (uint64_t)newsize, err);
+       }
+
+       if (parent) {
+               TEST_FAIL_AT(FAIL_REPARENT_BEGIN);
+               /*
+                * Use a separate status so that a successful reparent does
+                * not hide a failed resize from the caller.
+                */
+               ret = vhd_change_parent(&vhd, newparent, parent_raw);
+               if (ret) {
+                       printf("failed to set parent to '%s': %d\n",
+                                       newparent, ret);
+                       err = ret;
+                       goto done;
+               }
+               TEST_FAIL_AT(FAIL_REPARENT_END);
+       }
+
+done:
+       vhd_close(&vhd);
+       return err;
+
+usage:
+       printf("*** Dangerous operations, use with care ***\n");
+       printf("options: <-n name> [-p NEW_PARENT set parent [-m raw]] "
+                       "[-s NEW_SIZE set size] [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-query.c b/tools/blktap2/vhd/lib/vhd-util-query.c
new file mode 100644 (file)
index 0000000..44a22d0
--- /dev/null
@@ -0,0 +1,159 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * vhd_util_query: print selected properties of the VHD named by -n.
+ *
+ * Options:
+ *   -n <name>  VHD to inspect (required)
+ *   -v         print footer.curr_size >> 20 (virtual size in MB)
+ *   -s         print the value reported by vhd_get_phys_size()
+ *   -p         print the parent name, or "<name> has no parent"
+ *   -f         print the 'hidden' field
+ *   -d         print the chain depth
+ *   -h         usage (returns 0)
+ *
+ * All requested queries are attempted; the first non-zero status among
+ * them is what gets returned.  Bad usage returns -EINVAL.
+ */
+int
+vhd_util_query(int argc, char **argv)
+{
+       char *name;
+       vhd_context_t vhd;
+       off_t currsize;
+       int ret, err, c, size, physize, parent, fields, depth;
+
+       name    = NULL;
+       size    = 0;
+       physize = 0;
+       parent  = 0;
+       fields  = 0;
+       depth   = 0;
+
+       if (!argc || !argv) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       /* restart getopt so this sub-command can parse its own argv */
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:vspfdh")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'v':
+                       size = 1;
+                       break;
+               case 's':
+                       physize = 1;
+                       break;
+               case 'p':
+                       parent = 1;
+                       break;
+               case 'f':
+                       fields = 1;
+                       break;
+               case 'd':
+                       depth = 1;
+                       break;
+               case 'h':
+                       err = 0;
+                       goto usage;
+               default:
+                       err = -EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (!name || optind != argc) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       if (size)
+               printf("%"PRIu64"\n", vhd.footer.curr_size >> 20);
+
+       if (physize) {
+               err = vhd_get_phys_size(&vhd, &currsize);
+               if (err)
+                       printf("failed to get physical size: %d\n", err);
+               else
+                       /* off_t is signed; cast to match the PRIu64 format */
+                       printf("%"PRIu64"\n", (uint64_t)currsize);
+       }
+
+       if (parent) {
+               ret = 0;
+
+               if (vhd.footer.type != HD_TYPE_DIFF)
+                       printf("%s has no parent\n", name);
+               else {
+                       char *pname;
+
+                       ret = vhd_parent_locator_get(&vhd, &pname);
+                       if (ret)
+                               printf("query failed\n");
+                       else {
+                               printf("%s\n", pname);
+                               free(pname);
+                       }
+               }
+
+               /* keep the first error seen */
+               err = (err ? err : ret);
+       }
+
+       if (fields) {
+               int hidden;
+
+               ret = vhd_hidden(&vhd, &hidden);
+               if (ret)
+                       printf("error checking 'hidden' field: %d\n", ret);
+               else
+                       printf("hidden: %d\n", hidden);
+
+               err = (err ? err : ret);
+       }
+
+       if (depth) {
+               int length;
+
+               ret = vhd_chain_depth(&vhd, &length);
+               if (ret)
+                       printf("error checking chain depth: %d\n", ret);
+               else
+                       printf("chain depth: %d\n", length);
+
+               err = (err ? err : ret);
+       }
+
+       vhd_close(&vhd);
+       return err;
+
+usage:
+       printf("options: <-n name> [-v print virtual size (in MB)] "
+              "[-s print physical utilization (bytes)] [-p print parent] "
+              "[-f print fields] [-d print chain depth] [-h help]\n");
+       return err;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-read.c b/tools/blktap2/vhd/lib/vhd-util-read.c
new file mode 100644 (file)
index 0000000..ac4d833
--- /dev/null
@@ -0,0 +1,742 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+/*
+ * Scratch buffer for the number-to-string helpers below.  It must hold any
+ * 64-bit value: up to 20 decimal digits, or "0x" plus 16 hex digits, plus
+ * the terminating NUL.  A smaller buffer makes snprintf() silently truncate
+ * large values such as an all-ones 64-bit offset.
+ */
+#define nsize     24
+static char nbuf[nsize];
+
+/*
+ * Render num with the "%#" PRIx64 format into the file-static nbuf and
+ * return nbuf.  The result is overwritten by the next __xconv()/__dconv()
+ * call, so it must be consumed before converting another number.
+ */
+static inline char *
+__xconv(uint64_t num)
+{
+       (void)snprintf(&nbuf[0], sizeof(nbuf), "%#" PRIx64 , num);
+       return &nbuf[0];
+}
+
+/*
+ * Render num as unsigned decimal ("%" PRIu64) into the file-static nbuf and
+ * return nbuf.  The result is overwritten by the next __xconv()/__dconv()
+ * call, so it must be consumed before converting another number.
+ */
+static inline char *
+__dconv(uint64_t num)
+{
+       (void)snprintf(&nbuf[0], sizeof(nbuf), "%" PRIu64, num);
+       return &nbuf[0];
+}
+
+/*
+ * conv(hex, num): format num as hexadecimal when hex is non-zero, otherwise
+ * as decimal.  Returns the shared static buffer, so use at most one conv()
+ * per printf() call.  Both arguments are parenthesized so that compound
+ * expressions are converted as a whole.
+ */
+#define conv(hex, num) \
+       ((hex) ? __xconv((uint64_t)(num)) : __dconv((uint64_t)(num)))
+
+/*
+ * Print a human-readable summary of the dynamic-disk header h to stdout.
+ *
+ * vhd is only handed to vhd_header_decode_parent() to decode the parent
+ * name.  hex selects hexadecimal (non-zero) or decimal output for the
+ * fields formatted through conv().  The checksum line shows the stored
+ * value next to the one recomputed by vhd_checksum_header().
+ */
+static void
+vhd_print_header(vhd_context_t *vhd, vhd_header_t *h, int hex)
+{
+       int err;
+       uint32_t  cksm;
+       char      uuid[39], time_str[26], cookie[9], *name;
+
+       /*
+        * Start from NULL so the free() below is safe even if the decode
+        * fails without storing anything through &name.
+        */
+       name = NULL;
+
+       printf("VHD Header Summary:\n-------------------\n");
+
+       /*
+        * Bound the read with a precision: the on-disk cookie is not
+        * guaranteed to be NUL-terminated, and a bare "%s" may scan past it.
+        */
+       snprintf(cookie, sizeof(cookie), "%.*s",
+                (int)(sizeof(cookie) - 1), h->cookie);
+       printf("Cookie              : %s\n", cookie);
+
+       printf("Data offset (unusd) : %s\n", conv(hex, h->data_offset));
+       printf("Table offset        : %s\n", conv(hex, h->table_offset));
+       printf("Header version      : 0x%08x\n", h->hdr_ver);
+       printf("Max BAT size        : %s\n", conv(hex, h->max_bat_size));
+       printf("Block size          : %s ", conv(hex, h->block_size));
+       printf("(%s MB)\n", conv(hex, h->block_size >> 20));
+
+       err = vhd_header_decode_parent(vhd, h, &name);
+       printf("Parent name         : %s\n",
+              (err ? "failed to read name" : name));
+       free(name);
+
+       vhd_uuid_to_string(&h->prt_uuid, uuid, sizeof(uuid));
+       printf("Parent UUID         : %s\n", uuid);
+
+       vhd_time_to_string(h->prt_ts, time_str);
+       printf("Parent timestamp    : %s\n", time_str);
+
+       cksm = vhd_checksum_header(h);
+       printf("Checksum            : 0x%x|0x%x (%s)\n", h->checksum, cksm,
+               h->checksum == cksm ? "Good!" : "Bad!");
+       printf("\n");
+}
+
+/*
+ * Print a human-readable summary of the VHD footer f to stdout.
+ *
+ * hex selects hexadecimal (non-zero) or decimal output for the fields
+ * formatted through conv().  Version words are split into their upper and
+ * lower 16 bits; the geometry word is split into cylinders (upper 16 bits),
+ * heads (bits 8-15) and sectors (low 8 bits).  The checksum line shows the
+ * stored value next to the one recomputed by vhd_checksum_footer().
+ */
+static void
+vhd_print_footer(vhd_footer_t *f, int hex)
+{
+       uint64_t  c, h, s;
+       uint32_t  ff_maj, ff_min, cr_maj, cr_min, cksm;
+       char      time_str[26], creator[5], uuid[39], cookie[9];
+
+       printf("VHD Footer Summary:\n-------------------\n");
+
+       /*
+        * Bound the read with a precision: the on-disk cookie is not
+        * guaranteed to be NUL-terminated, and a bare "%s" may scan past it.
+        */
+       snprintf(cookie, sizeof(cookie), "%.*s",
+                (int)(sizeof(cookie) - 1), f->cookie);
+       printf("Cookie              : %s\n", cookie);
+
+       printf("Features            : (0x%08x) %s%s\n", f->features,
+               (f->features & HD_TEMPORARY) ? "<TEMP>" : "",
+               (f->features & HD_RESERVED)  ? "<RESV>" : "");
+
+       /* the values are uint32_t, hence %u rather than %d */
+       ff_maj = f->ff_version >> 16;
+       ff_min = f->ff_version & 0xffff;
+       printf("File format version : Major: %u, Minor: %u\n",
+               ff_maj, ff_min);
+
+       printf("Data offset         : %s\n", conv(hex, f->data_offset));
+
+       vhd_time_to_string(f->timestamp, time_str);
+       printf("Timestamp           : %s\n", time_str);
+
+       /* crtr_app is copied as 4 raw bytes and terminated locally */
+       memcpy(creator, f->crtr_app, 4);
+       creator[4] = '\0';
+       printf("Creator Application : '%s'\n", creator);
+
+       cr_maj = f->crtr_ver >> 16;
+       cr_min = f->crtr_ver & 0xffff;
+       printf("Creator version     : Major: %u, Minor: %u\n",
+               cr_maj, cr_min);
+
+       printf("Creator OS          : %s\n",
+               ((f->crtr_os == HD_CR_OS_WINDOWS) ? "Windows" :
+                ((f->crtr_os == HD_CR_OS_MACINTOSH) ? "Macintosh" :
+                 "Unknown!")));
+
+       printf("Original disk size  : %s MB ", conv(hex, f->orig_size >> 20));
+       printf("(%s Bytes)\n", conv(hex, f->orig_size));
+
+       printf("Current disk size   : %s MB ", conv(hex, f->curr_size >> 20));
+       printf("(%s Bytes)\n", conv(hex, f->curr_size));
+
+       c = f->geometry >> 16;
+       h = (f->geometry & 0x0000FF00) >> 8;
+       s = f->geometry & 0x000000FF;
+       printf("Geometry            : Cyl: %s, ", conv(hex, c));
+       printf("Hds: %s, ", conv(hex, h));
+       printf("Sctrs: %s\n", conv(hex, s));
+       /* c*h*s is a count of 512-byte sectors: >> 11 gives MB, << 9 bytes */
+       printf("                    : = %s MB ", conv(hex, (c * h * s) >> 11));
+       printf("(%s Bytes)\n", conv(hex, c * h * s << 9));
+
+       /* the format already ends the line; no newline in the fallback */
+       printf("Disk type           : %s\n",
+               f->type <= HD_TYPE_MAX ?
+               HD_TYPE_STR[f->type] : "Unknown type!");
+
+       cksm = vhd_checksum_footer(f);
+       printf("Checksum            : 0x%x|0x%x (%s)\n", f->checksum, cksm,
+               f->checksum == cksm ? "Good!" : "Bad!");
+
+       vhd_uuid_to_string(&f->uuid, uuid, sizeof(uuid));
+       printf("UUID                : %s\n", uuid);
+
+       printf("Saved state         : %s\n", f->saved == 0 ? "No" : "Yes");
+       printf("Hidden              : %d\n", f->hidden);
+       printf("\n");
+}
+
+/*
+ * Map a parent-locator platform code to the name of its PLAT_CODE_*
+ * constant.  Returns a pointer to a string literal (do not modify or free
+ * it); codes that match none of the known constants yield "UNKNOWN".
+ */
+static inline char *
+code_name(uint32_t code)
+{
+       switch(code) {
+       case PLAT_CODE_NONE:
+               return "PLAT_CODE_NONE";
+       case PLAT_CODE_WI2R:
+               return "PLAT_CODE_WI2R";
+       case PLAT_CODE_WI2K:
+               return "PLAT_CODE_WI2K";
+       case PLAT_CODE_W2RU:
+               return "PLAT_CODE_W2RU";
+       case PLAT_CODE_W2KU:
+               return "PLAT_CODE_W2KU";
+       case PLAT_CODE_MAC:
+               return "PLAT_CODE_MAC";
+       case PLAT_CODE_MACX:
+               return "PLAT_CODE_MACX";
+       default:
+               return "UNKNOWN";
+       }
+}
+
+/*
+ * Read the parent name referenced by locator loc via
+ * vhd_parent_locator_read() and print it, or print a failure message if
+ * the read fails.
+ */
+static void
+vhd_print_parent(vhd_context_t *vhd, vhd_parent_locator_t *loc)
+{
+       int err;
+       char *buf;
+
+       buf = NULL;
+
+       err = vhd_parent_locator_read(vhd, loc, &buf);
+       if (err) {
+               printf("failed to read parent name\n");
+               return;
+       }
+
+       printf("       decoded name : %s\n", buf);
+
+       /*
+        * The name is handed back through an out-pointer and, like the
+        * other libvhd string getters used in these tools, is treated as
+        * owned by the caller; release it so it does not leak.
+        */
+       free(buf);
+}
+
+/*
+ * Walk every slot of vhd->header.loc and, for each slot whose code is not
+ * PLAT_CODE_NONE, print its index, code name, data space/length/offset and
+ * the decoded parent name.  hex selects hexadecimal or decimal numbers.
+ */
+static void
+vhd_print_parent_locators(vhd_context_t *vhd, int hex)
+{
+       vhd_parent_locator_t *loc;
+       int idx, total;
+
+       printf("VHD Parent Locators:\n--------------------\n");
+
+       total = sizeof(vhd->header.loc) / sizeof(struct prt_loc);
+
+       for (idx = 0; idx < total; idx++) {
+               loc = &vhd->header.loc[idx];
+
+               /* only populated slots are reported */
+               if (loc->code != PLAT_CODE_NONE) {
+                       printf("locator:            : %d\n", idx);
+                       printf("       code         : %s\n",
+                              code_name(loc->code));
+                       printf("       data_space   : %s\n",
+                              conv(hex, loc->data_space));
+                       printf("       data_length  : %s\n",
+                              conv(hex, loc->data_len));
+                       printf("       data_offset  : %s\n",
+                              conv(hex, loc->data_offset));
+                       vhd_print_parent(vhd, loc);
+                       printf("\n");
+               }
+       }
+}
+
+/*
+ * Print the batmap header fields (offset, size in sectors, version) and
+ * the stored checksum next to the value recomputed by
+ * vhd_checksum_batmap().  hex selects hexadecimal or decimal numbers.
+ */
+static void
+vhd_print_batmap_header(vhd_batmap_t *batmap, int hex)
+{
+       uint32_t computed;
+       const char *verdict;
+
+       printf("VHD Batmap Summary:\n-------------------\n");
+
+       printf("Batmap offset       : %s\n",
+              conv(hex, batmap->header.batmap_offset));
+
+       printf("Batmap size (secs)  : %s\n",
+              conv(hex, batmap->header.batmap_size));
+
+       printf("Batmap version      : 0x%08x\n",
+              batmap->header.batmap_version);
+
+       computed = vhd_checksum_batmap(batmap);
+       if (batmap->header.checksum == computed)
+               verdict = "Good!";
+       else
+               verdict = "Bad!";
+
+       /* the trailing blank line is folded into the same call */
+       printf("Checksum            : 0x%x|0x%x (%s)\n\n",
+              batmap->header.checksum, computed, verdict);
+}
+
+/*
+ * Return 0 when block does not exceed vhd->header.max_bat_size; otherwise
+ * report it on stderr and return -ERANGE.  Callers pass "first + count",
+ * i.e. one past the last block they intend to touch.
+ */
+static inline int
+check_block_range(vhd_context_t *vhd, uint64_t block, int hex)
+{
+       if (block <= vhd->header.max_bat_size)
+               return 0;
+
+       fprintf(stderr, "block %s past end of file\n", conv(hex, block));
+       return -ERANGE;
+}
+
+/*
+ * Print the footer and, when vhd_type_dynamic() says so, the header; then
+ * the parent locators for differencing disks and the batmap header when
+ * vhd_has_batmap() reports one.
+ *
+ * Returns 0, or the error from vhd_get_batmap() if the batmap cannot be
+ * loaded.
+ */
+static int
+vhd_print_headers(vhd_context_t *vhd, int hex)
+{
+       int rc;
+
+       vhd_print_footer(&vhd->footer, hex);
+
+       /* nothing beyond the footer is printed for non-dynamic images */
+       if (!vhd_type_dynamic(vhd))
+               return 0;
+
+       vhd_print_header(vhd, &vhd->header, hex);
+
+       if (vhd->footer.type == HD_TYPE_DIFF)
+               vhd_print_parent_locators(vhd, hex);
+
+       if (!vhd_has_batmap(vhd))
+               return 0;
+
+       rc = vhd_get_batmap(vhd);
+       if (rc) {
+               printf("failed to get batmap header\n");
+               return rc;
+       }
+
+       vhd_print_batmap_header(&vhd->batmap, hex);
+       return 0;
+}
+
+/*
+ * Best-effort dump of the footer and header of a file that could not be
+ * opened as a VHD.  The file is opened directly and a zeroed context is
+ * filled in by vhd_read_footer()/vhd_read_header(); their results are
+ * deliberately not checked, so whatever ended up in the structures is
+ * printed.
+ *
+ * Returns 0, -errno if the file cannot be opened, or -ENOMEM if the name
+ * cannot be duplicated.
+ */
+static int
+vhd_dump_headers(const char *name, int hex)
+{
+       vhd_context_t vhd;
+
+       libvhd_set_log_level(1);
+       memset(&vhd, 0, sizeof(vhd));
+
+       printf("\n%s appears invalid; dumping headers\n\n", name);
+
+       vhd.fd = open(name, O_DIRECT | O_LARGEFILE | O_RDONLY);
+       if (vhd.fd == -1)
+               return -errno;
+
+       vhd.file = strdup(name);
+       if (!vhd.file) {
+               /* do not hand a NULL file name to the libvhd readers */
+               close(vhd.fd);
+               return -ENOMEM;
+       }
+
+       vhd_read_footer(&vhd, &vhd.footer);
+       vhd_read_header(&vhd, &vhd.header);
+
+       vhd_print_footer(&vhd.footer, hex);
+       vhd_print_header(&vhd, &vhd.header, hex);
+
+       close(vhd.fd);
+       free(vhd.file);
+
+       return 0;
+}
+
+/*
+ * For each of count logical sectors starting at sector, print the block
+ * number, the sector offset inside that block and the byte offset in the
+ * file, or "not allocated" when the BAT entry is DD_BLK_UNUSED.
+ *
+ * Returns 0, or -ERANGE when the requested range ends beyond
+ * footer.curr_size.
+ */
+static int
+vhd_print_logical_to_physical(vhd_context_t *vhd,
+                             uint64_t sector, int count, int hex)
+{
+       int i;
+       uint32_t blk, lsec;
+       uint64_t cur, offset;
+
+       if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) {
+               fprintf(stderr, "sector %s past end of file\n",
+                       conv(hex, sector + count));
+               return -ERANGE;
+       }
+
+       for (i = 0; i < count; i++) {
+               cur    = sector + i;
+               blk    = cur / vhd->spb;
+               lsec   = cur % vhd->spb;
+               offset = vhd->bat.bat[blk];
+
+               if (offset != DD_BLK_UNUSED) {
+                       /*
+                        * The BAT entry locates the start of the block,
+                        * which begins with its sector bitmap; skip the
+                        * whole bitmap (bm_secs sectors, not a fixed 1) to
+                        * reach the data sectors.
+                        */
+                       offset += vhd->bm_secs + lsec;
+                       offset  = vhd_sectors_to_bytes(offset);
+               }
+
+               printf("logical sector %s: ", conv(hex, cur));
+               printf("block number: %s, ", conv(hex, blk));
+               printf("sector offset: %s, ", conv(hex, lsec));
+               printf("file offset: %s\n", (offset == DD_BLK_UNUSED ?
+                       "not allocated" : conv(hex, offset)));
+       }
+
+       return 0;
+}
+
+/*
+ * Print count BAT entries starting at block: the byte offset obtained from
+ * vhd_sectors_to_bytes() for each entry, or "not allocated" for
+ * DD_BLK_UNUSED entries.
+ *
+ * Returns 0, or -ERANGE when check_block_range() rejects the range.
+ */
+static int
+vhd_print_bat(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+       uint64_t idx, entry;
+       int n;
+
+       if (check_block_range(vhd, block + count, hex) != 0)
+               return -ERANGE;
+
+       for (n = 0; n < count; n++) {
+               idx   = block + n;
+               entry = vhd->bat.bat[idx];
+
+               printf("block: %s: ", conv(hex, idx));
+
+               if (entry == DD_BLK_UNUSED)
+                       printf("offset: %s\n", "not allocated");
+               else
+                       printf("offset: %s\n",
+                              conv(hex, vhd_sectors_to_bytes(entry)));
+       }
+
+       return 0;
+}
+
+/*
+ * Write count bytes from buf to fd, retrying after short writes and after
+ * EINTR.  A NULL buf is ignored.  Any other write() error ends the attempt
+ * silently; the function reports nothing to its caller.
+ */
+static inline void
+write_full(int fd, void* buf, size_t count)
+{
+       /* byte cursor: arithmetic on void * is a compiler extension */
+       const char *cursor = buf;
+       ssize_t num_written;
+
+       if (!buf)
+               return;
+
+       while (count > 0) {
+               num_written = write(fd, cursor, count);
+               if (num_written == -1) {
+                       if (errno == EINTR)
+                               continue;
+                       return;
+               }
+
+               count  -= num_written;
+               cursor += num_written;
+       }
+}
+
+/*
+ * Write the raw bitmap (vhd->bm_secs sectors) of each of count blocks
+ * starting at block to stdout.  Blocks whose BAT entry is DD_BLK_UNUSED
+ * only produce a "not allocated" message.
+ *
+ * Returns 0, -ERANGE when check_block_range() rejects the range, or the
+ * error from vhd_read_bitmap().
+ */
+static int
+vhd_print_bitmap(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+       char *bitmap;
+       uint64_t blk;
+       int n, rc;
+
+       if (check_block_range(vhd, block + count, hex) != 0)
+               return -ERANGE;
+
+       for (n = 0; n < count; n++) {
+               blk = block + n;
+
+               if (vhd->bat.bat[blk] != DD_BLK_UNUSED) {
+                       rc = vhd_read_bitmap(vhd, blk, &bitmap);
+                       if (rc)
+                               return rc;
+
+                       write_full(STDOUT_FILENO, bitmap,
+                                  vhd_sectors_to_bytes(vhd->bm_secs));
+                       free(bitmap);
+               } else {
+                       printf("block %s not allocated\n", conv(hex, blk));
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * For each of count logical sectors starting at sector, print the owning
+ * block, the sector's index within that block and its bit in the block
+ * bitmap.  Sectors in blocks whose BAT entry is DD_BLK_UNUSED are reported
+ * with bit 0.  A block's bitmap is read once and reused for consecutive
+ * sectors of the same block.
+ *
+ * Returns 0, -ERANGE when the range ends beyond footer.curr_size, or the
+ * error from vhd_read_bitmap().
+ */
+static int
+vhd_test_bitmap(vhd_context_t *vhd, uint64_t sector, int count, int hex)
+{
+       char *buf;
+       uint64_t cur;
+       int i, err, bit;
+       uint32_t blk, bm_blk, sec;
+
+       if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) {
+               printf("sector %s past end of file\n", conv(hex, sector));
+               return -ERANGE;
+       }
+
+       bm_blk = -1;    /* no bitmap cached yet */
+       buf    = NULL;
+
+       for (i = 0; i < count; i++) {
+               cur = sector + i;
+               blk = cur / vhd->spb;
+               sec = cur % vhd->spb;
+
+               /* entering a new block: drop the cached bitmap, load anew */
+               if (blk != bm_blk) {
+                       bm_blk = blk;
+                       free(buf);
+                       buf = NULL;
+
+                       if (vhd->bat.bat[blk] != DD_BLK_UNUSED) {
+                               err = vhd_read_bitmap(vhd, blk, &buf);
+                               if (err)
+                                       goto out;
+                       }
+               }
+
+               if (vhd->bat.bat[blk] == DD_BLK_UNUSED)
+                       bit = 0;
+               else
+                       /*
+                        * Test the bit belonging to this sector within the
+                        * block; indexing by the block number would report
+                        * the same bit for every sector of a block.
+                        */
+                       bit = vhd_bitmap_test(vhd, buf, sec);
+
+               printf("block %s: ", conv(hex, blk));
+               printf("sec: %s: %d\n", conv(hex, sec), bit);
+       }
+
+       err = 0;
+ out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Load the batmap with vhd_get_batmap() and write its raw map
+ * (header.batmap_size sectors) to stdout.
+ *
+ * Returns 0, or the error from vhd_get_batmap().
+ */
+static int
+vhd_print_batmap(vhd_context_t *vhd)
+{
+       size_t nbytes;
+       int rc = vhd_get_batmap(vhd);
+
+       if (rc != 0) {
+               printf("failed to read batmap: %d\n", rc);
+               return rc;
+       }
+
+       nbytes = vhd_sectors_to_bytes(vhd->batmap.header.batmap_size);
+       write_full(STDOUT_FILENO, vhd->batmap.map, nbytes);
+
+       return 0;
+}
+
+/*
+ * Print, on stderr, the vhd_batmap_test() result for each of count blocks
+ * starting at block.
+ *
+ * Returns 0, -ERANGE when check_block_range() rejects the range, or the
+ * error from vhd_get_batmap().
+ */
+static int
+vhd_test_batmap(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+       uint64_t blk;
+       int n, rc;
+
+       if (check_block_range(vhd, block + count, hex) != 0)
+               return -ERANGE;
+
+       rc = vhd_get_batmap(vhd);
+       if (rc != 0) {
+               fprintf(stderr, "failed to get batmap\n");
+               return rc;
+       }
+
+       /* blk advances in step with n, i.e. blk == block + n */
+       for (n = 0, blk = block; n < count; n++, blk++)
+               fprintf(stderr, "batmap for block %s: %d\n", conv(hex, blk),
+                       vhd_batmap_test(vhd, &vhd->batmap, blk));
+
+       return 0;
+}
+
+/*
+ * Write the raw contents (header.block_size bytes) of each of count blocks
+ * starting at block to stdout.  Blocks whose BAT entry is DD_BLK_UNUSED
+ * only produce a "not allocated" message.
+ *
+ * Returns 0, -ERANGE when check_block_range() rejects the range, or the
+ * error from vhd_read_block().
+ */
+static int
+vhd_print_data(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+       char *data;
+       uint64_t blk;
+       int n, rc;
+
+       if (check_block_range(vhd, block + count, hex) != 0)
+               return -ERANGE;
+
+       for (n = 0; n < count; n++) {
+               blk = block + n;
+
+               if (vhd->bat.bat[blk] != DD_BLK_UNUSED) {
+                       rc = vhd_read_block(vhd, blk, &data);
+                       if (rc)
+                               return rc;
+
+                       write_full(STDOUT_FILENO, data,
+                                  vhd->header.block_size);
+                       free(data);
+               } else {
+                       printf("block %s not allocated\n", conv(hex, blk));
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Read count sectors starting at sec through vhd_io_read() and write them
+ * to stdout, working in chunks of at most VHD_BLOCK_SIZE bytes.  The hex
+ * argument is accepted but not used.
+ *
+ * Returns 0, -ERANGE when the range ends beyond footer.curr_size, the
+ * negated posix_memalign() error, or the error from vhd_io_read().
+ */
+static int
+vhd_read_data(vhd_context_t *vhd, uint64_t sec, int count, int hex)
+{
+       char *chunk;
+       uint64_t pos;
+       int rc, chunk_bytes, chunk_secs, n;
+
+       if (vhd_sectors_to_bytes(sec + count) > vhd->footer.curr_size)
+               return -ERANGE;
+
+       chunk_bytes = MIN(vhd_sectors_to_bytes(count), VHD_BLOCK_SIZE);
+
+       rc = posix_memalign((void **)&chunk, VHD_SECTOR_SIZE, chunk_bytes);
+       if (rc)
+               return -rc;     /* posix_memalign returns a positive errno */
+
+       chunk_secs = chunk_bytes >> VHD_SECTOR_SHIFT;
+
+       /* rc is 0 here, so a zero count returns success */
+       for (pos = sec; count; pos += n, count -= n) {
+               n  = MIN(chunk_secs, count);
+               rc = vhd_io_read(vhd, chunk, pos, n);
+               if (rc)
+                       break;
+
+               write_full(STDOUT_FILENO, chunk, vhd_sectors_to_bytes(n));
+       }
+
+       free(chunk);
+       return rc;
+}
+
+/*
+ * vhd_util_read: inspect the VHD named by -n.
+ *
+ * Options (see the usage text at the end of the function):
+ *   -p print headers, -t translate a logical sector, -b print BAT entries,
+ *   -m print block bitmaps, -i test bitmap bits, -a print the batmap,
+ *   -j test batmap entries, -d print block data, -r read sectors,
+ *   -c number of units for the above (default 1), -x hexadecimal output.
+ *
+ * Requested operations run in the fixed order below; the first failure
+ * stops processing and is returned.  If the image cannot be opened, its
+ * raw footer and header are dumped via vhd_dump_headers() and the open
+ * error is returned.  Bad usage (including -h) returns positive EINVAL.
+ */
+int
+vhd_util_read(int argc, char **argv)
+{
+       char *name;
+       vhd_context_t vhd;
+       int c, err, headers, hex;
+       uint64_t bat, bitmap, tbitmap, batmap, tbatmap, data, lsec, count, read;
+
+       /* -1 (all bits set) marks "operation not requested" */
+       err     = 0;
+       hex     = 0;
+       headers = 0;
+       count   = 1;
+       bat     = -1;
+       bitmap  = -1;
+       tbitmap = -1;
+       batmap  = -1;
+       tbatmap = -1;
+       data    = -1;
+       lsec    = -1;
+       read    = -1;
+       name    = NULL;
+
+       if (!argc || !argv)
+               goto usage;
+
+       /* restart getopt so this sub-command can parse its own argv */
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:pt:b:m:i:aj:d:c:r:xh")) != -1) {
+               switch(c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'p':
+                       headers = 1;
+                       break;
+               case 't':
+                       lsec = strtoul(optarg, NULL, 10);
+                       break;
+               case 'b':
+                       bat = strtoull(optarg, NULL, 10);
+                       break;
+               case 'm':
+                       bitmap = strtoull(optarg, NULL, 10);
+                       break;
+               case 'i':
+                       tbitmap = strtoul(optarg, NULL, 10);
+                       break;
+               case 'a':
+                       batmap = 1;
+                       break;
+               case 'j':
+                       tbatmap = strtoull(optarg, NULL, 10);
+                       break;
+               case 'd':
+                       data = strtoull(optarg, NULL, 10);
+                       break;
+               case 'r':
+                       read = strtoull(optarg, NULL, 10);
+                       break;
+               case 'c':
+                       count = strtoul(optarg, NULL, 10);
+                       break;
+               case 'x':
+                       hex = 1;
+                       break;
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       if (!name || optind != argc)
+               goto usage;
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+       if (err) {
+               printf("Failed to open %s: %d\n", name, err);
+               vhd_dump_headers(name, hex);
+               return err;
+       }
+
+       err = vhd_get_bat(&vhd);
+       if (err) {
+               printf("Failed to get bat for %s: %d\n", name, err);
+               goto out;
+       }
+
+       if (headers) {
+               /* propagate a failure like every other operation does */
+               err = vhd_print_headers(&vhd, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (lsec != -1) {
+               err = vhd_print_logical_to_physical(&vhd, lsec, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (bat != -1) {
+               err = vhd_print_bat(&vhd, bat, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (bitmap != -1) {
+               err = vhd_print_bitmap(&vhd, bitmap, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (tbitmap != -1) {
+               err = vhd_test_bitmap(&vhd, tbitmap, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (batmap != -1) {
+               err = vhd_print_batmap(&vhd);
+               if (err)
+                       goto out;
+       }
+
+       if (tbatmap != -1) {
+               err = vhd_test_batmap(&vhd, tbatmap, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (data != -1) {
+               err = vhd_print_data(&vhd, data, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (read != -1) {
+               err = vhd_read_data(&vhd, read, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       err = 0;
+
+ out:
+       vhd_close(&vhd);
+       return err;
+
+ usage:
+       printf("options:\n"
+              "-h          help\n"
+              "-n          name\n"
+              "-p          print VHD headers\n"
+              "-t sec      translate logical sector to VHD location\n"
+              "-b blk      print bat entry\n"
+              "-m blk      print bitmap\n"
+              "-i sec      test bitmap for logical sector\n"
+              "-a          print batmap\n"
+              "-j blk      test batmap for block\n"
+              "-d blk      print data\n"
+              "-c num      num units\n"
+              "-r sec      read num sectors at sec\n"
+              "-x          print in hex\n");
+       return EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-repair.c b/tools/blktap2/vhd/lib/vhd-util-repair.c
new file mode 100644 (file)
index 0000000..14ded81
--- /dev/null
@@ -0,0 +1,84 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * 'repair' sub-command: locate the end of data in the vhd named by -n
+ * and write the footer there.
+ *
+ * returns 0 on success, -EINVAL after printing usage, or the error
+ * reported by the failing libvhd call.
+ */
+int
+vhd_util_repair(int argc, char **argv)
+{
+       vhd_context_t vhd;
+       off_t eof;
+       char *name = NULL;
+       int err, c;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       for (;;) {
+               c = getopt(argc, argv, "n:h");
+               if (c == -1)
+                       break;
+
+               /* -n is the only option that does not end in usage */
+               if (c != 'n')
+                       goto usage;
+
+               name = optarg;
+       }
+
+       if (optind != argc || !name)
+               goto usage;
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       err = vhd_end_of_data(&vhd, &eof);
+       if (!err)
+               err = vhd_write_footer_at(&vhd, &vhd.footer, eof);
+       else
+               printf("error finding end of data: %d\n", err);
+
+       vhd_close(&vhd);
+       return err;
+
+usage:
+       printf("options: <-n name> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-resize.c b/tools/blktap2/vhd/lib/vhd-util-resize.c
new file mode 100644 (file)
index 0000000..c8a9528
--- /dev/null
@@ -0,0 +1,1131 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syslog.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+
+#include "libvhd-journal.h"
+
+/* debug output: printed to stdout while the '#if 1' branch is selected,
+ * compiled out otherwise */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf(stdout, _f, ##_a)
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* error output: logs to syslog at LOG_INFO, prefixed with the name of
+ * the calling function, and echoes the same message through DFPRINTF */
+#define EPRINTF(_f, _a...)                                     \
+       do {                                                    \
+               syslog(LOG_INFO, "%s: " _f, __func__, ##_a);    \
+               DFPRINTF(_f, _a);                               \
+       } while (0)
+
+/* a bat slot paired with its contents: @block is the index into the bat
+ * and @offset is the value stored in that bat entry */
+typedef struct vhd_block {
+       uint32_t block;
+       uint32_t offset;
+} vhd_block_t;
+
+TEST_FAIL_EXTERN_VARS;
+
+/* number of whole blocks covered by @secs sectors (remainder dropped) */
+static inline uint32_t
+secs_to_blocks_down(vhd_context_t *vhd, uint64_t secs)
+{
+       uint64_t whole_blocks = secs / vhd->spb;
+
+       return whole_blocks;
+}
+
+/* number of blocks needed to hold @secs sectors (a partial block counts) */
+static uint32_t
+secs_to_blocks_up(vhd_context_t *vhd, uint64_t secs)
+{
+       uint32_t blocks = secs / vhd->spb;
+
+       return (secs % vhd->spb) ? blocks + 1 : blocks;
+}
+
+/*
+ * shrink a fixed vhd by @secs sectors: truncate the file to the reduced
+ * size, record that size in the footer and write the footer back out.
+ *
+ * returns 0 on success, -EINVAL if the result would not be larger than
+ * a footer, or a negative errno value on failure.
+ */
+static int
+vhd_fixed_shrink(vhd_journal_t *journal, uint64_t secs)
+{
+       int err;
+       uint64_t new_eof;
+       vhd_context_t *vhd;
+
+       vhd = &journal->vhd;
+
+       new_eof = vhd->footer.curr_size - vhd_sectors_to_bytes(secs);
+       if (new_eof <= sizeof(vhd_footer_t))
+               return -EINVAL;
+
+       err = ftruncate(vhd->fd, new_eof);
+       if (err)
+               /* negative, like every other errno return in this file */
+               return -errno;
+
+       vhd->footer.curr_size = new_eof;
+       return vhd_write_footer(vhd, &vhd->footer);
+}
+
+/*
+ * write @size bytes of zeros starting at byte offset @off.
+ *
+ * the zeros come from a read-only anonymous mapping of at most
+ * VHD_BLOCK_SIZE bytes that is written out repeatedly.
+ * returns 0 on success or the error from the seek, mmap or write.
+ */
+static int
+vhd_write_zeros(vhd_journal_t *journal, off_t off, uint64_t size)
+{
+       vhd_context_t *vhd = &journal->vhd;
+       uint64_t chunk, map_len;
+       char *zeros;
+       int err;
+
+       map_len = MIN(size, VHD_BLOCK_SIZE);
+
+       err = vhd_seek(vhd, off, SEEK_SET);
+       if (err)
+               return err;
+
+       zeros = mmap(0, map_len, PROT_READ, MAP_SHARED | MAP_ANON, -1, 0);
+       if (zeros == MAP_FAILED)
+               return -errno;
+
+       do {
+               chunk = MIN(size, map_len);
+               err   = vhd_write(vhd, zeros, chunk);
+               if (!err)
+                       size -= chunk;
+       } while (!err && size);
+
+       munmap(zeros, map_len);
+       return err;
+}
+
+/*
+ * grow a fixed vhd by @secs sectors: zero-fill starting where the
+ * current footer begins, seek to the new end of file, then bump the
+ * size in the footer and write it out.
+ *
+ * returns 0 on success or the first error encountered.
+ */
+static int
+vhd_fixed_grow(vhd_journal_t *journal, uint64_t secs)
+{
+       vhd_context_t *vhd = &journal->vhd;
+       uint64_t size      = vhd_sectors_to_bytes(secs);
+       uint64_t eof;
+       int err;
+
+       err = vhd_seek(vhd, 0, SEEK_END);
+       if (err)
+               return err;
+
+       eof = vhd_position(vhd);
+       if (eof == (off_t)-1)
+               return -errno;
+
+       err = vhd_write_zeros(journal, eof - sizeof(vhd_footer_t), size);
+       if (err)
+               return err;
+
+       err = vhd_seek(vhd, eof + size, SEEK_SET);
+       if (err)
+               return err;
+
+       vhd->footer.curr_size += size;
+       return vhd_write_footer(vhd, &vhd->footer);
+}
+
+/*
+ * resize a fixed vhd; @size is shifted left by (20 - VHD_SECTOR_SHIFT)
+ * to obtain the target sector count.  nothing is done when the sector
+ * count is unchanged.
+ */
+static int
+vhd_fixed_resize(vhd_journal_t *journal, uint64_t size)
+{
+       vhd_context_t *vhd = &journal->vhd;
+       uint64_t cur_secs  = vhd->footer.curr_size >> VHD_SECTOR_SHIFT;
+       uint64_t new_secs  = size << (20 - VHD_SECTOR_SHIFT);
+
+       if (cur_secs > new_secs)
+               return vhd_fixed_shrink(journal, cur_secs - new_secs);
+
+       if (cur_secs < new_secs)
+               return vhd_fixed_grow(journal, new_secs - cur_secs);
+
+       return 0;
+}
+
+/* exchange entries @a and @b of @list */
+static inline void
+swap(vhd_block_t *list, int a, int b)
+{
+       vhd_block_t *first  = list + a;
+       vhd_block_t *second = list + b;
+       vhd_block_t saved   = *first;
+
+       *first  = *second;
+       *second = saved;
+}
+
+/*
+ * partition list[left..right] around the offset of list[pidx]: entries
+ * with an offset >= the pivot end up before it (descending order).
+ * returns the final index of the pivot.
+ */
+static int
+partition(vhd_block_t *list, int left, int right, int pidx)
+{
+       long long pivot = list[pidx].offset;
+       int store       = left;
+       int cur;
+
+       /* park the pivot at the right edge while the rest is split */
+       swap(list, pidx, right);
+
+       cur = left;
+       while (cur < right) {
+               if (list[cur].offset >= pivot)
+                       swap(list, store++, cur);
+               cur++;
+       }
+
+       /* drop the pivot into its final slot */
+       swap(list, right, store);
+       return store;
+}
+
+/* sort list[left..right] by descending offset; the leftmost entry is
+ * always chosen as pivot */
+static void
+quicksort(vhd_block_t *list, int left, int right)
+{
+       if (left <= right) {
+               int mid = partition(list, left, right, left);
+
+               quicksort(list, left, mid - 1);
+               quicksort(list, mid + 1, right);
+       }
+}
+
+/*
+ * relocate block @src to byte offset @offset: its bitmap is written at
+ * @offset and its data directly after the bitmap.  the in-memory bat
+ * entry is then pointed at the new location and the region the block
+ * used to occupy (bitmap + data) is overwritten with zeros.
+ *
+ * returns -EINVAL if @src has no block allocated, otherwise 0 or the
+ * first error reported by a helper.
+ */
+static int
+vhd_move_block(vhd_journal_t *journal, uint32_t src, off_t offset)
+{
+       int err;
+       char *buf;
+       size_t size;
+       vhd_context_t *vhd;
+       off_t off, src_off;
+
+       buf     = NULL;
+       vhd     = &journal->vhd;
+       off     = offset;
+       size    = vhd_sectors_to_bytes(vhd->bm_secs);
+       src_off = vhd->bat.bat[src];
+
+       if (src_off == DD_BLK_UNUSED)
+               return -EINVAL;
+       src_off = vhd_sectors_to_bytes(src_off);
+
+       /* journal the block before anything on disk is touched */
+       err  = vhd_journal_add_block(journal, src,
+                                    VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA);
+       if (err)
+               goto out;
+
+       /* step 1: copy the bitmap to the new location */
+       err  = vhd_read_bitmap(vhd, src, &buf);
+       if (err)
+               goto out;
+
+       err  = vhd_seek(vhd, off, SEEK_SET);
+       if (err)
+               goto out;
+
+       err  = vhd_write(vhd, buf, size);
+       if (err)
+               goto out;
+
+       /* step 2: copy the data, placed right behind the bitmap */
+       free(buf);
+       buf   = NULL;
+       off  += size;
+       size  = vhd_sectors_to_bytes(vhd->spb);
+
+       err  = vhd_read_block(vhd, src, &buf);
+       if (err)
+               goto out;
+
+       err  = vhd_seek(vhd, off, SEEK_SET);
+       if (err)
+               goto out;
+
+       err  = vhd_write(vhd, buf, size);
+       if (err)
+               goto out;
+
+       /* bat entries hold sector offsets, @offset is in bytes */
+       vhd->bat.bat[src] = offset >> VHD_SECTOR_SHIFT;
+
+       /* step 3: wipe the old copy (bitmap and data) */
+       err = vhd_write_zeros(journal, src_off,
+                             vhd_sectors_to_bytes(vhd->bm_secs + vhd->spb));
+
+out:
+       free(buf);
+       return err;
+}
+
+/*
+ * overwrite block @dest with block @src: @dest is journaled, @src is
+ * moved to the file location @dest occupied, and the bat entry of
+ * @dest is marked unused.
+ */
+static int
+vhd_clobber_block(vhd_journal_t *journal, uint32_t src, uint32_t dest)
+{
+       vhd_context_t *vhd = &journal->vhd;
+       off_t target       = vhd_sectors_to_bytes(vhd->bat.bat[dest]);
+       int err;
+
+       err = vhd_journal_add_block(journal, dest,
+                                   VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA);
+       if (!err)
+               err = vhd_move_block(journal, src, target);
+       if (err)
+               return err;
+
+       vhd->bat.bat[dest] = DD_BLK_UNUSED;
+       return 0;
+}
+
+/*
+ * remove a list of blocks from the vhd file
+ * if a block to be removed:
+ *   - resides at the end of the file: simply clear its bat entry
+ *   - resides elsewhere: move the last block in the file into its position
+ *                        and update the bat to reflect this
+ *
+ * @original_free_list is copied before sorting, so the caller's list is
+ * left untouched.  returns 0 on success, -ENOMEM if a work list cannot
+ * be allocated, or the error from vhd_clobber_block().
+ */
+static int
+vhd_defrag_shrink(vhd_journal_t *journal,
+                 vhd_block_t *original_free_list, int free_cnt)
+{
+       vhd_context_t *vhd;
+       int i, j, free_idx, err, at_end;
+       vhd_block_t *blocks, *free_list;
+
+       err       = 0;
+       blocks    = NULL;
+       free_list = NULL;
+       vhd       = &journal->vhd;
+
+       blocks = malloc(vhd->bat.entries * sizeof(vhd_block_t));
+       if (!blocks) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       free_list = malloc(free_cnt * sizeof(vhd_block_t));
+       if (!free_list) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < vhd->bat.entries; i++) {
+               blocks[i].block  = i;
+               blocks[i].offset = vhd->bat.bat[i];
+       }
+
+       memcpy(free_list, original_free_list,
+              free_cnt * sizeof(vhd_block_t));
+
+       /* sort both the to-free list and the bat list
+        * in order of descending file offset */
+       quicksort(free_list, 0, free_cnt - 1);
+       quicksort(blocks, 0, vhd->bat.entries - 1);
+
+       for (i = 0, free_idx = 0;
+            i < vhd->bat.entries && free_idx < free_cnt; i++) {
+               vhd_block_t *b = blocks + i;
+
+               if (b->offset == DD_BLK_UNUSED)
+                       continue;
+
+               at_end = 0;
+               for (j = free_idx; j < free_cnt; j++)
+                       if (b->block == free_list[j].block) {
+                               /* the last block in the file is in the list of
+                                * blocks to remove; no need to shuffle the
+                                * data -- just clear the bat entry */
+                               vhd->bat.bat[free_list[j].block] = DD_BLK_UNUSED;
+                               free_idx++;
+                               at_end = 1;
+                               break;
+                       }
+
+               /* a 'continue' inside the scan above only advances that
+                * scan; skip the move for this block explicitly so a
+                * block whose entry was just cleared is never used as
+                * the source of a move */
+               if (at_end)
+                       continue;
+
+               err = vhd_clobber_block(journal, b->block,
+                                       free_list[free_idx++].block);
+               if (err)
+                       goto out;
+       }
+
+       /* clear any bat entries for blocks we did not shuffle */
+       for (i = free_idx; i < free_cnt; i++)
+               vhd->bat.bat[free_list[i].block] = DD_BLK_UNUSED;
+
+out:
+       free(blocks);
+       free(free_list);
+
+       return err;
+}
+
+/*
+ * drop the last @entries entries from the bat.
+ *
+ * rewrites, in this order: the header (max_bat_size), the footer
+ * (curr_size and geometry), the bat with the dropped entries set to 0
+ * and, if the vhd has a batmap, the batmap with the matching bits
+ * cleared.  returns 0 on success or the first error from a helper.
+ */
+static int
+vhd_clear_bat_entries(vhd_journal_t *journal, uint32_t entries)
+{
+       int i, err;
+       vhd_context_t *vhd;
+       off_t orig_map_off, new_map_off;
+       uint32_t orig_entries, new_entries;
+
+       vhd          = &journal->vhd;
+       orig_entries = vhd->header.max_bat_size;
+       new_entries  = orig_entries - entries;
+
+       /* remember where the batmap header sits before the bat shrinks;
+        * orig_map_off is only set (and only read) when a batmap exists */
+       if (vhd_has_batmap(vhd)) {
+               err = vhd_batmap_header_offset(vhd, &orig_map_off);
+               if (err)
+                       return err;
+       }
+
+       /* update header */
+       vhd->header.max_bat_size = new_entries;
+       err = vhd_write_header(vhd, &vhd->header);
+       if (err)
+               return err;
+
+       /* update footer */
+       vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size;
+       vhd->footer.geometry  = vhd_chs(vhd->footer.curr_size);
+       err = vhd_write_footer(vhd, &vhd->footer);
+       if (err)
+               return err;
+
+       /* update bat -- we don't reclaim space, just clear entries */
+       for (i = new_entries; i < orig_entries; i++)
+               vhd->bat.bat[i] = 0;
+
+       err = vhd_write_bat(vhd, &vhd->bat);
+       if (err)
+               return err;
+
+       /* update this after write_bat so the end of the bat is zeroed */
+       vhd->bat.entries = new_entries;
+
+       if (!vhd_has_batmap(vhd))
+               return 0;
+
+       /* zero out old batmap header if new header has moved */
+       err = vhd_batmap_header_offset(vhd, &new_map_off);
+       if (err)
+               return err;
+
+       if (orig_map_off != new_map_off) {
+               size_t size;
+
+               size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+               err = vhd_write_zeros(journal, orig_map_off, size);
+               if (err)
+                       return err;
+       }
+
+       /* update batmap -- clear entries for freed blocks */
+       for (i = new_entries; i < orig_entries; i++)
+               vhd_batmap_clear(vhd, &vhd->batmap, i);
+
+       err = vhd_write_batmap(vhd, &vhd->batmap);
+       if (err)
+               return err;
+
+       return 0;
+}
+
+/*
+ * shrink a dynamic vhd by @secs sectors.
+ *
+ * NOTE: currently disabled -- the function prints a message and returns
+ * -ENOSYS unconditionally, so everything after that return is
+ * unreachable.  the unreachable part collects the allocated blocks
+ * among the trailing bat entries, hands them to vhd_defrag_shrink(),
+ * drops the bat entries and truncates the file after the footer.
+ */
+static int
+vhd_dynamic_shrink(vhd_journal_t *journal, uint64_t secs)
+{
+       off_t eof;
+       uint32_t blocks;
+       vhd_context_t *vhd;
+       int i, j, err, free_cnt;
+       struct vhd_block *free_list;
+
+       printf("dynamic shrink not fully implemented\n");
+       return -ENOSYS;
+
+       /* --- unreachable from here on --- */
+       eof       = 0;
+       free_cnt  = 0;
+       free_list = NULL;
+       vhd       = &journal->vhd;
+
+       blocks    = secs_to_blocks_down(vhd, secs);
+       if (blocks == 0)
+               return 0;
+
+       if (vhd_has_batmap(vhd)) {
+               err = vhd_get_batmap(vhd);
+               if (err)
+                       return err;
+       }
+
+       free_list = malloc(blocks * sizeof(struct vhd_block));
+       if (!free_list)
+               return -ENOMEM;
+
+       /* walk the last @blocks bat entries, keeping the allocated ones */
+       for (i = vhd->bat.entries - 1, j = 0; i >= 0 && j < blocks; i--, j++) {
+               uint32_t blk = vhd->bat.bat[i];
+
+               if (blk != DD_BLK_UNUSED) {
+                       free_list[free_cnt].block  = i;
+                       free_list[free_cnt].offset = blk;
+                       free_cnt++;
+               }
+       }
+
+       if (free_cnt) {
+               err = vhd_defrag_shrink(journal, free_list, free_cnt);
+               if (err)
+                       goto out;
+       }
+
+       err = vhd_clear_bat_entries(journal, blocks);
+       if (err)
+               goto out;
+
+       /* remove data beyond footer */
+       err = vhd_end_of_data(vhd, &eof);
+       if (err)
+               goto out;
+
+       err = ftruncate(vhd->fd, eof + sizeof(vhd_footer_t));
+       if (err) {
+               err = -errno;
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       free(free_list);
+       return err;
+}
+
+/*
+ * find the allocated block with the lowest bat value and store its bat
+ * index and value in @block.  @block is left zeroed when no block is
+ * allocated.
+ */
+static inline void
+vhd_first_data_block(vhd_context_t *vhd, vhd_block_t *block)
+{
+       int idx;
+
+       memset(block, 0, sizeof(vhd_block_t));
+
+       for (idx = 0; idx < vhd->bat.entries; idx++) {
+               uint32_t cur = vhd->bat.bat[idx];
+
+               if (cur == DD_BLK_UNUSED)
+                       continue;
+
+               /* keep the current candidate unless this one is lower */
+               if (block->offset && cur >= block->offset)
+                       continue;
+
+               block->block  = idx;
+               block->offset = cur;
+       }
+}
+
+/*
+ * return the first sector past every allocated block, i.e. the highest
+ * value of (bat entry + data sectors + bitmap sectors) over all
+ * allocated blocks, or 0 when nothing is allocated.
+ */
+static inline uint32_t
+vhd_next_block_offset(vhd_context_t *vhd)
+{
+       int i;
+       uint32_t blk, end, next;
+
+       next = 0;
+
+       for (i = 0; i < vhd->bat.entries; i++) {
+               blk = vhd->bat.bat[i];
+
+               if (blk != DD_BLK_UNUSED) {
+                       end  = blk + vhd->spb + vhd->bm_secs;
+                       next = MAX(next, end);
+               }
+       }
+
+       return next;
+}
+
+/* non-zero when @off lies strictly after @start and before @start + @size;
+ * @start itself does not count as inside */
+static inline int
+in_range(off_t off, off_t start, off_t size)
+{
+       if (off <= start)
+               return 0;
+
+       return off < start + size;
+}
+
+#define SKIP_HEADER 0x01
+#define SKIP_BAT    0x02
+#define SKIP_BATMAP 0x04
+#define SKIP_PLOC   0x08
+#define SKIP_DATA   0x10
+
+/* bits of @type that are set in @mode; non-zero means "skip this check" */
+static inline int
+skip_check(int mode, int type)
+{
+       int requested = mode & type;
+
+       return requested;
+}
+
+/*
+ * check whether byte offset @off falls inside existing content of a
+ * dynamic vhd: the first sector ("backup footer"), the header, the bat,
+ * the batmap, a parent locator, or the lowest allocated data block.
+ *
+ * @mode is a mask of SKIP_* flags naming the regions to leave unchecked;
+ * the first-sector check cannot be skipped.  region tests go through
+ * in_range(), which does not treat a region's start offset as inside.
+ *
+ * returns 0 if nothing is hit (always 0 for a non-dynamic vhd), or
+ * -EINVAL after logging which region would be clobbered.
+ */
+static int
+vhd_check_for_clobber(vhd_context_t *vhd, off_t off, int mode)
+{
+       int i, n;
+       char *msg;
+       size_t size;
+       vhd_block_t fb;
+       vhd_parent_locator_t *loc;
+
+       msg = NULL;
+
+       if (!vhd_type_dynamic(vhd))
+               return 0;
+
+       if (off < VHD_SECTOR_SIZE) {
+               msg = "backup footer";
+               goto fail;
+       }
+
+       if (!skip_check(mode, SKIP_HEADER))
+               if (in_range(off,
+                            vhd->footer.data_offset, sizeof(vhd_header_t))) {
+                       msg = "header";
+                       goto fail;
+               }
+
+       if (!skip_check(mode, SKIP_BAT))
+               if (in_range(off, vhd->header.table_offset,
+                            vhd_bytes_padded(vhd->header.max_bat_size *
+                                             sizeof(uint32_t)))) {
+                       msg = "bat";
+                       goto fail;
+               }
+
+       if (!skip_check(mode, SKIP_BATMAP))
+               if (vhd_has_batmap(vhd) &&
+                   in_range(off, vhd->batmap.header.batmap_offset,
+                            vhd_bytes_padded(vhd->batmap.header.batmap_size))) {
+                       msg = "batmap";
+                       goto fail;
+               }
+
+       if (!skip_check(mode, SKIP_PLOC)) {
+               n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+               for (i = 0; i < n; i++) {
+                       loc = vhd->header.loc + i;
+                       if (loc->code == PLAT_CODE_NONE)
+                               continue;
+
+                       size = vhd_parent_locator_size(loc);
+                       if (in_range(off, loc->data_offset, size)) {
+                               msg = "parent locator";
+                               goto fail;
+                       }
+               }
+       }
+
+       /* only the lowest allocated block is tested; fb.offset stays 0
+        * when no block is allocated */
+       if (!skip_check(mode, SKIP_DATA)) {
+               vhd_first_data_block(vhd, &fb);
+               if (fb.offset && in_range(off,
+                                         vhd_sectors_to_bytes(fb.offset),
+                                         VHD_BLOCK_SIZE)) {
+                       msg = "data block";
+                       goto fail;
+               }
+       }
+
+       return 0;
+
+fail:
+       /* NOTE(review): PRIx64 is used with an off_t argument -- this
+        * assumes off_t is 64 bits wide; confirm for 32-bit builds */
+       EPRINTF("write to 0x%08"PRIx64" would clobber %s\n", off, msg);
+       return -EINVAL;
+}
+
+/*
+ * take any metadata after the bat (@eob) and shift it
+ *
+ * every parent locator stored at or beyond @eob is read into memory and
+ * rewritten @bat_needed + @map_needed bytes further into the file; the
+ * in-memory header is updated with the new offsets.  a batmap located
+ * after @eob has its in-memory offset advanced by @bat_needed.  neither
+ * the header nor the batmap is written here -- callers do that after
+ * writing the new bat.
+ *
+ * returns 0 on success, -ENOMEM/-EINVAL or the error from a failing
+ * helper otherwise.  all temporary buffers are released on every path.
+ */
+static int
+vhd_shift_metadata(vhd_journal_t *journal, off_t eob,
+                  size_t bat_needed, size_t map_needed)
+{
+       int i, n, err;
+       vhd_context_t *vhd;
+       size_t size_needed;
+       char *buf, **locators;
+       vhd_parent_locator_t *loc;
+
+       vhd         = &journal->vhd;
+       size_needed = bat_needed + map_needed;
+
+       n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+
+       locators = calloc(n, sizeof(char *));
+       if (!locators)
+               return -ENOMEM;
+
+       /* pass 1: read every locator that has to move */
+       for (i = 0; i < n; i++) {
+               size_t size;
+
+               loc = vhd->header.loc + i;
+               if (loc->code == PLAT_CODE_NONE)
+                       continue;
+
+               if (loc->data_offset < eob)
+                       continue;
+
+               size = vhd_parent_locator_size(loc);
+               err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+               if (err) {
+                       /* posix_memalign returns a positive error number */
+                       err = -err;
+                       goto out;
+               }
+
+               /* hand the buffer to locators[] right away so the cleanup
+                * at 'out' releases it if the seek or read below fails */
+               locators[i] = buf;
+
+               err  = vhd_seek(vhd, loc->data_offset, SEEK_SET);
+               if (err)
+                       goto out;
+
+               err  = vhd_read(vhd, buf, size);
+               if (err)
+                       goto out;
+       }
+
+       /* pass 2: write each locator back at its shifted position */
+       for (i = 0; i < n; i++) {
+               off_t off;
+               size_t size;
+
+               if (!locators[i])
+                       continue;
+
+               loc  = vhd->header.loc + i;
+               off  = loc->data_offset + size_needed;
+               size = vhd_parent_locator_size(loc);
+
+               if (vhd_check_for_clobber(vhd, off + size, SKIP_PLOC)) {
+                       EPRINTF("%s: shifting locator %d would clobber data\n",
+                               vhd->file, i);
+                       /* leave through 'out' so the buffers are freed */
+                       err = -EINVAL;
+                       goto out;
+               }
+
+               err  = vhd_seek(vhd, off, SEEK_SET);
+               if (err)
+                       goto out;
+
+               err  = vhd_write(vhd, locators[i], size);
+               if (err)
+                       goto out;
+
+               free(locators[i]);
+               locators[i]      = NULL;
+               loc->data_offset = off;
+
+               /* write the new header after writing the new bat */
+       }
+
+       if (vhd_has_batmap(vhd) && vhd->batmap.header.batmap_offset > eob) {
+               vhd->batmap.header.batmap_offset += bat_needed;
+
+               /* write the new batmap after writing the new bat */
+       }
+
+       err = 0;
+
+out:
+       for (i = 0; i < n; i++)
+               free(locators[i]);
+       free(locators);
+
+       return err;
+}
+
+/*
+ * append @entries unused entries to the bat.
+ *
+ * after checking that the enlarged bat (and batmap, if present) would
+ * not overwrite existing content, this rewrites the header
+ * (max_bat_size), the footer (curr_size, geometry, checksum), a newly
+ * allocated bat whose extra entries are DD_BLK_UNUSED and, if the vhd
+ * has a batmap, a newly allocated batmap whose extra bytes are zero.
+ * the in-memory bat and batmap are replaced by the new copies.
+ *
+ * returns 0 on success, -EINVAL if existing content would be
+ * clobbered, or a negative errno value on failure.
+ */
+static int
+vhd_add_bat_entries(vhd_journal_t *journal, int entries)
+{
+       int i, err;
+       off_t off;
+       vhd_bat_t new_bat;
+       vhd_context_t *vhd;
+       uint32_t new_entries;
+       vhd_batmap_t new_batmap;
+       uint64_t bat_size, new_bat_size, map_size, new_map_size;
+
+       vhd          = &journal->vhd;
+       new_entries  = vhd->header.max_bat_size + entries;
+
+       bat_size     = vhd_bytes_padded(vhd->header.max_bat_size *
+                                       sizeof(uint32_t));
+       new_bat_size = vhd_bytes_padded(new_entries * sizeof(uint32_t));
+
+       map_size     = vhd_bytes_padded((vhd->header.max_bat_size + 7) >> 3);
+       new_map_size = vhd_bytes_padded((new_entries + 7) >> 3);
+
+       off = vhd->header.table_offset + new_bat_size;
+       if (vhd_check_for_clobber(vhd, off, SKIP_BAT | SKIP_BATMAP)) {
+               EPRINTF("%s: writing new bat of 0x%"PRIx64" bytes "
+                       "at 0x%08"PRIx64" would clobber data\n",
+                       vhd->file, new_bat_size, vhd->header.table_offset);
+               return -EINVAL;
+       }
+
+       if (vhd_has_batmap(vhd)) {
+               off = vhd->batmap.header.batmap_offset + new_map_size;
+               if (vhd_check_for_clobber(vhd, off, 0)) {
+                       EPRINTF("%s: writing new batmap of 0x%"PRIx64" bytes"
+                               " at 0x%08"PRIx64" would clobber data\n", vhd->file,
+                               new_map_size, vhd->batmap.header.batmap_offset);
+                       return -EINVAL;
+               }
+       }
+
+       /* update header */
+       vhd->header.max_bat_size = new_entries;
+       err = vhd_write_header(vhd, &vhd->header);
+       if (err)
+               return err;
+
+       /* update footer */
+       vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size;
+       vhd->footer.geometry  = vhd_chs(vhd->footer.curr_size);
+       vhd->footer.checksum  = vhd_checksum_footer(&vhd->footer);
+       err = vhd_write_footer(vhd, &vhd->footer);
+       if (err)
+               return err;
+
+       /* allocate new bat */
+       err = posix_memalign((void **)&new_bat.bat, VHD_SECTOR_SIZE, new_bat_size);
+       if (err)
+               return -err;
+
+       new_bat.spb     = vhd->bat.spb;
+       new_bat.entries = new_entries;
+       memcpy(new_bat.bat, vhd->bat.bat, bat_size);
+       for (i = vhd->bat.entries; i < new_entries; i++)
+               new_bat.bat[i] = DD_BLK_UNUSED;
+
+       /* write new bat */
+       err = vhd_write_bat(vhd, &new_bat);
+       if (err) {
+               free(new_bat.bat);
+               return err;
+       }
+
+       /* update in-memory bat */
+       free(vhd->bat.bat);
+       vhd->bat = new_bat;
+
+       if (!vhd_has_batmap(vhd))
+               return 0;
+
+       /* allocate new batmap */
+       err = posix_memalign((void **)&new_batmap.map,
+                            VHD_SECTOR_SIZE, new_map_size);
+       if (err)
+               /* posix_memalign returns a positive error number; negate
+                * it like the bat allocation above does */
+               return -err;
+
+       new_batmap.header = vhd->batmap.header;
+       new_batmap.header.batmap_size = secs_round_up_no_zero(new_map_size);
+       memcpy(new_batmap.map, vhd->batmap.map, map_size);
+       memset(new_batmap.map + map_size, 0, new_map_size - map_size);
+
+       /* write new batmap */
+       err = vhd_write_batmap(vhd, &new_batmap);
+       if (err) {
+               free(new_batmap.map);
+               return err;
+       }
+
+       /* update in-memory batmap */
+       free(vhd->batmap.map);
+       vhd->batmap = new_batmap;
+
+       return 0;
+}
+
+/*
+ * grow a dynamic vhd by @secs sectors (rounded up to whole blocks).
+ *
+ * if the padding of the current bat (and batmap) has room for the new
+ * entries they are simply added.  otherwise data blocks that sit too
+ * close to the metadata are moved to the end of the file, metadata
+ * located after the bat is shifted, and only then are the entries added.
+ *
+ * returns 0 on success or the first error from a helper.
+ */
+static int
+vhd_dynamic_grow(vhd_journal_t *journal, uint64_t secs)
+{
+       int i, err;
+       off_t eob, eom;
+       vhd_context_t *vhd;
+       vhd_block_t first_block;
+       uint64_t blocks, size_needed;
+       uint64_t bat_needed, bat_size, bat_avail, bat_bytes, bat_secs;
+       uint64_t map_needed, map_size, map_avail, map_bytes, map_secs;
+
+       vhd         = &journal->vhd;
+
+       size_needed = 0;
+       bat_needed  = 0;
+       map_needed  = 0;
+
+       /* number of vhd blocks to add */
+       blocks      = secs_to_blocks_up(vhd, secs);
+
+       /* size in bytes needed for new bat entries */
+       bat_needed  = blocks * sizeof(uint32_t);
+       map_needed  = (blocks >> 3) + 1;
+
+       /* available bytes in current bat */
+       bat_bytes   = vhd->header.max_bat_size * sizeof(uint32_t);
+       bat_secs    = secs_round_up_no_zero(bat_bytes);
+       bat_size    = vhd_sectors_to_bytes(bat_secs);
+       bat_avail   = bat_size - bat_bytes;
+
+       if (vhd_has_batmap(vhd)) {
+               /* available bytes in current batmap */
+               map_bytes   = (vhd->header.max_bat_size + 7) >> 3;
+               map_secs    = vhd->batmap.header.batmap_size;
+               map_size    = vhd_sectors_to_bytes(map_secs);
+               map_avail   = map_size - map_bytes;
+       } else {
+               map_needed  = 0;
+               map_avail   = 0;
+       }
+
+       /* we have enough space already; just extend the bat */
+       if (bat_needed <= bat_avail && map_needed <= map_avail)
+               goto add_entries;
+
+       /* we need to add new sectors to the bat */
+       if (bat_needed > bat_avail) {
+               bat_needed -= bat_avail;
+               bat_needed  = vhd_bytes_padded(bat_needed);
+       } else
+               bat_needed  = 0;
+
+       /* we need to add new sectors to the batmap */
+       if (map_needed > map_avail) {
+               map_needed -= map_avail;
+               map_needed  = vhd_bytes_padded(map_needed);
+       } else
+               map_needed  = 0;
+
+       /* how many additional bytes do we need? */
+       size_needed = bat_needed + map_needed;
+
+       /* calculate space between end of headers and beginning of data */
+       err = vhd_end_of_headers(vhd, &eom);
+       if (err)
+               return err;
+
+       eob = vhd->header.table_offset + vhd_sectors_to_bytes(bat_secs);
+       vhd_first_data_block(vhd, &first_block);
+
+       /* no blocks allocated; just shift post-bat metadata */
+       if (!first_block.offset)
+               goto shift_metadata;
+
+       /*
+        * not enough space --
+        * move vhd data blocks to the end of the file to make room
+        */
+       do {
+               off_t new_off, bm_size, gap_size;
+
+               new_off = vhd_sectors_to_bytes(vhd_next_block_offset(vhd));
+
+               /* data region of segment should begin on page boundary */
+               bm_size = vhd_sectors_to_bytes(vhd->bm_secs);
+               if ((new_off + bm_size) % 4096) {
+                       gap_size = 4096 - ((new_off + bm_size) % 4096);
+
+                       err = vhd_write_zeros(journal, new_off, gap_size);
+                       if (err)
+                               return err;
+
+                       new_off += gap_size;
+               }
+
+               err = vhd_move_block(journal, first_block.block, new_off);
+               if (err)
+                       return err;
+
+               /* the moved block now lives at the end of the file, so
+                * look up the lowest block again */
+               vhd_first_data_block(vhd, &first_block);
+
+       } while (eom + size_needed >= vhd_sectors_to_bytes(first_block.offset));
+
+       TEST_FAIL_AT(FAIL_RESIZE_DATA_MOVED);
+
+shift_metadata:
+       /* shift any metadata after the bat to make room for new bat sectors */
+       err = vhd_shift_metadata(journal, eob, bat_needed, map_needed);
+       if (err)
+               return err;
+
+       TEST_FAIL_AT(FAIL_RESIZE_METADATA_MOVED);
+
+add_entries:
+       return vhd_add_bat_entries(journal, blocks);
+}
+
+/*
+ * Resize a dynamic VHD so that its virtual size becomes @size megabytes.
+ *
+ * The current size (taken from the footer) and the requested size are both
+ * converted to sectors; when they already match there is nothing to do.
+ * Otherwise the header, the BAT and -- if the image has one -- the batmap
+ * are loaded, and the difference in sectors is handed to the shrink or the
+ * grow helper.
+ *
+ * Returns 0 on success, or the first error reported by a helper.
+ */
+static int
+vhd_dynamic_resize(vhd_journal_t *journal, uint64_t size)
+{
+       vhd_context_t *ctx = &journal->vhd;
+       uint64_t have = ctx->footer.curr_size >> VHD_SECTOR_SHIFT;
+       uint64_t want = size << (20 - VHD_SECTOR_SHIFT);
+       int rc;
+
+       if (want == have)
+               return 0;
+
+       rc = vhd_get_header(ctx);
+       if (rc)
+               return rc;
+
+       rc = vhd_get_bat(ctx);
+       if (rc)
+               return rc;
+
+       if (vhd_has_batmap(ctx)) {
+               rc = vhd_get_batmap(ctx);
+               if (rc)
+                       return rc;
+       }
+
+       if (want < have)
+               return vhd_dynamic_shrink(journal, have - want);
+
+       return vhd_dynamic_grow(journal, want - have);
+}
+
+/*
+ * Open @name read-only in strict mode and verify that vhd_creator_tapdisk()
+ * accepts it; resizing is refused for any other image.
+ *
+ * Returns 0 if the image may be resized, the vhd_open() error if it could
+ * not be opened, or -EINVAL if the creator check fails.
+ */
+static int
+vhd_util_resize_check_creator(const char *name)
+{
+       vhd_context_t vhd;
+       int rc;
+
+       rc = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_STRICT);
+       if (rc) {
+               printf("error opening %s: %d\n", name, rc);
+               return rc;
+       }
+
+       rc = (vhd_creator_tapdisk(&vhd) ? 0 : -EINVAL);
+       if (rc)
+               printf("%s not created by xen; resize not supported\n", name);
+
+       vhd_close(&vhd);
+       return rc;
+}
+
+/*
+ * 'resize' sub-command: parse -n <name>, -j <journal> and -s <size in MB>,
+ * then resize the image while recording changes in a journal.
+ *
+ * Returns -EINVAL on bad usage, the error from the creator check or from
+ * journal creation if either fails, and otherwise the resize error if there
+ * was one or else the result of the final journal commit/revert.
+ */
+int
+vhd_util_resize(int argc, char **argv)
+{
+       char *name, *jname;
+       uint64_t size;
+       int c, err, jerr;
+       vhd_journal_t journal;
+       vhd_context_t *vhd;
+
+       /* err stays -EINVAL until -s is seen, which makes -s mandatory */
+       err   = -EINVAL;
+       size  = 0;
+       name  = NULL;
+       jname = NULL;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:j:s:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'j':
+                       jname = optarg;
+                       break;
+               case 's':
+                       err  = 0;
+                       /* NOTE(review): no end-pointer or errno check, so a
+                        * malformed size is not rejected here */
+                       size = strtoull(optarg, NULL, 10);
+                       break;
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       /* all three options are required and no stray arguments are allowed */
+       if (err || !name || !jname || argc != optind)
+               goto usage;
+
+       err = vhd_util_resize_check_creator(name);
+       if (err)
+               return err;
+
+       libvhd_set_log_level(1);
+       err = vhd_journal_create(&journal, name, jname);
+       if (err) {
+               printf("creating journal failed: %d\n", err);
+               return err;
+       }
+
+       vhd = &journal.vhd;
+
+       err = vhd_get_footer(vhd);
+       if (err)
+               goto out;
+
+       /* NOTE(review): TEST_FAIL_AT is defined outside this chunk; it looks
+        * like a fault-injection hook -- confirm before restructuring */
+       TEST_FAIL_AT(FAIL_RESIZE_BEGIN);
+
+       if (vhd_type_dynamic(vhd))
+               err = vhd_dynamic_resize(&journal, size);
+       else
+               err = vhd_fixed_resize(&journal, size);
+
+       TEST_FAIL_AT(FAIL_RESIZE_END);
+
+out:
+       /* a failed resize is reverted from the journal, a good one committed */
+       if (err) {
+               printf("resize failed: %d\n", err);
+               jerr = vhd_journal_revert(&journal);
+       } else
+               jerr = vhd_journal_commit(&journal);
+
+       /* if the journal step itself failed the journal is only closed, not
+        * removed (presumably so it can still be replayed -- confirm) */
+       if (jerr) {
+               printf("closing journal failed: %d\n", jerr);
+               vhd_journal_close(&journal);
+       } else
+               vhd_journal_remove(&journal);
+
+       /* GNU "?:" shorthand: err if it is non-zero, otherwise jerr */
+       return (err ? : jerr);
+
+usage:
+       printf("options: <-n name> <-j journal> <-s size (in MB)> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-revert.c b/tools/blktap2/vhd/lib/vhd-util-revert.c
new file mode 100644 (file)
index 0000000..dab6e8b
--- /dev/null
@@ -0,0 +1,106 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Altering operations:
+ *
+ * 1. Change the parent pointer to another file.
+ * 2. Change the size of the file containing the VHD image. This does NOT 
+ * affect the VHD disk capacity, only the physical size of the file containing 
+ * the VHD. Naturally, it is not possible to set the file size to be less than
+ * what the VHD utilizes.
+ * The operation doesn't actually change the file size, but it writes the 
+ * footer in the right location such that resizing the file (manually, as a 
+ * separate step) will produce the correct results. If the new file size is 
+ * greater than the current file size, the file must first be expanded and then 
+ * altered with this operation. If the new size is smaller than the current 
+ * size, the VHD must first be altered with this operation and then the file 
+ * must be shrunk. Failing to resize the file will result in a corrupted VHD.
+*/
+
+#include <errno.h>
+//#include <fcntl.h>
+#include <stdio.h>
+//#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+#include "libvhd-journal.h"
+
+/*
+ * 'revert' sub-command: open the journal given by -j for the image given by
+ * -n, revert the image from it and then remove the journal.
+ *
+ * Returns 0 on success, -EINVAL on bad usage, or the error of the journal
+ * step that failed.
+ */
+int
+vhd_util_revert(int argc, char **argv)
+{
+       vhd_journal_t journal;
+       char *image = NULL, *jfile = NULL;
+       int opt, rc;
+
+       optind = 0;
+       for (;;) {
+               opt = getopt(argc, argv, "n:j:h");
+               if (opt == -1)
+                       break;
+
+               if (opt == 'n')
+                       image = optarg;
+               else if (opt == 'j')
+                       jfile = optarg;
+               else
+                       goto usage;     /* -h or an unknown option */
+       }
+
+       /* both options are required and nothing may follow them */
+       if (image == NULL || jfile == NULL || optind != argc)
+               goto usage;
+
+       libvhd_set_log_level(1);
+       rc = vhd_journal_open(&journal, image, jfile);
+       if (rc) {
+               printf("opening journal failed: %d\n", rc);
+               return rc;
+       }
+
+       rc = vhd_journal_revert(&journal);
+       if (rc) {
+               printf("reverting journal failed: %d\n", rc);
+               goto fail_close;
+       }
+
+       rc = vhd_journal_remove(&journal);
+       if (rc) {
+               printf("removing journal failed: %d\n", rc);
+               goto fail_close;
+       }
+
+       return 0;
+
+fail_close:
+       vhd_journal_close(&journal);
+       return rc;
+
+usage:
+       printf("options: <-n name> <-j journal> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-scan.c b/tools/blktap2/vhd/lib/vhd-util-scan.c
new file mode 100644 (file)
index 0000000..e87a7ea
--- /dev/null
@@ -0,0 +1,1317 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <glob.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <libgen.h>    /* for basename() */
+#include <sys/stat.h>
+
+#include "list.h"
+#include "libvhd.h"
+#include "lvm-util.h"
+
+/* Bits of the file-scope 'flags' word that select scan behaviour. */
+#define VHD_SCAN_FAST        0x01 /* open with VHD_OPEN_FAST; decode parent from header first */
+#define VHD_SCAN_PRETTY      0x02 /* collect images and print them as a parent/child tree */
+#define VHD_SCAN_VOLUME      0x04
+#define VHD_SCAN_NOFAIL      0x08
+#define VHD_SCAN_VERBOSE     0x10 /* print full names instead of basename() */
+#define VHD_SCAN_PARENTS     0x20
+
+/* Values stored in the 'type' field of struct target. */
+#define VHD_TYPE_RAW_FILE    0x01
+#define VHD_TYPE_VHD_FILE    0x02
+#define VHD_TYPE_RAW_VOLUME  0x04
+#define VHD_TYPE_VHD_VOLUME  0x08
+
+/* Returns 1 if @type is one of the two volume types, 0 otherwise. */
+static inline int
+target_volume(uint8_t type)
+{
+       switch (type) {
+       case VHD_TYPE_RAW_VOLUME:
+       case VHD_TYPE_VHD_VOLUME:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* Returns 1 if @type is one of the two VHD types (file or volume), else 0. */
+static inline int
+target_vhd(uint8_t type)
+{
+       switch (type) {
+       case VHD_TYPE_VHD_FILE:
+       case VHD_TYPE_VHD_VOLUME:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* One thing to scan: a plain file or a logical volume. */
+struct target {
+       char                 name[VHD_MAX_NAME_LEN];    /* file path, or LV name */
+       char                 device[VHD_MAX_NAME_LEN];  /* file path, or the LV's first-segment device */
+       uint64_t             size;      /* st_size for files, lv->size for volumes */
+       uint64_t             start;     /* 0 for files, first segment pe_start for volumes */
+       uint64_t             end;       /* st_size for files, start + pe_size for volumes */
+       uint8_t              type;      /* one of VHD_TYPE_* */
+};
+
+/* Growable array of targets plus a read cursor (see the iterator_* helpers). */
+struct iterator {
+       int                  cur;       /* index iterator_next() hands out next */
+       int                  cur_size;  /* number of valid entries in targets */
+       int                  max_size;  /* number of entries targets has room for */
+       struct target       *targets;   /* heap array, released by iterator_free() */
+};
+
+/* What the scan learned about one target, plus its place in the tree. */
+struct vhd_image {
+       char                *name;      /* image name as printed */
+       char                *parent;    /* parent name, NULL when there is none */
+       uint64_t             capacity;  /* footer curr_size for VHDs, else same as size */
+       off_t                size;      /* copied from target->size */
+       uint8_t              hidden;
+       int                  error;     /* non-zero when scanning this image failed */
+       char                *message;   /* short text describing the failed step */
+
+       struct target       *target;
+
+       struct list_head     sibling;   /* link in the parent's children list */
+       struct list_head     children;  /* images whose parent is this image */
+       struct vhd_image    *parent_image; /* NULL if the parent was not found in the scan */
+};
+
+/* Storage for the images collected when tree ("pretty") output is requested. */
+struct vhd_scan {
+       int                  cur;       /* number of entries of images in use */
+       int                  size;      /* number of entries images has room for */
+
+       int                  lists_cur; /* number of chunks recorded in lists */
+       int                  lists_size; /* number of chunk slots lists has room for */
+
+       struct vhd_image   **images;    /* pointers to individual records inside the chunks */
+       struct vhd_image   **lists;     /* the calloc()ed chunks of records themselves */
+};
+
+static int flags;              /* VHD_SCAN_* bits in effect for this run */
+static struct vg vg;           /* its lvs[0..lv_cnt) are matched by iterator_add_volume() */
+static struct vhd_scan scan;   /* images collected for tree output */
+
+/*
+ * Reset the file-scope 'scan' state and allocate room for @cnt image
+ * records: one chunk of @cnt records, registered as the first entry of
+ * scan.lists, and an index of @cnt pointers into that chunk.
+ *
+ * Returns 0 on success; on allocation failure everything is released,
+ * 'scan' is zeroed again and -ENOMEM is returned.
+ */
+static int
+vhd_util_scan_pretty_allocate_list(int cnt)
+{
+       int idx;
+
+       memset(&scan, 0, sizeof(scan));
+       scan.lists_cur  = 1;
+       scan.lists_size = 10;
+
+       scan.lists = calloc(scan.lists_size, sizeof(struct vhd_image *));
+       if (scan.lists == NULL)
+               goto nomem;
+
+       scan.lists[0] = calloc(cnt, sizeof(struct vhd_image));
+       if (scan.lists[0] == NULL)
+               goto nomem;
+
+       scan.images = calloc(cnt, sizeof(struct vhd_image *));
+       if (scan.images == NULL)
+               goto nomem;
+
+       for (idx = 0; idx < cnt; idx++)
+               scan.images[idx] = &scan.lists[0][idx];
+
+       scan.cur  = 0;
+       scan.size = cnt;
+       return 0;
+
+nomem:
+       if (scan.lists != NULL) {
+               free(scan.lists[0]);
+               free(scan.lists);
+       }
+       free(scan.images);
+       memset(&scan, 0, sizeof(scan));
+       return -ENOMEM;
+}
+
+/*
+ * Release every chunk recorded in scan.lists, the list array itself and the
+ * pointer index, then zero the file-scope 'scan' state.
+ */
+static void
+vhd_util_scan_pretty_free_list(void)
+{
+       int idx;
+
+       for (idx = 0; scan.lists != NULL && idx < scan.lists_cur; idx++)
+               free(scan.lists[idx]);
+
+       free(scan.lists);
+       free(scan.images);
+       memset(&scan, 0, sizeof(scan));
+}
+
+/*
+ * Record a copy of @image in the file-scope 'scan' state for later tree
+ * output.  An image whose name is already recorded is silently skipped.
+ *
+ * When all record slots are in use, storage grows by as many records as
+ * are currently present (at least one): the pointer index is enlarged
+ * first, then a new chunk of records is allocated and registered in
+ * scan.lists.  scan.size is only updated once every allocation has
+ * succeeded, so a failed growth never leaves the index shorter than
+ * scan.size claims.
+ *
+ * The name and parent strings are duplicated; the message pointer is
+ * copied as-is.
+ *
+ * Returns 0 on success (or duplicate), -ENOMEM on allocation failure.
+ */
+static int
+vhd_util_scan_pretty_add_image(struct vhd_image *image)
+{
+       int i;
+       struct vhd_image *img;
+
+       for (i = 0; i < scan.cur; i++) {
+               img = scan.images[i];
+               if (!strcmp(img->name, image->name))
+                       return 0;
+       }
+
+       if (scan.cur >= scan.size) {
+               int grow, new_size;
+               struct vhd_image *chunk, **tmp;
+
+               /* double the capacity, but always gain at least one slot */
+               grow     = (scan.size > 0 ? scan.size : 1);
+               new_size = scan.size + grow;
+
+               /*
+                * Enlarge the index before anything else: if a later step
+                * fails the index is merely bigger than scan.size says.
+                */
+               tmp = realloc(scan.images,
+                             new_size * sizeof(struct vhd_image *));
+               if (!tmp)
+                       return -ENOMEM;
+               scan.images = tmp;
+
+               if (scan.lists_cur >= scan.lists_size) {
+                       tmp = realloc(scan.lists, scan.lists_size * 2 *
+                                     sizeof(struct vhd_image *));
+                       if (!tmp)
+                               return -ENOMEM;
+
+                       scan.lists_size *= 2;
+                       scan.lists       = tmp;
+               }
+
+               chunk = calloc(grow, sizeof(struct vhd_image));
+               if (!chunk)
+                       return -ENOMEM;
+
+               /* registering the chunk lets the free routine release it */
+               scan.lists[scan.lists_cur++] = chunk;
+
+               for (i = 0; i < grow; i++)
+                       scan.images[scan.size + i] = chunk + i;
+
+               scan.size = new_size;
+       }
+
+       img = scan.images[scan.cur];
+       INIT_LIST_HEAD(&img->sibling);
+       INIT_LIST_HEAD(&img->children);
+
+       img->capacity = image->capacity;
+       img->size     = image->size;
+       img->hidden   = image->hidden;
+       img->error    = image->error;
+       img->message  = image->message;
+
+       img->name = strdup(image->name);
+       if (!img->name)
+               goto fail;
+
+       if (image->parent) {
+               img->parent = strdup(image->parent);
+               if (!img->parent)
+                       goto fail;
+       }
+
+       scan.cur++;
+       return 0;
+
+fail:
+       /* leave the slot zeroed so that it can be reused by the next call */
+       free(img->name);
+       free(img->parent);
+       memset(img, 0, sizeof(*img));
+       return -ENOMEM;
+}
+
+/*
+ * qsort()/bsearch() comparator for arrays of 'struct vhd_image *':
+ * orders the pointed-to images by name using strcmp().
+ */
+static int
+vhd_util_scan_pretty_image_compare(const void *lhs, const void *rhs)
+{
+       const struct vhd_image *const *a = lhs;
+       const struct vhd_image *const *b = rhs;
+
+       return strcmp((*a)->name, (*b)->name);
+}
+
+/*
+ * Print one line describing @image, preceded by @tab columns of padding.
+ *
+ * Images carrying an error are reported with their error code and message.
+ * All others are reported with capacity, size, hidden flag and parent;
+ * unless VHD_SCAN_VERBOSE is set, the image and parent names are reduced
+ * with basename().  In tree mode a parent that was not part of the scan
+ * is flagged as such.
+ *
+ * The size and hidden arguments are cast to the exact types their
+ * conversion specifiers expect, since off_t and uint8_t do not match
+ * PRIu64 and %u by themselves.
+ */
+static void
+vhd_util_scan_print_image_indent(struct vhd_image *image, int tab)
+{
+       char *pad, *name, *pmsg, *parent;
+
+       /* "%*s" with width 0 would still print pad, so it is empty then */
+       pad    = (tab ? " " : "");
+       name   = image->name;
+       parent = (image->parent ? image->parent : "none");
+
+       if ((flags & VHD_SCAN_PRETTY) && image->parent && !image->parent_image)
+               pmsg = " (not found in scan)";
+       else
+               pmsg = "";
+
+       if (!(flags & VHD_SCAN_VERBOSE)) {
+               /* NOTE(review): POSIX basename() may modify its argument */
+               name = basename(image->name);
+               if (image->parent)
+                       parent = basename(image->parent);
+       }
+
+       if (image->error)
+               printf("%*svhd=%s scan-error=%d error-message='%s'\n",
+                      tab, pad, image->name, image->error, image->message);
+       else
+               printf("%*svhd=%s capacity=%"PRIu64" size=%"PRIu64" hidden=%u "
+                      "parent=%s%s\n", tab, pad, name, image->capacity,
+                      (uint64_t)image->size, (unsigned int)image->hidden,
+                      parent, pmsg);
+}
+
+/*
+ * Print @image at indentation level @depth (three columns per level),
+ * followed recursively by its children: first those that are not hidden,
+ * then the hidden ones.
+ *
+ * The image's name and parent strings are freed and set to NULL
+ * afterwards, which marks the image as already printed.
+ */
+static void
+vhd_util_scan_pretty_print_tree(struct vhd_image *image, int depth)
+{
+       struct vhd_image *child, *next;
+       int pass;
+
+       vhd_util_scan_print_image_indent(image, depth * 3);
+
+       /* pass 0 visits the visible children, pass 1 the hidden ones */
+       for (pass = 0; pass < 2; pass++) {
+               list_for_each_entry_safe(child, next,
+                                        &image->children, sibling) {
+                       if ((child->hidden != 0) == pass)
+                               vhd_util_scan_pretty_print_tree(child,
+                                                               depth + 1);
+               }
+       }
+
+       free(image->name);
+       image->name = NULL;
+
+       free(image->parent);
+       image->parent = NULL;
+}
+
+/*
+ * Print every collected image as a forest.
+ *
+ * The images are sorted by name, each image is linked to its parent (found
+ * by binary search on the parent's name) and the trees are then printed in
+ * three sweeps: hidden images without a resolved parent, then every other
+ * still-unprinted image without a resolved parent, and finally anything
+ * that has not been printed yet.  Printing clears an image's name, which
+ * is how the later sweeps recognise what is left.
+ */
+static void
+vhd_util_scan_pretty_print_images(void)
+{
+       struct vhd_image *image, **found, key, *keyp;
+       int idx;
+
+       qsort(scan.images, scan.cur, sizeof(scan.images[0]),
+             vhd_util_scan_pretty_image_compare);
+
+       /* the comparator only looks at the name of the search key */
+       memset(&key, 0, sizeof(key));
+       keyp = &key;
+
+       for (idx = 0; idx < scan.cur; idx++) {
+               image = scan.images[idx];
+               image->parent_image = NULL;
+
+               if (!image->parent)
+                       continue;
+
+               key.name = image->parent;
+               found    = bsearch(&keyp, scan.images, scan.cur,
+                                  sizeof(scan.images[0]),
+                                  vhd_util_scan_pretty_image_compare);
+               if (found) {
+                       image->parent_image = *found;
+                       list_add_tail(&image->sibling, &(*found)->children);
+               }
+       }
+
+       /* sweep 1: hidden images without a resolved parent */
+       for (idx = 0; idx < scan.cur; idx++) {
+               image = scan.images[idx];
+               if (!image->parent_image && image->hidden)
+                       vhd_util_scan_pretty_print_tree(image, 0);
+       }
+
+       /* sweep 2: remaining images without a resolved parent */
+       for (idx = 0; idx < scan.cur; idx++) {
+               image = scan.images[idx];
+               if (image->name && !image->parent_image)
+                       vhd_util_scan_pretty_print_tree(image, 0);
+       }
+
+       /* sweep 3: whatever is still unprinted */
+       for (idx = 0; idx < scan.cur; idx++) {
+               image = scan.images[idx];
+               if (image->name)
+                       vhd_util_scan_pretty_print_tree(image, 0);
+       }
+}
+
+/*
+ * Report one scanned image.
+ *
+ * In tree mode an error-free image is only queued for later output.  If
+ * queueing fails, the failure is recorded on the image and it is printed
+ * right away, as are all images outside tree mode and all images that
+ * already carry an error.
+ */
+static void
+vhd_util_scan_print_image(struct vhd_image *image)
+{
+       if (image->error == 0 && (flags & VHD_SCAN_PRETTY) != 0) {
+               int rc = vhd_util_scan_pretty_add_image(image);
+
+               if (rc == 0)
+                       return;
+
+               if (image->error == 0) {
+                       image->error   = rc;
+                       image->message = "allocating memory";
+               }
+       }
+
+       vhd_util_scan_print_image_indent(image, 0);
+}
+
+/*
+ * Report that scanning @file failed with @err by printing a placeholder
+ * image carrying that error.  Returns @err unchanged; VHD_SCAN_NOFAIL is
+ * not consulted here.
+ */
+static int
+vhd_util_scan_error(const char *file, int err)
+{
+       struct vhd_image failed;
+
+       memset(&failed, 0, sizeof(failed));
+       failed.message = "failure scanning target";
+       failed.error   = err;
+       failed.name    = (char *)file;
+
+       vhd_util_scan_print_image(&failed);
+
+       return err;
+}
+
+/*
+ * Pick the parent locator to read from the eight header slots.
+ *
+ * The first PLAT_CODE_MACX entry wins outright.  Failing that, the last
+ * PLAT_CODE_W2RU entry is used, and if there is none either, the first
+ * entry whose code is not PLAT_CODE_NONE.  Returns NULL when every slot is
+ * empty.
+ */
+static vhd_parent_locator_t *
+vhd_util_scan_get_parent_locator(vhd_context_t *vhd)
+{
+       vhd_parent_locator_t *entry, *best = NULL;
+       int slot;
+
+       for (slot = 0; slot < 8; slot++) {
+               entry = &vhd->header.loc[slot];
+
+               if (entry->code == PLAT_CODE_MACX)
+                       return entry;
+
+               if (entry->code == PLAT_CODE_W2RU ||
+                   (best == NULL && entry->code != PLAT_CODE_NONE))
+                       best = entry;
+       }
+
+       return best;
+}
+
+/*
+ * Copy @src into @dst, which must have room for VHD_MAX_NAME_LEN bytes.
+ * Returns 0 if the whole string fit, -ENAMETOOLONG if it was truncated.
+ */
+static inline int
+copy_name(char *dst, const char *src)
+{
+       int len = snprintf(dst, VHD_MAX_NAME_LEN, "%s", src);
+
+       return (len >= VHD_MAX_NAME_LEN ? -ENAMETOOLONG : 0);
+}
+
+/*
+ * LVHD stores realpath(parent) in parent locators, so
+ * /dev/<vol-group>/<lv-name> becomes /dev/mapper/<vol--group>-<lv--name>
+ *
+ * Recover <lv-name> from such a path.  The last path component of @src
+ * (including its leading '/', if any) is decoded -- a single dash becomes
+ * a slash, a double dash becomes a single dash -- and whatever follows the
+ * last slash of the decoded text is stored in @dst.
+ *
+ * @dst must have room for VHD_MAX_NAME_LEN bytes.
+ *
+ * Returns 0 on success.  If the decoded text contains no slash at all the
+ * format is not recognised: @src is copied to @dst (truncated if needed)
+ * and -EINVAL is returned.  If the component does not fit the local
+ * buffer, @dst is set to the empty string and -ENAMETOOLONG is returned.
+ */
+static int
+vhd_util_scan_extract_volume_name(char *dst, const char *src)
+{
+       char copy[VHD_MAX_NAME_LEN], *c;
+       const char *name, *s;
+
+       name = strrchr(src, '/');
+       if (!name)
+               name = src;
+
+       /* convert single dashes to slashes, double dashes to single dashes */
+       for (c = copy, s = name; *s != '\0'; s++, c++) {
+               /* always keep one byte free for the terminator */
+               if ((size_t)(c - copy) + 1 >= sizeof(copy)) {
+                       dst[0] = '\0';
+                       return -ENAMETOOLONG;
+               }
+
+               if (*s == '-') {
+                       if (s[1] != '-')
+                               *c = '/';
+                       else {
+                               s++;
+                               *c = '-';
+                       }
+               } else
+                       *c = *s;
+       }
+
+       *c = '\0';
+
+       /*
+        * Without any slash there is no component to strip; this happens
+        * when @src has no directory part and no single dash.
+        */
+       c = strrchr(copy, '/');
+       if (!c) {
+               /* unrecognized format */
+               copy_name(dst, src);
+               return -EINVAL;
+       }
+
+       /* cannot truncate: copy itself is limited to VHD_MAX_NAME_LEN */
+       return copy_name(dst, c + 1);
+}
+
+/*
+ * Determine the parent name of a VHD that lives on a volume.
+ *
+ * In fast mode the parent is first decoded from the header alone.  If that
+ * is not requested or fails, the preferred parent locator is read, with
+ * its data offset shifted by the target's start offset.  The resulting
+ * string is then reduced to the volume name when it has the expected
+ * format; otherwise it is left untouched.
+ *
+ * Returns 0 on success, -EINVAL if the header has no usable locator, or
+ * the error from reading the locator or from copying the name.
+ */
+static int
+vhd_util_scan_get_volume_parent(vhd_context_t *vhd, struct vhd_image *image)
+{
+       char name[VHD_MAX_NAME_LEN];
+       vhd_parent_locator_t *loc, shifted;
+       int rc = -1;    /* non-zero: parent not resolved yet */
+
+       if (flags & VHD_SCAN_FAST)
+               rc = vhd_header_decode_parent(vhd,
+                                             &vhd->header, &image->parent);
+
+       if (rc) {
+               loc = vhd_util_scan_get_parent_locator(vhd);
+               if (loc == NULL)
+                       return -EINVAL;
+
+               shifted = *loc;
+               shifted.data_offset += image->target->start;
+
+               rc = vhd_parent_locator_read(vhd, &shifted, &image->parent);
+               if (rc)
+                       return rc;
+       }
+
+       if (vhd_util_scan_extract_volume_name(name, image->parent) != 0)
+               return 0;
+
+       return copy_name(image->parent, name);
+}
+
+/*
+ * Fill in image->parent.
+ *
+ * Non-VHD targets have no parent.  Volume targets are delegated to
+ * vhd_util_scan_get_volume_parent().  For VHD files the parent is decoded
+ * from the header in fast mode, or looked up with vhd_parent_locator_get()
+ * otherwise; if that fails, the raw string of the preferred parent locator
+ * is returned instead.
+ *
+ * Returns 0 on success, -EINVAL if there is no usable locator, or the
+ * error from reading it.
+ */
+static int
+vhd_util_scan_get_parent(vhd_context_t *vhd, struct vhd_image *image)
+{
+       uint8_t type = image->target->type;
+       vhd_parent_locator_t *loc;
+       int rc;
+
+       if (!target_vhd(type)) {
+               image->parent = NULL;
+               return 0;
+       }
+
+       if (target_volume(type))
+               return vhd_util_scan_get_volume_parent(vhd, image);
+
+       /*
+        * vhd_parent_locator_get checks for the existence of the parent
+        * file.  If the lookup succeeds, all is well; if not, we try to
+        * return whatever string we have before failing outright.
+        */
+       if (flags & VHD_SCAN_FAST)
+               rc = vhd_header_decode_parent(vhd,
+                                             &vhd->header, &image->parent);
+       else
+               rc = vhd_parent_locator_get(vhd, &image->parent);
+
+       if (rc == 0)
+               return 0;
+
+       loc = vhd_util_scan_get_parent_locator(vhd);
+       if (loc == NULL)
+               return -EINVAL;
+
+       return vhd_parent_locator_read(vhd, loc, &image->parent);
+}
+
+/*
+ * Fill in image->hidden: VHD targets are queried with vhd_hidden(), any
+ * other target is always reported as hidden.  Returns 0 on success or the
+ * vhd_hidden() error, in which case the image is left untouched.
+ */
+static int
+vhd_util_scan_get_hidden(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int hidden = 1;
+
+       if (target_vhd(image->target->type)) {
+               int rc;
+
+               hidden = 0;
+               rc     = vhd_hidden(vhd, &hidden);
+               if (rc)
+                       return rc;
+       }
+
+       image->hidden = hidden;
+       return 0;
+}
+
+/*
+ * Fill in image->size from the target and image->capacity from the VHD
+ * footer; targets that are not VHDs report their size as capacity.
+ * Always returns 0.
+ */
+static int
+vhd_util_scan_get_size(vhd_context_t *vhd, struct vhd_image *image)
+{
+       image->size = image->target->size;
+
+       if (!target_vhd(image->target->type)) {
+               image->capacity = image->size;
+               return 0;
+       }
+
+       image->capacity = vhd->footer.curr_size;
+       return 0;
+}
+
+/*
+ * Open a file target.  Files that are not VHDs need no opening.  VHD files
+ * are opened read-only with VHD_OPEN_IGNORE_DISABLED, plus VHD_OPEN_FAST in
+ * fast mode.  On failure the error is recorded on @image and returned.
+ */
+static int
+vhd_util_scan_open_file(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int rc, oflags;
+
+       if (!target_vhd(image->target->type))
+               return 0;
+
+       oflags = VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED |
+                ((flags & VHD_SCAN_FAST) ? VHD_OPEN_FAST : 0);
+
+       rc = vhd_open(vhd, image->name, oflags);
+       if (rc == 0)
+               return 0;
+
+       vhd->file      = NULL;
+       image->message = "opening file";
+       image->error   = rc;
+       return image->error;
+}
+
+/*
+ * Read and validate the VHD footer -- and, for dynamic images, the header --
+ * found at the start offset of a volume target.
+ *
+ * Footer and header are fetched with a single sector-aligned read.  When
+ * the footer says the header does not directly follow it, the header is
+ * read separately from its recorded offset (relative to the target start).
+ * For dynamic images vhd->spb and vhd->bm_secs are derived from the header.
+ *
+ * Any failure is recorded in image->message / image->error.  The return
+ * value is always image->error.
+ */
+static int
+vhd_util_scan_read_volume_headers(vhd_context_t *vhd, struct vhd_image *image)
+{
+       struct target *target = image->target;
+       size_t size = sizeof(vhd_footer_t) + sizeof(vhd_header_t);
+       char *buf = NULL;
+       char *failed = NULL;    /* description of the step that failed */
+       int rc;
+
+       rc = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       if (rc) {
+               /* posix_memalign() reports a positive errno value */
+               buf    = NULL;
+               rc     = -rc;
+               failed = "allocating image";
+               goto out;
+       }
+
+       rc = vhd_seek(vhd, target->start, SEEK_SET);
+       if (rc) {
+               failed = "seeking to headers";
+               goto out;
+       }
+
+       rc = vhd_read(vhd, buf, size);
+       if (rc) {
+               failed = "reading headers";
+               goto out;
+       }
+
+       memcpy(&vhd->footer, buf, sizeof(vhd_footer_t));
+       vhd_footer_in(&vhd->footer);
+       rc = vhd_validate_footer(&vhd->footer);
+       if (rc) {
+               failed = "invalid footer";
+               goto out;
+       }
+
+       /* lvhd vhds should always be dynamic */
+       if (!vhd_type_dynamic(vhd))
+               goto out;
+
+       if (vhd->footer.data_offset == sizeof(vhd_footer_t)) {
+               /* header directly follows the footer: already in buf */
+               memcpy(&vhd->header,
+                      buf + sizeof(vhd_footer_t),
+                      sizeof(vhd_header_t));
+               vhd_header_in(&vhd->header);
+               rc = vhd_validate_header(&vhd->header);
+       } else
+               rc = vhd_read_header_at(vhd, &vhd->header,
+                                       vhd->footer.data_offset +
+                                       target->start);
+
+       if (rc) {
+               failed = "reading header";
+               goto out;
+       }
+
+       vhd->spb     = vhd->header.block_size >> VHD_SECTOR_SHIFT;
+       vhd->bm_secs = secs_round_up_no_zero(vhd->spb >> 3);
+
+out:
+       if (failed) {
+               image->message = failed;
+               image->error   = rc;
+       }
+
+       free(buf);
+       return image->error;
+}
+
+/*
+ * Open a volume target by hand: @vhd is zeroed, marked read-only/fast, the
+ * target's device is opened with O_DIRECT and, for VHD volumes, the
+ * headers are read from the target's start offset.
+ *
+ * Targets spanning fewer than 4096 bytes are rejected.  Every failure is
+ * recorded in image->message / image->error and returned.  errno is saved
+ * before free() is called on the failure path, as free() is allowed to
+ * modify it.
+ *
+ * NOTE(review): if reading the headers fails, vhd->fd and vhd->file are
+ * left set; releasing them is presumably left to the caller -- confirm.
+ */
+static int
+vhd_util_scan_open_volume(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int saved_errno;
+       struct target *target;
+
+       target = image->target;
+       memset(vhd, 0, sizeof(*vhd));
+       vhd->oflags = VHD_OPEN_RDONLY | VHD_OPEN_FAST;
+
+       if (target->end - target->start < 4096) {
+               image->message = "device too small";
+               image->error   = -EINVAL;
+               return image->error;
+       }
+
+       vhd->file = strdup(image->name);
+       if (!vhd->file) {
+               image->message = "allocating device";
+               image->error   = -ENOMEM;
+               return image->error;
+       }
+
+       vhd->fd = open(target->device, O_RDONLY | O_DIRECT | O_LARGEFILE);
+       if (vhd->fd == -1) {
+               /* capture the cause before free() gets a chance to alter it */
+               saved_errno = errno;
+
+               free(vhd->file);
+               vhd->file = NULL;
+
+               image->message = "opening device";
+               image->error   = -saved_errno;
+               return image->error;
+       }
+
+       if (target_vhd(target->type))
+               return vhd_util_scan_read_volume_headers(vhd, image);
+
+       return 0;
+}
+
+/*
+ * Set image->name and open the underlying target.
+ *
+ * The name is the target's own name, except for file targets in tree mode,
+ * where it is the realpath() of that name (a heap string).  If realpath()
+ * fails the plain name is kept and the error is recorded on @image and
+ * returned.  Volumes and files are then opened by their respective
+ * helpers.
+ */
+static int
+vhd_util_scan_open(vhd_context_t *vhd, struct vhd_image *image)
+{
+       struct target *target = image->target;
+       int is_volume = target_volume(target->type);
+
+       image->name = target->name;
+
+       if (!is_volume && (flags & VHD_SCAN_PRETTY)) {
+               char *resolved = realpath(target->name, NULL);
+
+               if (resolved == NULL) {
+                       image->message = "resolving name";
+                       image->error   = -errno;
+                       return image->error;
+               }
+
+               image->name = resolved;
+       }
+
+       if (is_volume)
+               return vhd_util_scan_open_volume(vhd, image);
+
+       return vhd_util_scan_open_file(vhd, image);
+}
+
+/*
+ * Describe the file @file as a target of the given @type: name and device
+ * are both the file path, the extent runs from 0 to the file size.
+ *
+ * Returns 0 on success, -errno if stat() fails, or -ENAMETOOLONG if the
+ * path does not fit.
+ */
+static int
+vhd_util_scan_init_file_target(struct target *target,
+                              const char *file, uint8_t type)
+{
+       struct stat st;
+       int rc;
+
+       if (stat(file, &st) == -1)
+               return -errno;
+
+       rc = copy_name(target->name, file);
+       if (rc == 0)
+               rc = copy_name(target->device, file);
+       if (rc != 0)
+               return rc;
+
+       target->start = 0;
+       target->end   = st.st_size;
+       target->size  = st.st_size;
+       target->type  = type;
+
+       return 0;
+}
+
+/*
+ * Describe the logical volume @lv as a target of the given @type, using
+ * the device and extent of its first segment.
+ *
+ * Returns 0 on success, -ENOSYS if the first segment is not linear, or
+ * -ENAMETOOLONG if a name does not fit.
+ */
+static int
+vhd_util_scan_init_volume_target(struct target *target,
+                                struct lv *lv, uint8_t type)
+{
+       int rc;
+
+       if (lv->first_segment.type != LVM_SEG_TYPE_LINEAR)
+               return -ENOSYS;
+
+       rc = copy_name(target->name, lv->name);
+       if (rc == 0)
+               rc = copy_name(target->device, lv->first_segment.device);
+       if (rc != 0)
+               return rc;
+
+       target->type  = type;
+       target->size  = lv->size;
+       target->start = lv->first_segment.pe_start;
+       target->end   = target->start + lv->first_segment.pe_size;
+
+       return 0;
+}
+
+/*
+ * Initialise @itr with a private copy of the @cnt entries of @targets.
+ * Returns 0 on success or -ENOMEM if the copy cannot be allocated.
+ */
+static int
+iterator_init(struct iterator *itr, int cnt, struct target *targets)
+{
+       size_t bytes = sizeof(struct target) * cnt;
+
+       memset(itr, 0, sizeof(*itr));
+
+       itr->targets = malloc(bytes);
+       if (itr->targets == NULL)
+               return -ENOMEM;
+
+       memcpy(itr->targets, targets, bytes);
+
+       itr->max_size = cnt;
+       itr->cur_size = cnt;
+       itr->cur      = 0;
+
+       return 0;
+}
+
+/*
+ * Return the next target and advance the cursor, or NULL once every
+ * target has been handed out.
+ */
+static struct target *
+iterator_next(struct iterator *itr)
+{
+       struct target *next;
+
+       if (itr->cur == itr->cur_size)
+               return NULL;
+
+       next = &itr->targets[itr->cur];
+       itr->cur++;
+
+       return next;
+}
+
+/*
+ * Fill @target with the file @parent, unless a target with the same
+ * basename is already known to @itr, in which case -EEXIST is returned.
+ * Otherwise returns the result of initialising the file target.
+ */
+static int
+iterator_add_file(struct iterator *itr,
+                 struct target *target, const char *parent, uint8_t type)
+{
+       int idx = 0;
+
+       while (idx < itr->cur_size) {
+               char *known = basename((char *)itr->targets[idx].name);
+               char *wanted = basename((char *)parent);
+
+               if (strcmp(known, wanted) == 0)
+                       return -EEXIST;
+
+               idx++;
+       }
+
+       return vhd_util_scan_init_file_target(target, parent, type);
+}
+
+/*
+ * Prepare @target as a volume target for @parent.
+ *
+ * @parent is first compared verbatim against the names already held by
+ * @itr; a hit yields -EEXIST.  It is then used as an fnmatch() pattern
+ * (FNM_PATHNAME) against the logical volumes of the scanned volume group
+ * `vg`, and the first matching volume initialises @target.
+ *
+ * Returns the result of vhd_util_scan_init_volume_target() for the
+ * matching volume, -ENOENT when no volume matches, or the raw (non-zero)
+ * fnmatch() code if fnmatch() itself reports an error.
+ */
+static int
+iterator_add_volume(struct iterator *itr,
+                   struct target *target, const char *parent, uint8_t type)
+{
+       int i, err;
+
+       for (i = 0; i < itr->cur_size; i++)
+               if (!strcmp(parent, itr->targets[i].name))
+                       return -EEXIST;
+
+       for (i = 0; i < vg.lv_cnt; i++) {
+               err = fnmatch(parent, vg.lvs[i].name, FNM_PATHNAME);
+               if (!err)
+                       return vhd_util_scan_init_volume_target(target,
+                                                               vg.lvs + i,
+                                                               type);
+
+               /*
+                * FNM_NOMATCH is the only benign non-zero result.  The old
+                * code compared against the FNM_PATHNAME flag here, which
+                * is only right where both constants share a value.
+                */
+               if (err != FNM_NOMATCH)
+                       return err;
+       }
+
+       return -ENOENT;
+}
+
+/*
+ * Append a target for @parent of kind @type to the iterator.
+ *
+ * The backing array is doubled first when it is full.  The new entry is
+ * built in the first unused slot by iterator_add_volume() or
+ * iterator_add_file(), depending on target_volume(@type).  When that
+ * fails the slot is zeroed again and the entry count is left alone.
+ *
+ * Returns 0 on success and also when the target was already present
+ * (-EEXIST from the helper), -ENOMEM if growing the array fails, or the
+ * helper's error otherwise.
+ */
+static int
+iterator_add(struct iterator *itr, const char *parent, uint8_t type)
+{
+       int rc;
+       struct target *slot;
+
+       if (itr->cur_size == itr->max_size) {
+               size_t bytes = sizeof(struct target) * itr->max_size * 2;
+               struct target *bigger = realloc(itr->targets, bytes);
+
+               if (!bigger)
+                       return -ENOMEM;
+
+               itr->targets   = bigger;
+               itr->max_size *= 2;
+       }
+
+       slot = &itr->targets[itr->cur_size];
+
+       rc = target_volume(type)
+               ? iterator_add_volume(itr, slot, parent, type)
+               : iterator_add_file(itr, slot, parent, type);
+
+       if (!rc) {
+               itr->cur_size++;
+               return 0;
+       }
+
+       /* discard whatever the helper may have written into the slot */
+       memset(slot, 0, sizeof(*slot));
+
+       if (rc == -EEXIST)
+               return 0;
+
+       return rc;
+}
+
+/*
+ * Release the iterator's target array and return @itr to the all-zero
+ * state.
+ */
+static void
+iterator_free(struct iterator *itr)
+{
+       struct target *owned = itr->targets;
+
+       memset(itr, 0, sizeof(*itr));
+       free(owned);
+}
+
+/*
+ * Queue the parent of @image on @itr so that it gets scanned as well.
+ *
+ * The parent's target type is derived from two properties: whether @vhd
+ * reports a raw parent, and whether the image's own target is a volume.
+ * A failure to queue the parent is reported through vhd_util_scan_error()
+ * and not returned to the caller.
+ */
+static void
+vhd_util_scan_add_parent(struct iterator *itr,
+                        vhd_context_t *vhd, struct vhd_image *image)
+{
+       int rc;
+       uint8_t type;
+
+       if (vhd_parent_raw(vhd)) {
+               if (target_volume(image->target->type))
+                       type = VHD_TYPE_RAW_VOLUME;
+               else
+                       type = VHD_TYPE_RAW_FILE;
+       } else {
+               if (target_volume(image->target->type))
+                       type = VHD_TYPE_VHD_VOLUME;
+               else
+                       type = VHD_TYPE_VHD_FILE;
+       }
+
+       rc = iterator_add(itr, image->parent, type);
+       if (rc)
+               vhd_util_scan_error(image->parent, rc);
+}
+
+/*
+ * Scan @cnt targets (copied from @targets into a private iterator) and
+ * print one record per image.
+ *
+ * For each target the image is opened, then its size, 'hidden' field and,
+ * for differencing disks, its parent are collected.  The first failing
+ * step records a message and error code in the image record and jumps to
+ * the per-target cleanup at "end".  With VHD_SCAN_PARENTS set, a parent
+ * that was found is appended to the iterator, so the loop visits it later.
+ *
+ * Without VHD_SCAN_NOFAIL the loop stops at the first failing target and
+ * the function returns the error of the last step that ran.  With
+ * VHD_SCAN_NOFAIL every target is visited and the function returns
+ * -EAGAIN if any step failed, 0 otherwise.  A failure of iterator_init()
+ * is returned directly.
+ */
+static int
+vhd_util_scan_targets(int cnt, struct target *targets)
+{
+       int ret, err;
+       vhd_context_t vhd;
+       struct iterator itr;
+       struct target *target;
+       struct vhd_image image;
+
+       ret = 0;
+       err = 0;
+
+       err = iterator_init(&itr, cnt, targets);
+       if (err)
+               return err;
+
+       while ((target = iterator_next(&itr))) {
+               /* start every target from a clean context and record */
+               memset(&vhd, 0, sizeof(vhd));
+               memset(&image, 0, sizeof(image));
+
+               image.target = target;
+
+               err = vhd_util_scan_open(&vhd, &image);
+               if (err) {
+                       ret = -EAGAIN;
+                       goto end;
+               }
+
+               err = vhd_util_scan_get_size(&vhd, &image);
+               if (err) {
+                       ret           = -EAGAIN;
+                       image.message = "getting physical size";
+                       image.error   = err;
+                       goto end;
+               }
+
+               err = vhd_util_scan_get_hidden(&vhd, &image);
+               if (err) {
+                       ret           = -EAGAIN;
+                       image.message = "checking 'hidden' field";
+                       image.error   = err;
+                       goto end;
+               }
+
+               /* only differencing disks have a parent to look up */
+               if (vhd.footer.type == HD_TYPE_DIFF) {
+                       err = vhd_util_scan_get_parent(&vhd, &image);
+                       if (err) {
+                               ret           = -EAGAIN;
+                               image.message = "getting parent";
+                               image.error   = err;
+                               goto end;
+                       }
+               }
+
+       end:
+               /* the record is printed whether or not the steps above succeeded */
+               vhd_util_scan_print_image(&image);
+
+               /*
+                * NOTE(review): iterator_add() may realloc itr.targets, so
+                * `target` can be stale after this call; the code below only
+                * uses it for the address comparison with image.name -- confirm
+                * that no later use dereferences it.
+                */
+               if (flags & VHD_SCAN_PARENTS && image.parent)
+                       vhd_util_scan_add_parent(&itr, &vhd, &image);
+
+               if (vhd.file)
+                       vhd_close(&vhd);
+               /* image.name is freed only when it does not alias the target's name */
+               if (image.name != target->name)
+                       free(image.name);
+               free(image.parent);
+
+               if (err && !(flags & VHD_SCAN_NOFAIL))
+                       break;
+       }
+
+       iterator_free(&itr);
+
+       if (flags & VHD_SCAN_NOFAIL)
+               return ret;
+
+       return err;
+}
+
+/*
+ * Pretty-printing variant of vhd_util_scan_targets(): set up the list
+ * for @cnt images, run the scan, then print and release the list.
+ *
+ * Returns -ENOMEM (after printing a message) when the list cannot be
+ * set up.  Otherwise returns 0 if VHD_SCAN_NOFAIL is set, or the scan's
+ * result if it is not.
+ */
+static int
+vhd_util_scan_targets_pretty(int cnt, struct target *targets)
+{
+       int rc;
+
+       if (vhd_util_scan_pretty_allocate_list(cnt)) {
+               printf("scan failed: no memory\n");
+               return -ENOMEM;
+       }
+
+       rc = vhd_util_scan_targets(cnt, targets);
+
+       vhd_util_scan_pretty_print_images();
+       vhd_util_scan_pretty_free_list();
+
+       if (flags & VHD_SCAN_NOFAIL)
+               return 0;
+
+       return rc;
+}
+
+/*
+ * Build the list of file targets to scan.
+ *
+ * The list holds the paths matched by the glob pattern @filter (if any)
+ * followed by the @cnt explicit @names.  On success the array is handed
+ * to the caller through *@_targets (to be released with free()) and its
+ * length through *@_total; both are NULL/0 when there is nothing to scan.
+ *
+ * Returns 0 on success, -ENOMEM / -EIO for glob() running out of memory
+ * or aborting, -ENOMEM if the array cannot be allocated, or the error of
+ * a target initialisation unless VHD_SCAN_NOFAIL is set.  A pattern that
+ * matches nothing yields -errno, i.e. 0 when errno is still clear.
+ */
+static int
+vhd_util_scan_find_file_targets(int cnt, char **names,
+                               const char *filter,
+                               struct target **_targets, int *_total)
+{
+       glob_t g;
+       struct target *targets;
+       int i, globs, err, total;
+
+       err       = 0;
+       total     = cnt;
+       globs     = 0;
+       targets   = NULL;
+       *_total   = 0;
+       *_targets = NULL;
+
+       memset(&g, 0, sizeof(g));
+
+       if (filter) {
+               int gflags = ((flags & VHD_SCAN_FAST) ? GLOB_NOSORT : 0);
+
+               errno = 0;
+               err   = glob(filter, gflags, vhd_util_scan_error, &g);
+
+               switch (err) {
+               case GLOB_NOSPACE:
+                       err = -ENOMEM;
+                       break;
+               case GLOB_ABORTED:
+                       err = -EIO;
+                       break;
+               case GLOB_NOMATCH:
+                       err = -errno;
+                       break;
+               default:
+                       break;
+               }
+
+               if (err) {
+                       vhd_util_scan_error(filter, err);
+                       /* glob() may hold partial results; release them below */
+                       goto out;
+               }
+
+               globs  = g.gl_pathc;
+               total += globs;
+       }
+
+       /* nothing to scan: avoid calloc(0), which may legitimately be NULL */
+       if (!total)
+               goto out;
+
+       targets = calloc(total, sizeof(struct target));
+       if (!targets) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       /* glob matches fill the front of the array ... */
+       for (i = 0; i < globs; i++) {
+               err = vhd_util_scan_init_file_target(targets + i,
+                                                    g.gl_pathv[i],
+                                                    VHD_TYPE_VHD_FILE);
+               if (err) {
+                       vhd_util_scan_error(g.gl_pathv[i], err);
+                       if (!(flags & VHD_SCAN_NOFAIL))
+                               goto out;
+               }
+       }
+
+       /* ... and the explicitly named files follow */
+       for (i = 0; i + globs < total; i++) {
+               err = vhd_util_scan_init_file_target(targets + i + globs,
+                                                    names[i],
+                                                    VHD_TYPE_VHD_FILE);
+               if (err) {
+                       vhd_util_scan_error(names[i], err);
+                       if (!(flags & VHD_SCAN_NOFAIL))
+                               goto out;
+               }
+       }
+
+       err       = 0;
+       *_total   = total;
+       *_targets = targets;
+
+out:
+       if (err)
+               free(targets);
+       if (filter)
+               globfree(&g);
+
+       return err;
+}
+
+/*
+ * Exchange the entries at indices @dst and @src of @lvs.  Nothing is
+ * done when both indices are equal.
+ */
+static inline void
+swap_volume(struct lv *lvs, int dst, int src)
+{
+       struct lv held;
+
+       if (dst == src)
+               return;
+
+       held     = lvs[dst];
+       lvs[dst] = lvs[src];
+       lvs[src] = held;
+}
+
+/*
+ * Move every volume in @lvs[0..@cnt) whose name matches the fnmatch()
+ * pattern @filter (FNM_PATHNAME) to the front of the array and store the
+ * number of matches in *@_matches.
+ *
+ * A NULL @filter matches nothing.  An fnmatch() failure other than
+ * FNM_NOMATCH is reported through vhd_util_scan_error(); unless
+ * VHD_SCAN_NOFAIL is set it is also returned immediately, with
+ * *@_matches left at 0.  Returns 0 otherwise.
+ */
+static int
+vhd_util_scan_sort_volumes(struct lv *lvs, int cnt,
+                          const char *filter, int *_matches)
+{
+       int idx, rc, found;
+
+       *_matches = 0;
+
+       if (!filter)
+               return 0;
+
+       found = 0;
+       for (idx = 0; idx < cnt; idx++) {
+               struct lv *cur = &lvs[idx];
+
+               rc = fnmatch(filter, cur->name, FNM_PATHNAME);
+               if (rc == 0) {
+                       /* matches accumulate at the front of the array */
+                       swap_volume(lvs, found, idx);
+                       found++;
+                       continue;
+               }
+
+               if (rc == FNM_NOMATCH)
+                       continue;
+
+               vhd_util_scan_error(cur->name, rc);
+               if (!(flags & VHD_SCAN_NOFAIL))
+                       return rc;
+       }
+
+       *_matches = found;
+       return 0;
+}
+
+/*
+ * Build the list of volume targets to scan from volume group @volume.
+ *
+ * The group is read into the file-scope `vg`.  Its logical volumes are
+ * then reordered so that those matching @filter come first, followed by
+ * those matching each of the @cnt patterns in @names; the reordered
+ * prefix becomes the target list.  On success the array is handed to
+ * the caller through *@_targets (to be released with free()) and its
+ * length through *@_total; both are NULL/0 when nothing matched.  `vg`
+ * stays populated on success and is released here on failure.
+ *
+ * Returns 0 on success, the lvm_scan_vg() or sorting error, -ENOMEM if
+ * the array cannot be allocated, or the error of a target
+ * initialisation unless VHD_SCAN_NOFAIL is set.
+ */
+static int
+vhd_util_scan_find_volume_targets(int cnt, char **names,
+                                 const char *volume, const char *filter,
+                                 struct target **_targets, int *_total)
+{
+       struct target *targets;
+       int i, err, total, matches;
+
+       *_total   = 0;
+       *_targets = NULL;
+       targets   = NULL;
+
+       err = lvm_scan_vg(volume, &vg);
+       if (err)
+               return err;
+
+       err = vhd_util_scan_sort_volumes(vg.lvs, vg.lv_cnt,
+                                        filter, &matches);
+       if (err)
+               goto out;
+
+       /* each name only searches the volumes not already claimed */
+       total = matches;
+       for (i = 0; i < cnt; i++) {
+               err = vhd_util_scan_sort_volumes(vg.lvs + total,
+                                                vg.lv_cnt - total,
+                                                names[i], &matches);
+               if (err)
+                       goto out;
+
+               total += matches;
+       }
+
+       /* nothing to scan: avoid calloc(0), which may legitimately be NULL */
+       if (!total) {
+               err = 0;
+               goto out;
+       }
+
+       targets = calloc(total, sizeof(struct target));
+       if (!targets) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < total; i++) {
+               err = vhd_util_scan_init_volume_target(targets + i,
+                                                      vg.lvs + i,
+                                                      VHD_TYPE_VHD_VOLUME);
+               if (err) {
+                       vhd_util_scan_error(vg.lvs[i].name, err);
+                       if (!(flags & VHD_SCAN_NOFAIL))
+                               goto out;
+               }
+       }
+
+       err       = 0;
+       *_total   = total;
+       *_targets = targets;
+
+out:
+       if (err) {
+               free(targets);
+               /* the caller bails out on error without releasing vg */
+               lvm_free_vg(&vg);
+       }
+       return err;
+}
+
+/*
+ * Collect the scan targets: logical volumes of @volume when
+ * VHD_SCAN_VOLUME is set, plain files otherwise.  All other arguments
+ * and the return value are passed straight through to and from the
+ * selected helper.
+ */
+static int
+vhd_util_scan_find_targets(int cnt, char **names,
+                          const char *volume, const char *filter,
+                          struct target **targets, int *total)
+{
+       if (!(flags & VHD_SCAN_VOLUME))
+               return vhd_util_scan_find_file_targets(cnt, names,
+                                                      filter, targets, total);
+
+       return vhd_util_scan_find_volume_targets(cnt, names,
+                                                volume, filter,
+                                                targets, total);
+}
+
+/*
+ * Entry point of the "scan" command.
+ *
+ * Options: -m <filter>, -f (fast), -c (continue on failure), -l <volume
+ * group>, -p (pretty print), -a (scan parents), -v (verbose), -h (help).
+ * Remaining arguments are the names to scan; a filter or at least one
+ * name is required.  Pretty printing switches the fast flag off.
+ *
+ * Returns 0 on success, after -h, and always once scanning has started
+ * with -c; -EINVAL for bad usage; otherwise the error from collecting or
+ * scanning the targets.
+ */
+int
+vhd_util_scan(int argc, char **argv)
+{
+       int c, err, cnt;
+       char *filter, *volume;
+       struct target *targets;
+
+       cnt     = 0;
+       err     = 0;
+       flags   = 0;
+       filter  = NULL;
+       volume  = NULL;
+       targets = NULL;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "m:fcl:pavh")) != -1) {
+               switch (c) {
+               case 'm':
+                       filter = optarg;
+                       break;
+               case 'f':
+                       flags |= VHD_SCAN_FAST;
+                       break;
+               case 'c':
+                       flags |= VHD_SCAN_NOFAIL;
+                       break;
+               case 'l':
+                       volume = optarg;
+                       flags |= VHD_SCAN_VOLUME;
+                       break;
+               case 'p':
+                       flags |= VHD_SCAN_PRETTY;
+                       break;
+               case 'a':
+                       flags |= VHD_SCAN_PARENTS;
+                       break;
+               case 'v':
+                       flags |= VHD_SCAN_VERBOSE;
+                       break;
+               case 'h':
+                       goto usage;
+               default:
+                       err = -EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (!filter && argc - optind == 0) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       if (flags & VHD_SCAN_PRETTY)
+               flags &= ~VHD_SCAN_FAST;
+
+       err = vhd_util_scan_find_targets(argc - optind, argv + optind,
+                                        volume, filter, &targets, &cnt);
+       if (err) {
+               printf("scan failed: %d\n", err);
+               return err;
+       }
+
+       /*
+        * With no targets there is nothing to scan, but the target array
+        * and the volume group still have to be released below.
+        */
+       if (cnt) {
+               if (flags & VHD_SCAN_PRETTY)
+                       err = vhd_util_scan_targets_pretty(cnt, targets);
+               else
+                       err = vhd_util_scan_targets(cnt, targets);
+       }
+
+       free(targets);
+       lvm_free_vg(&vg);
+
+       return ((flags & VHD_SCAN_NOFAIL) ? 0 : err);
+
+usage:
+       printf("usage: [OPTIONS] FILES\n"
+              "options: [-m match filter] [-f fast] [-c continue on failure] "
+              "[-l LVM volume] [-p pretty print] [-a scan parents] "
+              "[-v verbose] [-h help]\n");
+       return err;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-set-field.c b/tools/blktap2/vhd/lib/vhd-util-set-field.c
new file mode 100644 (file)
index 0000000..32728ab
--- /dev/null
@@ -0,0 +1,106 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Entry point of the "set" command: store a value in a footer field of
+ * a VHD file.
+ *
+ * Options: -n <name> (VHD to modify), -f <field>, -v <value>, -h (help).
+ * All of -n, -f and -v are required and no other arguments are allowed.
+ * The only field accepted is "hidden", and the value must be a decimal
+ * number in the range 0..255.
+ *
+ * Returns -EINVAL (after printing a usage line) for bad arguments, the
+ * vhd_open() error if the file cannot be opened, otherwise the result
+ * of vhd_write_footer().
+ */
+int
+vhd_util_set_field(int argc, char **argv)
+{
+       long value;
+       int err, c;
+       vhd_context_t vhd;
+       char *name, *field, *end;
+
+       err   = -EINVAL;
+       value = 0;
+       name  = NULL;
+       field = NULL;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:f:v:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'f':
+                       field = optarg;
+                       break;
+               case 'v':
+                       /* reject empty, non-numeric and out-of-range input */
+                       errno = 0;
+                       value = strtol(optarg, &end, 10);
+                       if (errno || end == optarg || *end != '\0') {
+                               printf("invalid value %s\n", optarg);
+                               goto usage;
+                       }
+                       /* err doubles as the "-v was supplied" marker */
+                       err   = 0;
+                       break;
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       if (!name || !field || optind != argc || err)
+               goto usage;
+
+       if (strnlen(field, 25) >= 25) {
+               printf("invalid field\n");
+               goto usage;
+       }
+
+       if (strcmp(field, "hidden")) {
+               printf("invalid field %s\n", field);
+               goto usage;
+       }
+
+       if (value < 0 || value > 255) {
+               printf("invalid value %ld\n", value);
+               goto usage;
+       }
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       vhd.footer.hidden = (char)value;
+
+       err = vhd_write_footer(&vhd, &vhd.footer);
+
+       vhd_close(&vhd);
+       return err;
+
+usage:
+       printf("options: <-n name> <-f field> <-v value> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-snapshot.c b/tools/blktap2/vhd/lib/vhd-util-snapshot.c
new file mode 100644 (file)
index 0000000..75960f9
--- /dev/null
@@ -0,0 +1,216 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Find the image a new snapshot of @name should use as its backing file.
+ *
+ * Starting at @name, the chain is followed towards the parents for as
+ * long as the current image is a differencing disk whose BAT has no
+ * block in use.  The walk stops at the first image that is not such an
+ * empty differencing disk, or at a raw parent, in which case
+ * *@parent_raw is set to 1.
+ *
+ * On success *@result holds a heap-allocated path which the caller must
+ * free().  On failure *@result stays NULL and the error of the failing
+ * libvhd call (or -ENOMEM) is returned.
+ */
+static int
+vhd_util_find_snapshot_target(const char *name, char **result, int *parent_raw)
+{
+       int i, err;
+       char *target;
+       vhd_context_t vhd;
+
+       *parent_raw = 0;
+       *result     = NULL;
+
+       target = strdup(name);
+       if (!target)
+               return -ENOMEM;
+
+       for (;;) {
+               err = vhd_open(&vhd, target, VHD_OPEN_RDONLY);
+               if (err) {
+                       /* nothing is open yet, so only the path needs freeing */
+                       free(target);
+                       return err;
+               }
+
+               if (vhd.footer.type != HD_TYPE_DIFF)
+                       goto out;
+
+               err = vhd_get_bat(&vhd);
+               if (err)
+                       goto out;
+
+               /* a block in use means this image must stay in the chain */
+               for (i = 0; i < vhd.bat.entries; i++)
+                       if (vhd.bat.bat[i] != DD_BLK_UNUSED)
+                               goto out;
+
+               /*
+                * Clear the pointer after freeing it: if the lookup fails
+                * without storing a new path, the cleanup at "out" must
+                * not free the old one a second time.
+                */
+               free(target);
+               target = NULL;
+
+               err = vhd_parent_locator_get(&vhd, &target);
+               if (err)
+                       goto out;
+
+               if (vhd_parent_raw(&vhd)) {
+                       *parent_raw = 1;
+                       goto out;
+               }
+
+               vhd_close(&vhd);
+       }
+
+out:
+       vhd_close(&vhd);
+       if (err)
+               free(target);
+       else
+               *result = target;
+
+       return err;
+}
+
+/*
+ * Open @name read-only and store the result of vhd_chain_depth() for it
+ * in *@depth.  Returns the vhd_open() error if the image cannot be
+ * opened, otherwise whatever vhd_chain_depth() returned.
+ */
+static int
+vhd_util_check_depth(const char *name, int *depth)
+{
+       vhd_context_t vhd;
+       int rc;
+
+       rc = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+       if (!rc) {
+               rc = vhd_chain_depth(&vhd, depth);
+               vhd_close(&vhd);
+       }
+
+       return rc;
+}
+
+/*
+ * Entry point of the "snapshot" command: create VHD @name backed by an
+ * existing parent.
+ *
+ * Options: -n <name>, -p <parent name>, -l <snapshot depth limit>,
+ * -m (parent is raw), -h (help).  -n and -p are required.
+ *
+ * Unless the parent is declared raw, the backing image is picked by
+ * vhd_util_find_snapshot_target(); when it differs from the supplied
+ * parent, the snapshot takes the size of the supplied parent.  With a
+ * non-zero limit and a non-raw parent, the request is refused with
+ * -ENOSPC if the new chain would be deeper than the limit.
+ *
+ * Returns 0 on success or after -h, -EINVAL for bad usage, -errno if the
+ * parent path cannot be resolved, otherwise the error of the failing
+ * step.
+ */
+int
+vhd_util_snapshot(int argc, char **argv)
+{
+       vhd_flag_creat_t flags = 0;
+       vhd_context_t vhd;
+       uint64_t size = 0;
+       char *name = NULL, *pname = NULL;
+       char *ppath = NULL, *backing = NULL;
+       int c, err, prt_raw, limit = 0;
+
+       if (!argc || !argv) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       optind = 0;
+       for (;;) {
+               c = getopt(argc, argv, "n:p:l:mh");
+               if (c == -1)
+                       break;
+
+               if (c == 'n') {
+                       name = optarg;
+               } else if (c == 'p') {
+                       pname = optarg;
+               } else if (c == 'l') {
+                       limit = strtol(optarg, NULL, 10);
+               } else if (c == 'm') {
+                       vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW);
+               } else if (c == 'h') {
+                       err = 0;
+                       goto usage;
+               } else {
+                       err = -EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (!name || !pname || optind != argc) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       ppath = realpath(pname, NULL);
+       if (!ppath)
+               return -errno;
+
+       if (!vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW)) {
+               err = vhd_util_find_snapshot_target(ppath, &backing, &prt_raw);
+               if (err) {
+                       backing = NULL;
+                       goto out;
+               }
+
+               /*
+                * The chain may hold images of different sizes; when the
+                * backing image is not the supplied parent, use the
+                * supplied parent's size.
+                */
+               if (strcmp(ppath, backing) != 0) {
+                       err = vhd_open(&vhd, ppath, VHD_OPEN_RDONLY);
+                       if (err)
+                               goto out;
+                       size = vhd.footer.curr_size;
+                       vhd_close(&vhd);
+               }
+
+               if (prt_raw)
+                       vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW);
+       } else {
+               backing = strdup(ppath);
+               if (!backing) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+       }
+
+       if (limit && !vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW)) {
+               int depth;
+
+               err = vhd_util_check_depth(backing, &depth);
+               if (err) {
+                       printf("error checking snapshot depth: %d\n", err);
+                       goto out;
+               }
+
+               if (depth + 1 > limit) {
+                       err = -ENOSPC;
+                       printf("snapshot depth exceeded: "
+                              "current depth: %d, limit: %d\n", depth, limit);
+                       goto out;
+               }
+       }
+
+       err = vhd_snapshot(name, size, backing, flags);
+
+out:
+       free(ppath);
+       free(backing);
+
+       return err;
+
+usage:
+       printf("options: <-n name> <-p parent name> [-l snapshot depth limit]"
+              " [-m parent_is_raw] [-h help]\n");
+       return err;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-uuid.c b/tools/blktap2/vhd/lib/vhd-util-uuid.c
new file mode 100644 (file)
index 0000000..7326c4e
--- /dev/null
@@ -0,0 +1,128 @@
+ /* Copyright (c) 2008, XenSource Inc.
+ * Copyright (c) 2011, Citrix
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#if defined(__linux__)
+
+#include <uuid/uuid.h>
+
+/* Linux: thin wrappers around libuuid, with the uuid_t held in a struct. */
+typedef struct {
+    uuid_t uuid;
+} vhd_uuid_t;
+
+/* Returns the result of uuid_is_null() for @uuid. */
+int vhd_uuid_is_nil(vhd_uuid_t *uuid)
+{
+       return uuid_is_null(uuid->uuid);
+}
+
+/* Fill @uuid with a newly generated UUID. */
+void vhd_uuid_generate(vhd_uuid_t *uuid)
+{
+       uuid_generate(uuid->uuid);
+}
+
+/*
+ * Write the textual form of @uuid into @out, which holds @size bytes.
+ * The text is truncated to fit and always NUL-terminated; nothing is
+ * written when @size is 0.
+ */
+void vhd_uuid_to_string(vhd_uuid_t *uuid, char *out, size_t size)
+{
+       /* uuid_unparse() emits 36 characters plus the terminating NUL */
+       char text[37];
+       size_t i;
+
+       if (!size)
+               return;
+
+       uuid_unparse(uuid->uuid, text);
+
+       /* bounded copy: uuid_unparse() itself has no way to honour @size */
+       for (i = 0; i + 1 < size && text[i] != '\0'; i++)
+               out[i] = text[i];
+       out[i] = '\0';
+}
+
+/* Parse the string @in into @uuid; the parse result is not reported. */
+void vhd_uuid_from_string(vhd_uuid_t *uuid, const char *in)
+{
+       uuid_parse(in, uuid->uuid);
+}
+
+/* Copy @src into @dst. */
+void vhd_uuid_copy(vhd_uuid_t *dst, vhd_uuid_t *src)
+{
+       uuid_copy(dst->uuid, src->uuid);
+}
+
+/* Reset @uuid with uuid_clear(). */
+void vhd_uuid_clear(vhd_uuid_t *uuid)
+{
+       uuid_clear(uuid->uuid);
+}
+
+/* Returns the result of uuid_compare() for @uuid1 and @uuid2. */
+int vhd_uuid_compare(vhd_uuid_t *uuid1, vhd_uuid_t *uuid2)
+{
+       return uuid_compare(uuid1->uuid, uuid2->uuid);
+}
+
+#elif defined(__NetBSD__)
+
+#include <uuid.h>
+#include <string.h>
+#include <stdlib.h>
+
+/* NetBSD: wrappers around the system uuid(3) routines. */
+typedef uuid_t vhd_uuid_t;
+
+/* Returns the result of uuid_is_nil() for @uuid; the status is ignored. */
+int vhd_uuid_is_nil(vhd_uuid_t *uuid)
+{
+       uint32_t status;
+       return uuid_is_nil((uuid_t *)uuid, &status);
+}
+
+/* Fill @uuid using uuid_create(); the status is ignored. */
+void vhd_uuid_generate(vhd_uuid_t *uuid)
+{
+       uint32_t status;
+       uuid_create((uuid_t *)uuid, &status);
+}
+
+/*
+ * Write the textual form of @uuid into @out, which holds @size bytes,
+ * truncating as strlcpy() does.  If the conversion yields no string,
+ * @out is set to the empty string (when @size is non-zero).
+ */
+void vhd_uuid_to_string(vhd_uuid_t *uuid, char *out, size_t size)
+{
+       uint32_t status;
+       char *_out = NULL;
+
+       uuid_to_string((uuid_t *)uuid, &_out, &status);
+       if (!_out) {
+               /* no string was produced; never hand NULL to strlcpy() */
+               if (size)
+                       out[0] = '\0';
+               return;
+       }
+
+       strlcpy(out, _out, size);
+       free(_out);
+}
+
+/* Parse the string @in into @uuid; the status is ignored. */
+void vhd_uuid_from_string(vhd_uuid_t *uuid, const char *in)
+{
+       uint32_t status;
+       uuid_from_string(in, (uuid_t *)uuid, &status);
+}
+
+/* Copy @src into @dst byte for byte. */
+void vhd_uuid_copy(vhd_uuid_t *dst, vhd_uuid_t *src)
+{
+       memcpy((uuid_t *)dst, (uuid_t *)src, sizeof(uuid_t));
+}
+
+/* Set every byte of @uuid to zero. */
+void vhd_uuid_clear(vhd_uuid_t *uuid)
+{
+       memset((uuid_t *)uuid, 0, sizeof(uuid_t));
+}
+
+/* Returns the result of uuid_compare(); the status is ignored. */
+int vhd_uuid_compare(vhd_uuid_t *uuid1, vhd_uuid_t *uuid2)
+{
+       uint32_t status;
+       return uuid_compare((uuid_t *)uuid1, (uuid_t *)uuid2, &status);
+}
+
+#else
+
+#error "Please update vhd-util-uuid.c for your OS"
+
+#endif
diff --git a/tools/blktap2/vhd/vhd-update.c b/tools/blktap2/vhd/vhd-update.c
new file mode 100644 (file)
index 0000000..4621a81
--- /dev/null
@@ -0,0 +1,259 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Before updating a VHD file, we create a journal consisting of:
+ *   - all data at the beginning of the file, up to and including the BAT
+ *   - each allocated bitmap (existing at the same offset in the journal as
+ *                            its corresponding bitmap in the original file)
+ * Updates are performed in place by writing appropriately 
+ * transformed versions of journaled bitmaps to the original file.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "atomicio.h"
+#include "libvhd.h"
+#include "libvhd-journal.h"
+
+/*
+ * Print the command-line synopsis on stdout and terminate the process
+ * with EINVAL as its exit status.  Never returns.
+ */
+static void
+usage(void)
+{
+       fputs("usage: vhd-update <-n name> [-j existing journal] [-h]\n",
+             stdout);
+       exit(EINVAL);
+}
+
+/*
+ * update vhd creator version to reflect its new bitmap ordering
+ *
+ * Sets the in-memory footer's crtr_ver to VHD_VERSION(1, 1) and writes
+ * that footer back through vhd_write_footer().  Returns the result of
+ * vhd_write_footer().
+ */
+static inline int
+update_creator_version(vhd_journal_t *journal)
+{
+       journal->vhd.footer.crtr_ver = VHD_VERSION(1, 1);
+       return vhd_write_footer(&journal->vhd, &journal->vhd.footer);
+}
+
+/*
+ * Add every BAT entry's block to the journal, requesting
+ * VHD_JOURNAL_METADATA for each one.  Stops at the first failure and
+ * returns that error; returns 0 once all entries have been added.
+ */
+static int
+journal_bitmaps(vhd_journal_t *journal)
+{
+       int blk = 0;
+
+       while (blk < journal->vhd.bat.entries) {
+               int rc;
+
+               rc = vhd_journal_add_block(journal, blk, VHD_JOURNAL_METADATA);
+               if (rc)
+                       return rc;
+
+               blk++;
+       }
+
+       return 0;
+}
+
+/*
+ * older VHD bitmaps were little endian
+ * and bits within a word were set from right to left
+ *
+ * Returns 1 if bit @nr of the old-format bitmap at @addr is set, else 0.
+ * With a little-endian layout, bit @nr always lives in byte nr / 8 at
+ * position nr % 8 counted from the least significant bit.  Reading the
+ * bitmap one byte at a time yields the same answer as a word-sized load
+ * on a little-endian host, but is also correct on big-endian hosts and
+ * places no alignment requirement on @addr.
+ */
+static inline int
+old_test_bit(int nr, volatile void * addr)
+{
+       const volatile unsigned char *bytes = addr;
+
+       return (bytes[nr >> 3] >> (nr & 7)) & 1;
+}
+
+/*
+ * new VHD bitmaps are big endian
+ * and bits within a word are set from left to right
+ *
+ * Sets bit @nr in the byte array @addr: byte nr / 8, counting bit
+ * positions from the most significant bit (BIT_MASK) downwards.
+ */
+#define BIT_MASK 0x80
+static inline void
+new_set_bit (int nr, volatile char *addr)
+{
+       int byte  = nr >> 3;
+       int shift = nr & 7;
+
+       addr[byte] = addr[byte] | (BIT_MASK >> shift);
+}
+
+/*
+ * Translate an old-format bitmap into the new format.
+ *
+ * @out is zeroed over @bytes bytes, then every bit that old_test_bit()
+ * reports as set in @in is set at the same bit number in @out via
+ * new_set_bit().
+ */
+static void
+convert_bitmap(char *in, char *out, int bytes)
+{
+       int bit;
+       int nbits = bytes << 3;
+
+       memset(out, 0, bytes);
+
+       for (bit = 0; bit < nbits; bit++) {
+               if (!old_test_bit(bit, (void *)in))
+                       continue;
+
+               new_set_bit(bit, out);
+       }
+}
+
+/*
+ * Rewrite the bitmap of every allocated block in place.
+ *
+ * For each BAT entry that is not DD_BLK_UNUSED the bitmap is read with
+ * vhd_read_bitmap(), then either copied verbatim (@rollback set) or run
+ * through convert_bitmap(), and finally written back with
+ * vhd_write_bitmap().  The scratch buffer is 512-byte aligned and sized
+ * by vhd_bytes_padded(spb / 8).
+ *
+ * Returns 0 on success, or the first error from posix_memalign(),
+ * vhd_read_bitmap() or vhd_write_bitmap().
+ */
+static int
+update_vhd(vhd_journal_t *journal, int rollback)
+{
+       char *raw = NULL, *fixed = NULL;
+       size_t len;
+       int blk, rc;
+
+       len = vhd_bytes_padded(journal->vhd.spb / 8);
+       rc  = posix_memalign((void **)&fixed, 512, len);
+       if (rc)
+               return rc;
+
+       /*
+        * rc is 0 on entry and after every successful write, so it is
+        * also 0 whenever the loop runs to completion.
+        */
+       for (blk = 0; blk < journal->vhd.bat.entries; blk++) {
+               if (journal->vhd.bat.bat[blk] == DD_BLK_UNUSED)
+                       continue;
+
+               rc = vhd_read_bitmap(&journal->vhd, blk, &raw);
+               if (rc)
+                       break;
+
+               if (!rollback)
+                       convert_bitmap(raw, fixed, len);
+               else
+                       memcpy(fixed, raw, len);
+
+               free(raw);
+
+               rc = vhd_write_bitmap(&journal->vhd, blk, fixed);
+               if (rc)
+                       break;
+       }
+
+       free(fixed);
+       return rc;
+}
+
+/*
+ * Create the journal for @file (journal path @jfile) through
+ * vhd_journal_create().  On failure a message naming @file and the error
+ * code is printed.  Returns vhd_journal_create()'s result.
+ */
+static int
+open_journal(vhd_journal_t *journal, const char *file, const char *jfile)
+{
+       int rc = vhd_journal_create(journal, file, jfile);
+
+       if (rc)
+               printf("error creating journal for %s: %d\n", file, rc);
+
+       return rc;
+}
+
+/*
+ * Finish with the journal according to the outcome of the update.
+ *
+ * A non-zero @err reverts the journal, zero commits it.  If that step
+ * itself fails the journal is only closed; otherwise it is removed.
+ * Returns the result of the final close/remove call.
+ */
+static int
+close_journal(vhd_journal_t *journal, int err)
+{
+       int rc;
+
+       rc = err ? vhd_journal_revert(journal) : vhd_journal_commit(journal);
+
+       return rc ? vhd_journal_close(journal) : vhd_journal_remove(journal);
+}
+
+/*
+ * vhd-update entry point.
+ *
+ * Options:
+ *   -n name     VHD file to update (required)
+ *   -j journal  journal file; must be readable
+ *   -r          debugging rollback: push journalled bitmaps back without
+ *               transforming them (requires -j)
+ *   -h          (or anything else) print usage and exit
+ *
+ * After the journal is opened, the VHD is only touched when it was
+ * created by tapdisk, carries creator version 0.1 and is not a fixed
+ * disk; otherwise the journal is closed with no error.  The update
+ * journals all bitmaps, rewrites them, then bumps the creator version.
+ * The process result is whatever close_journal() returns, except for the
+ * early exits taken before any change was made to the VHD.
+ */
+int
+main(int argc, char **argv)
+{
+       char *file, *jfile;
+       int c, err, rollback;
+       vhd_journal_t journal;
+
+       file     = NULL;
+       jfile    = NULL;
+       rollback = 0;
+
+       while ((c = getopt(argc, argv, "n:j:rh")) != -1) {
+               switch(c) {
+               case 'n':
+                       file = optarg;
+                       break;
+               case 'j':
+                       jfile = optarg;
+                       err = access(jfile, R_OK);
+                       if (err == -1) {
+                               /* capture errno before printf() can
+                                * overwrite it */
+                               err = errno;
+                               printf("invalid journal arg %s\n", jfile);
+                               return -err;
+                       }
+                       break;
+               case 'r':
+                       /* add a rollback option for debugging which
+                        * pushes journalled bitmaps to original file
+                        * without transforming them */
+                       rollback = 1;
+                       break;
+               default:
+                       /* usage() does not return */
+                       usage();
+               }
+       }
+
+       if (!file)
+               usage();
+
+       if (rollback && !jfile) {
+               printf("rollback requires a journal argument\n");
+               usage();
+       }
+
+       err = open_journal(&journal, file, jfile);
+       if (err)
+               return err;
+
+       /* nothing to convert: fall through to close_journal() with err 0 */
+       if (!vhd_creator_tapdisk(&journal.vhd) ||
+           journal.vhd.footer.crtr_ver != VHD_VERSION(0, 1) ||
+           journal.vhd.footer.type == HD_TYPE_FIXED) {
+               err = 0;
+               goto out;
+       }
+
+       err = journal_bitmaps(&journal);
+       if (err) {
+               /* no changes to vhd file yet,
+                * so close the journal and bail */
+               vhd_journal_close(&journal);
+               return err;
+       }
+
+       err = update_vhd(&journal, rollback);
+       if (err) {
+               printf("update failed: %d; saving journal\n", err);
+               goto out;
+       }
+
+       err = update_creator_version(&journal);
+       if (err) {
+               printf("failed to update creator version: %d\n", err);
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       err = close_journal(&journal, err);
+       return err;
+}
diff --git a/tools/blktap2/vhd/vhd-util.c b/tools/blktap2/vhd/vhd-util.c
new file mode 100644 (file)
index 0000000..13f1835
--- /dev/null
@@ -0,0 +1,163 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <langinfo.h>
+#include <locale.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+/*
+ * Debug print helper.  With the "#if 1" branch active every DFPRINTF()
+ * forwards its format and arguments to fprintf(stdout, ...); switching
+ * the condition to 0 compiles all such calls down to a no-op.
+ * The "_a..." / "##_a" spelling is the GNU named-variadic macro form.
+ */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf(stdout, _f , ##_a)
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* Handler signature for a sub-command: invoked as func(argc, argv). */
+typedef int (*vhd_util_func_t) (int, char **);
+
+/* One entry of the sub-command table: the command word and its handler. */
+struct command {
+       char               *name;   /* word matched against argv[1] */
+       vhd_util_func_t     func;   /* function run for this command */
+};
+
+/*
+ * Table of supported sub-commands.  get_command() searches it by name
+ * and print_commands() lists the names in this order.
+ */
+struct command commands[] = {
+       { .name = "create",      .func = vhd_util_create        },
+       { .name = "snapshot",    .func = vhd_util_snapshot      },
+       { .name = "query",       .func = vhd_util_query         },
+       { .name = "read",        .func = vhd_util_read          },
+       { .name = "set",         .func = vhd_util_set_field     },
+       { .name = "repair",      .func = vhd_util_repair        },
+       { .name = "resize",      .func = vhd_util_resize        },
+       { .name = "fill",        .func = vhd_util_fill          },
+       { .name = "coalesce",    .func = vhd_util_coalesce      },
+       { .name = "modify",      .func = vhd_util_modify        },
+       { .name = "scan",        .func = vhd_util_scan          },
+       { .name = "check",       .func = vhd_util_check         },
+       { .name = "revert",      .func = vhd_util_revert        },
+};
+
+/*
+ * Print "COMMAND := { a | b | ... }" on stdout, listing every entry of
+ * commands[] in table order.
+ */
+#define print_commands()                                               \
+       do {                                                            \
+               int total = sizeof(commands) / sizeof(commands[0]);     \
+               int idx   = 1;                                          \
+               printf("COMMAND := { ");                                \
+               printf("%s", commands[0].name);                         \
+               while (idx < total) {                                   \
+                       printf(" | %s", commands[idx].name);            \
+                       idx++;                                          \
+               }                                                       \
+               printf(" }\n");                                         \
+       } while (0)
+
+/*
+ * NOTE(review): macro defined outside this file; presumably it declares
+ * the failure-injection variables (TEST_FAIL, ENV_VAR_FAIL) that main()
+ * uses under ENABLE_FAILURE_TESTING -- confirm against its definition.
+ */
+TEST_FAIL_EXTERN_VARS;
+
+/*
+ * Print the usage line followed by the list of sub-commands on stdout,
+ * then terminate the process with exit status 0.  Never returns.
+ */
+void
+help(void)
+{
+       fputs("usage: vhd-util COMMAND [OPTIONS]\n", stdout);
+       print_commands();
+       exit(0);
+}
+
+/*
+ * Look up @command in the commands[] table.
+ *
+ * Names of 25 characters or more are rejected without searching.
+ * Returns a pointer to the matching table entry, or NULL when no entry
+ * has exactly that name.
+ */
+struct command *
+get_command(char *command)
+{
+       struct command *cur, *end;
+
+       if (strnlen(command, 25) >= 25)
+               return NULL;
+
+       end = commands + sizeof(commands) / sizeof(commands[0]);
+
+       for (cur = commands; cur != end; cur++) {
+               if (strcmp(command, cur->name) == 0)
+                       return cur;
+       }
+
+       return NULL;
+}
+
+/*
+ * vhd-util entry point: dispatch to the sub-command named by argv[1].
+ *
+ * A fresh argument vector is built for the sub-command: element 0 is the
+ * command name from the table, followed by the remaining arguments with
+ * every "--debug" removed (each occurrence raises the libvhd log level
+ * to 1 instead).  The vector is NULL-terminated like a regular argv so
+ * handlers that walk it the way main() would are safe.
+ *
+ * Returns the absolute value of the handler's result.
+ *
+ * NOTE: help() always exits with status 0, so an invalid COMMAND still
+ * yields a zero exit status after the error message is printed.
+ */
+int
+main(int argc, char *argv[])
+{
+       char **cargv;
+       struct command *cmd;
+       int cargc, i, cnt, ret;
+
+#ifdef CORE_DUMP
+       #include <sys/resource.h>
+       struct rlimit rlim;
+       rlim.rlim_cur = RLIM_INFINITY;
+       rlim.rlim_max = RLIM_INFINITY;
+       if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+               fprintf(stderr, "setrlimit failed: %d\n", errno);
+#endif
+       setlocale(LC_CTYPE, "");
+
+       ret = 0;
+
+       if (argc < 2)
+               help();
+
+       cargc = argc - 1;
+       cmd   = get_command(argv[1]);
+       if (!cmd) {
+               fprintf(stderr, "invalid COMMAND %s\n", argv[1]);
+               help();
+       }
+
+       /* one extra slot for the terminating NULL pointer */
+       cargv = calloc((size_t)cargc + 1, sizeof(char *));
+       if (!cargv)
+               exit(ENOMEM);
+
+       cnt      = 1;
+       cargv[0] = cmd->name;
+       for (i = 1; i < cargc; i++) {
+               /* cargc == argc - 1, so this walks argv[2..argc-1] */
+               char *arg = argv[i + (argc - cargc)];
+
+               if (!strcmp(arg, "--debug")) {
+                       libvhd_set_log_level(1);
+                       continue;
+               }
+
+               cargv[cnt++] = arg;
+       }
+       /* cnt <= cargc here, so the terminator stays inside the allocation */
+       cargv[cnt] = NULL;
+
+#ifdef ENABLE_FAILURE_TESTING
+       for (i = 0; i < NUM_FAIL_TESTS; i++) {
+               TEST_FAIL[i] = 0;
+               if (getenv(ENV_VAR_FAIL[i]))
+                       TEST_FAIL[i] = 1;
+       }
+#endif // ENABLE_FAILURE_TESTING
+
+       ret = cmd->func(cnt, cargv);
+
+       free(cargv);
+
+       return (ret >= 0 ? ret : -ret);
+}