--- /dev/null
+# Top-level Makefile for the blktap2 userspace tools: it only selects the
+# sub-directories to build and forwards the standard targets to them.
+XEN_ROOT = $(CURDIR)/../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Compile and link flags for libxenctrl (variables presumably provided by
+# Rules.mk -- TODO confirm).
+CFLAGS += $(CFLAGS_libxenctrl)
+LDLIBS += $(LDLIBS_libxenctrl)
+
+# Sub-directories to descend into. "drivers" and "control" are appended to
+# SUBDIRS-y only when $(CONFIG_Linux) expands to "y".
+SUBDIRS-y :=
+SUBDIRS-y += include
+SUBDIRS-y += lvm
+SUBDIRS-y += vhd
+SUBDIRS-$(CONFIG_Linux) += drivers
+SUBDIRS-$(CONFIG_Linux) += control
+
+# Remove build products and editor backups left in this directory.
+clean:
+ rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) TAGS
+
+distclean: clean
+
+# Static pattern rule: each listed target additionally depends on
+# subdirs-<target> (defined outside this file, presumably in Rules.mk, to
+# recurse into the SUBDIRS-y directories).
+.PHONY: all clean install distclean
+all clean install distclean: %: subdirs-%
--- /dev/null
+Blktap2 Userspace Tools + Library
+================================
+
+Dutch Meyer
+4th June 2009
+
+Andrew Warfield and Julian Chesterfield
+16th June 2006
+
+
+The blktap2 userspace toolkit provides a user-level disk I/O
+interface. The blktap2 mechanism involves a kernel driver that acts
+similarly to the existing Xen/Linux blkback driver, and a set of
+associated user-level libraries. Using these tools, blktap2 allows
+virtual block devices presented to VMs to be implemented in userspace
+and to be backed by raw partitions, files, network, etc.
+
+The key benefit of blktap2 is that it makes it easy and fast to write
+arbitrary block backends, and that these user-level backends actually
+perform very well. Specifically:
+
+- Metadata disk formats such as Copy-on-Write, encrypted disks, sparse
+ formats and other compression features can be easily implemented.
+
+- Accessing file-based images from userspace avoids problems related
+ to flushing dirty pages which are present in the Linux loopback
+ driver. (Specifically, doing a large number of writes to an
+ NFS-backed image doesn't result in the OOM killer going berserk.)
+
+- Per-disk handler processes enable easier userspace policing of block
+ resources, and process-granularity QoS techniques (disk scheduling
+ and related tools) may be trivially applied to block devices.
+
+- It's very easy to take advantage of userspace facilities such as
+ networking libraries, compression utilities, peer-to-peer
+ file-sharing systems and so on to build more complex block backends.
+
+- Crashes are contained -- incremental development/debugging is very
+ fast.
+
+How it works (in one paragraph):
+
+Working in conjunction with the kernel blktap2 driver, all disk I/O
+requests from VMs are passed to the userspace daemon (using a shared
+memory interface) through a character device. Each active disk is
+mapped to an individual device node, allowing per-disk processes to
+implement individual block devices where desired. The userspace
+drivers are implemented using asynchronous (Linux libaio),
+O_DIRECT-based calls to preserve the unbuffered, batched and
+asynchronous request dispatch achieved with the existing blkback
+code. We provide a simple, asynchronous virtual disk interface that
+makes it quite easy to add new disk implementations.
+
+As of June 2009 the current supported disk formats are:
+
+ - Raw Images (both on partitions and in image files)
+ - Fast sharable RAM disk between VMs (requires some form of
+ cluster-based filesystem support e.g. OCFS2 in the guest kernel)
+ - VHD, including snapshots and sparse images
+ - Qcow, including snapshots and sparse images
+
+
+Build and Installation Instructions
+===================================
+
+Make sure to configure the blktap2 backend driver in your dom0 kernel. It
+will inter-operate with the existing backend and frontend drivers. It
+will also cohabitate with the original blktap driver. However, some
+formats (currently aio and qcow) will default to their blktap2
+versions when specified in a vm configuration file.
+
+To build the tools separately, "make && make install" in
+tools/blktap2.
+
+
+Using the Tools
+===============
+
+Preparing an image for boot:
+
+The userspace disk agent is configured to start automatically via xend
+
+Customize the VM config file to use the 'tap:tapdisk' handler,
+followed by the driver type. e.g. for a raw image such as a file or
+partition:
+
+disk = ['tap:tapdisk:aio:<FILENAME>,sda1,w']
+
+Alternatively, the vhd-util tool (installed with make install, or in
+/blktap2/vhd) can be used to build sparse copy-on-write vhd images.
+
+For example, to build a sparse image -
+ vhd-util create -n MyVHDFile -s 1024
+
+This creates a sparse 1GB file named "MyVHDFile" that can be mounted
+and populated with data.
+
+One can also base the image on a raw file -
+ vhd-util snapshot -n MyVHDFile -p SomeRawFile -m
+
+This creates a sparse VHD file named "MyVHDFile" using "SomeRawFile"
+as a parent image. Copy-on-write semantics ensure that writes will be
+stored in "MyVHDFile" while reads will be directed to the most
+recently written version of the data, either in "MyVHDFile" or
+"SomeRawFile" as is appropriate. Other options exist as well, consult
+the vhd-util application for the complete set of VHD tools.
+
+VHD files can be mounted automatically in a guest similarly to the
+above AIO example simply by specifying the vhd driver.
+
+disk = ['tap:tapdisk:vhd:<VHD FILENAME>,sda1,w']
+
+
+Snapshots:
+
+Pausing a guest will also plug the corresponding IO queue for blktap2
+devices and stop blktap2 drivers. This can be used to implement a
+safe live snapshot of qcow and vhd disks. An example script "xmsnap"
+is shown in the tools/blktap2/drivers directory. This script will
+perform a live snapshot of a qcow disk. VHD files can use the
+"vhd-util snapshot" tool discussed above. If this snapshot command is
+applied to a raw file mounted with tap:tapdisk:AIO, include the -m
+flag and the driver will be reloaded as VHD. If applied to an already
+mounted VHD file, omit the -m flag.
+
+
+Mounting images in Dom0 using the blktap2 driver
+===============================================
+Tap (and blkback) disks are also mountable in Dom0 without requiring an
+active VM to attach.
+
+The syntax is -
+ tapdisk2 -n <type>:<full path to file>
+
+For example -
+ tapdisk2 -n aio:/home/images/rawFile.img
+
+When successful the location of the new device will be provided by
+tapdisk2 to stdout and tapdisk2 will terminate. From that point
+forward control of the device is provided through sysfs in the
+directory-
+
+ /sys/class/blktap2/blktap#/
+
+Where # is a blktap2 device number present in the path that tapdisk2
+printed before terminating. The sysfs interface is largely intuitive,
+for example, to remove tap device 0 one would-
+
+ echo 1 > /sys/class/blktap2/blktap0/remove
+
+Similarly, a pause control is available, which can be used to plug
+the request queue of a live running guest.
+
+Previous versions of blktap mounted devices in dom0 by using blkfront
+in dom0 and the xm block-attach command. This approach is still
+available, though slightly more cumbersome.
+
+
+Tapdisk Development
+===============================================
+
+People regularly ask how to develop their own tapdisk drivers, and
+while it has not yet been well documented, the process is relatively
+easy. Here I will provide a brief overview. The best reference, of
+course, comes from the existing drivers. Specifically,
+blktap2/drivers/block-ram.c and blktap2/drivers/block-aio.c provide
+the clearest examples of simple drivers.
+
+
+Setup:
+
+First you need to register your new driver with blktap. This is done
+in disktypes.h. There are five things that you must do. To
+demonstrate, I will create a disk called "mynewdisk", you can name
+yours freely.
+
+1) Forward declare an instance of struct tap_disk.
+
+e.g. -
+ extern struct tap_disk tapdisk_mynewdisk;
+
+2) Claim one of the unused disk type numbers, take care to observe the
+MAX_DISK_TYPES macro, increasing the number if necessary.
+
+e.g. -
+ #define DISK_TYPE_MYNEWDISK 10
+
+3) Create an instance of disk_info_t. The bulk of this file contains examples of these.
+
+e.g. -
+ static disk_info_t mynewdisk_disk = {
+ DISK_TYPE_MYNEWDISK,
+ "My New Disk (mynewdisk)",
+ "mynewdisk",
+ 0,
+ #ifdef TAPDISK
+ &tapdisk_mynewdisk,
+ #endif
+ };
+
+A few words about what these mean. The first field must be the disk
+type number you claimed in step (2). The second field is a string
+describing your disk, and may contain any relevant info. The third
+field is the name of your disk as will be used by the tapdisk2 utility
+and xend (for example tapdisk2 -n mynewdisk:/path/to/disk.image, or in
+your xm create config file). The fourth is binary and determines
+whether you will have one instance of your driver, or many. Here, a 1
+means that your driver is a singleton and will coordinate access to
+any number of tap devices. 0 is more common, meaning that you will
+have one driver for each device that is created. The final field
+should contain a reference to the struct tap_disk you created in step
+(1).
+
+4) Add a reference to your disk info structure (from step (3)) to the
+dtypes array. Take care here - you need to place it in the position
+corresponding to the device type number you claimed in step (2). So
+we would place &mynewdisk_disk in dtypes[10]. Look at the other
+devices in this array and pad with "&null_disk," as necessary.
+
+5) Modify the xend python scripts. You need to add your disk name to
+the list of disks that xend recognizes.
+
+edit:
+ tools/python/xen/xend/server/BlktapController.py
+
+And add your disk to the "blktap_disk_types" array near the top of
+your file. Use the same name you specified in the third field of step
+(3). The order of this list is not important.
+
+
+Now your driver is ready to be written. Create a block-mynewdisk.c in
+tools/blktap2/drivers and add it to the Makefile.
+
+
+Development:
+
+Copying block-aio.c and block-ram.c would be a good place to start.
+Read those files as you go through this, I will be assisting by
+commenting on a few useful functions and structures.
+
+struct tap_disk:
+
+Remember the forward declaration in step (1) of the setup phase above?
+Now is the time to make that structure a reality. This structure
+contains a list of function pointers for all the routines that will be
+asked of your driver. Currently the required functions are open,
+close, read, write, get_parent_id, validate_parent, and debug.
+
+e.g. -
+ struct tap_disk tapdisk_mynewdisk = {
+ .disk_type = "tapdisk_mynewdisk",
+ .flags = 0,
+ .private_data_size = sizeof(struct tdmynewdisk_state),
+ .td_open = tdmynewdisk_open,
+ ....
+
+The private_data_size field is used to provide a structure to store
+the state of your device. It is very likely that you will want
+something here, but you are free to design whatever structure you
+want. Blktap will allocate this space for you, you just need to tell
+it how much space you want.
+
+
+tdmynewdisk_open:
+
+This is the open routine. The first argument is a structure
+representing your driver. Two fields in this structure are
+interesting.
+
+driver->data will contain a block of memory of the size you requested
+in the .private_data_size field of your struct tap_disk (above).
+
+driver->info contains a structure that details information about your
+disk. You need to fill this out. By convention this is done with a
+_get_image_info() function. Assign a size (the total number of
+sectors), sector_size (the size of each sector in bytes), and set
+driver->info->info to 0.
+
+The second parameter contains the name that was specified in the
+creation of your device, either through xend, or on the command line
+with tapdisk2. Usually this specifies a file that you will open in
+this routine. The final parameter, flags, contains one of a number of
+flags specified in tapdisk.h that may change the way you treat the
+disk.
+
+
+_queue_read/write:
+
+These are your read and write operations. What you do here will
+depend on your disk, but you should do exactly one of-
+
+1) call td_complete_request with either error or success code.
+
+2) Call td_forward_request, which will forward the request to the next
+driver in the stack.
+
+3) Queue the request for asynchronous processing with
+td_prep_read/write. In doing so, you will also register a callback
+for request completion. When the request completes you must do one of
+options (1) or (2) above. Finally, call td_queue_tiocb to submit the
+request to a wait queue.
+
+The above functions are defined in tapdisk-interface.c. If you don't
+use them as specified you will run into problems as your driver will
+fail to inform blktap of the state of requests that have been
+submitted. Blktap keeps track of all requests and does not like losing track.
+
+
+_close, _get_parent_id, _validate_parent:
+
+These last few tend to be very routine. _close is called when the
+device is closed, and also when it is paused (in this case, open will
+also be called later). The other functions are used in stacking
+drivers. Most often drivers will return TD_NO_PARENT and -EINVAL,
+respectively.
+
+
+
+
+
+
--- /dev/null
+# Builds the blktap2 control library (static and shared) and the tap-ctl
+# command-line tool that links against it.
+XEN_ROOT := $(CURDIR)/../../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Shared-library versioning: the soname is $(LIBNAME).so.$(MAJOR); the
+# real file on disk appends .$(MINOR).
+MAJOR = 1.0
+MINOR = 0
+LIBNAME = libblktapctl
+LIBSONAME = $(LIBNAME).so.$(MAJOR)
+
+# Binary installed into $(sbindir).
+IBIN = tap-ctl
+
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+CFLAGS += -I../include -I../drivers
+CFLAGS += $(CFLAGS_xeninclude)
+CFLAGS += $(CFLAGS_libxenctrl)
+CFLAGS += -D_GNU_SOURCE
+CFLAGS += -DTAPCTL
+
+# Objects that make up the control library.
+CTL_OBJS := tap-ctl-ipc.o
+CTL_OBJS += tap-ctl-list.o
+CTL_OBJS += tap-ctl-allocate.o
+CTL_OBJS += tap-ctl-free.o
+CTL_OBJS += tap-ctl-create.o
+CTL_OBJS += tap-ctl-destroy.o
+CTL_OBJS += tap-ctl-spawn.o
+CTL_OBJS += tap-ctl-attach.o
+CTL_OBJS += tap-ctl-detach.o
+CTL_OBJS += tap-ctl-open.o
+CTL_OBJS += tap-ctl-close.o
+CTL_OBJS += tap-ctl-pause.o
+CTL_OBJS += tap-ctl-unpause.o
+CTL_OBJS += tap-ctl-major.o
+CTL_OBJS += tap-ctl-check.o
+
+# Position-independent counterparts used for the shared library.
+CTL_PICS = $(patsubst %.o,%.opic,$(CTL_OBJS))
+
+OBJS = $(CTL_OBJS) tap-ctl.o
+PICS = $(CTL_PICS)
+
+LIB_STATIC = $(LIBNAME).a
+LIB_SHARED = $(LIBSONAME).$(MINOR)
+
+all: build
+
+build: $(IBIN) $(LIB_STATIC) $(LIB_SHARED)
+
+# Development symlink chain: lib.so -> lib.so.MAJOR -> lib.so.MAJOR.MINOR
+$(LIBNAME).so: $(LIBSONAME)
+	ln -sf $< $@
+
+$(LIBSONAME): $(LIB_SHARED)
+	ln -sf $< $@
+
+tap-ctl: tap-ctl.o $(LIBNAME).so
+	$(CC) $(LDFLAGS) -o $@ $^ $(APPEND_LDFLAGS)
+
+$(LIB_STATIC): $(CTL_OBJS)
+	$(AR) r $@ $^
+
+$(LIB_SHARED): $(CTL_PICS)
+	$(CC) $(LDFLAGS) -fPIC -Wl,$(SONAME_LDFLAG) -Wl,$(LIBSONAME) $(SHLIB_LDFLAGS) -rdynamic $^ -o $@ $(APPEND_LDFLAGS)
+
+# Create both destination directories first so that installing into an
+# empty $(DESTDIR) works.
+install: $(IBIN) $(LIB_STATIC) $(LIB_SHARED)
+	$(INSTALL_DIR) -p $(DESTDIR)$(sbindir)
+	$(INSTALL_DIR) -p $(DESTDIR)$(libdir)
+	$(INSTALL_PROG) $(IBIN) $(DESTDIR)$(sbindir)
+	$(INSTALL_DATA) $(LIB_STATIC) $(DESTDIR)$(libdir)
+	$(INSTALL_PROG) $(LIB_SHARED) $(DESTDIR)$(libdir)
+	ln -sf $(LIBSONAME) $(DESTDIR)$(libdir)/$(LIBNAME).so
+	ln -sf $(LIB_SHARED) $(DESTDIR)$(libdir)/$(LIBSONAME)
+
+clean:
+	rm -f $(OBJS) $(PICS) $(DEPS) $(IBIN) $(LIB_STATIC) $(LIB_SHARED)
+	rm -f $(LIBNAME).so $(LIBSONAME)
+	rm -f *~
+
+distclean: clean
+
+.PHONY: all build clean distclean install
+
+-include $(DEPS)
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/sysmacros.h>
+#include <linux/major.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Make sure directory @dir exists, creating every missing path component
+ * with mode 0755.
+ *
+ * Returns 0 on success, EINVAL for a NULL or empty path, ENOMEM if the
+ * path cannot be duplicated, or the errno of the mkdir() call that failed.
+ * A component that already exists (EEXIST) is not an error.
+ */
+static int
+tap_ctl_prepare_directory(const char *dir)
+{
+	int err;
+	char *ptr, *name;
+
+	if (!dir || !*dir)
+		return EINVAL;
+
+	/* fast path: directory is already there and usable */
+	err = access(dir, W_OK | R_OK);
+	if (!err)
+		return 0;
+
+	name = strdup(dir);
+	if (!name)
+		return ENOMEM;
+
+	ptr = name;
+
+	for (;;) {
+		/*
+		 * Skip the separator(s) in front of the next component and
+		 * cut the string right behind it. Unlike blindly searching
+		 * from "start + 1", this never reads past the terminator
+		 * when the path ends in '/'.
+		 */
+		ptr += strspn(ptr, "/");
+		ptr  = strchr(ptr, '/');
+		if (ptr)
+			*ptr = '\0';
+
+		if (mkdir(name, 0755)) {
+			/* capture errno before logging can overwrite it */
+			err = errno;
+			if (err != EEXIST) {
+				PERROR("mkdir %s", name);
+				break;
+			}
+		}
+
+		/* created, or already present: not an error */
+		err = 0;
+
+		if (!ptr)
+			break;
+
+		/* restore the separator and move on to the next component */
+		*ptr = '/';
+	}
+
+	free(name);
+	return err;
+}
+
+/*
+ * Create device node @devname with the given @major / @minor numbers.
+ * @perm is passed to mknod() as the mode argument (file type and
+ * permission bits).
+ *
+ * The parent directory is created if necessary and any existing entry at
+ * @devname is replaced.
+ *
+ * Returns 0 on success or a positive errno value.
+ */
+static int
+tap_ctl_make_device(const char *devname, const int major,
+		    const int minor, const int perm)
+{
+	int err;
+	char *copy, *dir;
+
+	/* dirname() may modify its argument, so work on a copy */
+	copy = strdup(devname);
+	if (!copy)
+		return ENOMEM;
+
+	dir = dirname(copy);
+
+	err = tap_ctl_prepare_directory(dir);
+	free(copy);
+
+	if (err)
+		return err;
+
+	/*
+	 * Remove a stale node directly rather than probing with access()
+	 * first: this avoids the check/use race and also clears dangling
+	 * symlinks, which access() reports as missing.
+	 */
+	if (unlink(devname) && errno != ENOENT) {
+		err = errno;
+		PERROR("unlink %s", devname);
+		return err;
+	}
+
+	if (mknod(devname, perm, makedev(major, minor))) {
+		err = errno;
+		PERROR("mknod %s", devname);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Make sure the blktap2 control directory and control device node exist.
+ *
+ * If the control device is not accessible, its minor number is looked up
+ * by name in /proc/misc and the node is created under MISC_MAJOR.
+ *
+ * Returns 0 on success, ENOSYS if the control device is not listed in
+ * /proc/misc, or another positive errno value.
+ */
+static int
+tap_ctl_check_environment(void)
+{
+	FILE *f;
+	int err, minor;
+	char name[256];
+
+	err = tap_ctl_prepare_directory(BLKTAP2_CONTROL_DIR);
+	if (err)
+		return err;
+
+	if (!access(BLKTAP2_CONTROL_DEVICE, R_OK | W_OK))
+		return 0;
+
+	memset(name, 0, sizeof(name));
+
+	f = fopen("/proc/misc", "r");
+	if (!f) {
+		/* save errno: logging may overwrite it */
+		err = errno;
+		EPRINTF("failed to open /proc/misc: %d\n", err);
+		return err;
+	}
+
+	/*
+	 * The field width must leave room for the terminating NUL, hence
+	 * 255 for the 256-byte buffer.
+	 */
+	while (fscanf(f, "%d %255s", &minor, name) == 2)
+		if (!strcmp(name, BLKTAP2_CONTROL_NAME)) {
+			err = tap_ctl_make_device(BLKTAP2_CONTROL_DEVICE,
+						  MISC_MAJOR,
+						  minor, S_IFCHR | 0600);
+			goto out;
+		}
+
+	err = ENOSYS;
+	EPRINTF("didn't find %s in /proc/misc\n", BLKTAP2_CONTROL_NAME);
+
+out:
+	fclose(f);
+	return err;
+}
+
+/*
+ * Allocate a new tap device through the blktap2 control device and create
+ * its ring (character) and IO (block) device nodes.
+ *
+ * @minor:   set to the allocated minor on success, -1 otherwise.
+ * @devname: in/out. If *devname is non-NULL it is used as the path of the
+ *           IO device node. Otherwise a name is allocated and stored in
+ *           *devname on success; the caller then owns it and must free()
+ *           it. On failure *devname is left untouched.
+ *
+ * Returns 0 on success or a positive errno value. On failure after the
+ * kernel allocation, the minor is released again with tap_ctl_free().
+ */
+static int
+tap_ctl_allocate_device(int *minor, char **devname)
+{
+	char *name;
+	int fd, err;
+	int own_name = 0;	/* IO device name was allocated here */
+	struct blktap2_handle handle;
+
+	*minor = -1;
+	if (!devname)
+		return EINVAL;
+
+	fd = open(BLKTAP2_CONTROL_DEVICE, O_RDONLY);
+	if (fd == -1) {
+		/* save errno: logging may overwrite it */
+		err = errno;
+		EPRINTF("failed to open control device: %d\n", err);
+		return err;
+	}
+
+	err = ioctl(fd, BLKTAP2_IOCTL_ALLOC_TAP, &handle);
+	if (err == -1) {
+		/* capture errno before close() gets a chance to change it */
+		err = errno;
+		close(fd);
+		EPRINTF("failed to allocate new device: %d\n", err);
+		return err;
+	}
+	close(fd);
+
+	err = asprintf(&name, "%s%d", BLKTAP2_RING_DEVICE, handle.minor);
+	if (err == -1) {
+		err = ENOMEM;
+		goto fail;
+	}
+
+	err = tap_ctl_make_device(name, handle.ring,
+				  handle.minor, S_IFCHR | 0600);
+	free(name);
+	if (err) {
+		EPRINTF("creating ring device for %d failed: %d\n",
+			handle.minor, err);
+		goto fail;
+	}
+
+	if (*devname)
+		name = *devname;
+	else {
+		err = asprintf(&name, "%s%d",
+			       BLKTAP2_IO_DEVICE, handle.minor);
+		if (err == -1) {
+			err = ENOMEM;
+			goto fail;
+		}
+		own_name = 1;
+	}
+
+	err = tap_ctl_make_device(name, handle.device,
+				  handle.minor, S_IFBLK | 0600);
+	if (err) {
+		EPRINTF("creating IO device for %d failed: %d\n",
+			handle.minor, err);
+		goto fail;
+	}
+
+	DBG("new interface: ring: %u, device: %u, minor: %u\n",
+	    handle.ring, handle.device, handle.minor);
+
+	/* hand the generated name to the caller only on success */
+	if (own_name)
+		*devname = name;
+
+	*minor = handle.minor;
+	return 0;
+
+fail:
+	if (own_name)
+		free(name);
+	tap_ctl_free(handle.minor);
+	return err;
+}
+
+/*
+ * Public entry point: verify the control environment, then allocate a
+ * tap device. *minor is -1 unless the allocation succeeds.
+ *
+ * Returns 0 on success, otherwise the error of the step that failed.
+ */
+int
+tap_ctl_allocate(int *minor, char **devname)
+{
+	int rc;
+
+	*minor = -1;
+
+	rc = tap_ctl_check_environment();
+	if (rc == 0)
+		rc = tap_ctl_allocate_device(minor, devname);
+
+	return rc;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Send an ATTACH request for @minor to the tapdisk identified by @id and
+ * wait for the reply.
+ *
+ * Returns 0 on success, the transport error if the exchange failed, the
+ * error reported in the response, or EINVAL on an unexpected reply type.
+ */
+int
+tap_ctl_attach(const int id, const int minor)
+{
+	tapdisk_message_t msg;
+	int rc;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.type   = TAPDISK_MESSAGE_ATTACH;
+	msg.cookie = minor;
+
+	rc = tap_ctl_connect_send_and_receive(id, &msg, 5);
+	if (rc)
+		return rc;
+
+	/* anything but the matching response is a protocol error */
+	if (msg.type != TAPDISK_MESSAGE_ATTACH_RSP) {
+		EPRINTF("got unexpected result '%s' from %d\n",
+			tapdisk_message_name(msg.type), id);
+		return EINVAL;
+	}
+
+	rc = msg.u.response.error;
+	if (rc)
+		EPRINTF("attach failed: %d\n", rc);
+
+	return rc;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Check whether the blktap2 control device is listed in /proc/misc.
+ *
+ * On failure *msg is set to a static, human-readable reason; on success
+ * *msg is left untouched.
+ *
+ * Returns 0 if found, -errno if /proc/misc cannot be opened, or -ENOSYS
+ * if the control device is not listed.
+ */
+int
+tap_ctl_check_blktap(const char **msg)
+{
+	FILE *f;
+	int err = 0, minor;
+	char name[32];
+
+	memset(name, 0, sizeof(name));
+
+	f = fopen("/proc/misc", "r");
+	if (!f) {
+		*msg = "failed to open /proc/misc";
+		return -errno;
+	}
+
+	/*
+	 * The field width must leave room for the terminating NUL, hence
+	 * 31 for the 32-byte buffer.
+	 */
+	while (fscanf(f, "%d %31s", &minor, name) == 2) {
+		if (!strcmp(name, BLKTAP2_CONTROL_NAME))
+			goto out;
+	}
+
+	err = -ENOSYS;
+	*msg = "blktap kernel module not installed";
+
+out:
+	fclose(f);
+	return err;
+}
+
+/*
+ * Run the blktap sanity checks.
+ *
+ * *msg is set to "ok" on success; on failure it is whatever
+ * tap_ctl_check_blktap() stored there.
+ *
+ * Returns 0 on success or the (negative) error from the failing check.
+ */
+int
+tap_ctl_check(const char **msg)
+{
+	int err;
+
+	err = tap_ctl_check_blktap(msg);
+	if (err)
+		return err;
+
+	*msg = "ok";
+	return 0;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Single close attempt: send CLOSE (or FORCE_SHUTDOWN when @force is
+ * non-zero) for @minor to tapdisk @id and evaluate the reply.
+ *
+ * Returns 0 on success, the transport error, the error carried in the
+ * response, or EINVAL on an unexpected reply type.
+ */
+static int
+__tap_ctl_close(const int id, const int minor, const int force)
+{
+	tapdisk_message_t msg;
+	int rc;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.type   = force ? TAPDISK_MESSAGE_FORCE_SHUTDOWN
+			   : TAPDISK_MESSAGE_CLOSE;
+	msg.cookie = minor;
+
+	rc = tap_ctl_connect_send_and_receive(id, &msg, 5);
+	if (rc)
+		return rc;
+
+	if (msg.type != TAPDISK_MESSAGE_CLOSE_RSP) {
+		EPRINTF("got unexpected result '%s' from %d\n",
+			tapdisk_message_name(msg.type), id);
+		return EINVAL;
+	}
+
+	rc = msg.u.response.error;
+	if (rc)
+		EPRINTF("close failed: %d\n", rc);
+
+	return rc;
+}
+
+/*
+ * Close @minor on tapdisk @id, retrying up to 20 times with a 1000
+ * microsecond pause for as long as the attempt reports EAGAIN.
+ *
+ * Returns 0 on success, the (positive) error of a failed attempt, or EIO
+ * once the retries are exhausted.
+ */
+int
+tap_ctl_close(const int id, const int minor, const int force)
+{
+	int attempt = 0;
+	int rc;
+
+	while (attempt < 20) {
+		rc = __tap_ctl_close(id, minor, force);
+		if (rc == 0)
+			return 0;
+
+		/* errors may come back with either sign; normalise */
+		if (rc < 0)
+			rc = -rc;
+
+		if (rc != EAGAIN) {
+			EPRINTF("close failed: %d\n", rc);
+			return rc;
+		}
+
+		usleep(1000);
+		attempt++;
+	}
+
+	EPRINTF("close timed out\n");
+	return EIO;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Bring up a complete tap device: allocate a minor, spawn a tapdisk,
+ * attach it to the minor and open the image described by @params.
+ *
+ * Any failure after the allocation releases the minor again; a failed
+ * open additionally detaches the tapdisk first.
+ *
+ * Returns 0 on success or the error of the step that failed.
+ */
+int
+tap_ctl_create(const char *params, char **devname)
+{
+	int rc, id, minor;
+
+	rc = tap_ctl_allocate(&minor, devname);
+	if (rc)
+		return rc;
+
+	id = tap_ctl_spawn();
+	if (id < 0) {
+		tap_ctl_free(minor);
+		return id;
+	}
+
+	rc = tap_ctl_attach(id, minor);
+	if (!rc) {
+		rc = tap_ctl_open(id, minor, params);
+		if (!rc)
+			return 0;
+
+		/* open failed: undo the attach before freeing the minor */
+		tap_ctl_detach(id, minor);
+	}
+
+	tap_ctl_free(minor);
+	return rc;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Tear down a tap device: close it (not forced), detach the tapdisk and
+ * free the minor. The sequence stops at the first step that fails.
+ *
+ * Returns 0 on success or the error of the failing step.
+ */
+int
+tap_ctl_destroy(const int id, const int minor)
+{
+	int rc;
+
+	rc = tap_ctl_close(id, minor, 0);
+	if (!rc)
+		rc = tap_ctl_detach(id, minor);
+	if (!rc)
+		rc = tap_ctl_free(minor);
+
+	return rc;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Send a TAPDISK_MESSAGE_DETACH request (cookie = minor) to tapdisk @id
+ * and wait up to 5 seconds for the reply.
+ *
+ * Returns the transport error if the exchange failed, the error field of
+ * the DETACH_RSP reply otherwise, or EINVAL when a different message type
+ * comes back.
+ */
+int
+tap_ctl_detach(const int id, const int minor)
+{
+	tapdisk_message_t msg;
+	int rc;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.type = TAPDISK_MESSAGE_DETACH;
+	msg.cookie = minor;
+
+	rc = tap_ctl_connect_send_and_receive(id, &msg, 5);
+	if (rc)
+		return rc;
+
+	if (msg.type != TAPDISK_MESSAGE_DETACH_RSP) {
+		printf("got unexpected result '%s' from %d\n",
+		       tapdisk_message_name(msg.type), id);
+		return EINVAL;
+	}
+
+	rc = msg.u.response.error;
+	if (rc < 0)
+		printf("detach failed: %d\n", rc);
+
+	return rc;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/ioctl.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Issue BLKTAP2_IOCTL_FREE_TAP for @minor on the blktap2 control device.
+ *
+ * Returns the (positive) errno value if the control device cannot be
+ * opened, otherwise the raw return value of ioctl().
+ */
+int
+tap_ctl_free(const int minor)
+{
+	int fd, err;
+
+	fd = open(BLKTAP2_CONTROL_DEVICE, O_RDONLY);
+	if (fd == -1) {
+		/* capture errno first: the logging call may overwrite it */
+		err = errno;
+		EPRINTF("failed to open control device: %d\n", err);
+		return err;
+	}
+
+	err = ioctl(fd, BLKTAP2_IOCTL_FREE_TAP, minor);
+	close(fd);
+
+	return err;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/un.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Global debug switch, off (0) by default.
+ * NOTE(review): presumably consulted by the DBG() macro -- confirm in tap-ctl.h.
+ */
+int tap_ctl_debug = 0;
+
+/*
+ * Read exactly one tapdisk_message_t from @fd into @message.
+ *
+ * @timeout is in seconds and is handed to select(); 0 means wait without
+ * a timeout.  @message is zeroed before reading.
+ *
+ * Returns 0 once the whole message has been received, or -EIO if the
+ * transfer ended early (select/read error, end of file, or timeout).
+ */
+int
+tap_ctl_read_message(int fd, tapdisk_message_t *message, int timeout)
+{
+	fd_set readfds;
+	int ret, len, offset;
+	struct timeval tv, *t;
+
+	t = NULL;
+	offset = 0;
+	len = sizeof(tapdisk_message_t);
+
+	if (timeout) {
+		tv.tv_sec = timeout;
+		tv.tv_usec = 0;
+		t = &tv;
+	}
+
+	memset(message, 0, sizeof(tapdisk_message_t));
+
+	while (offset < len) {
+		FD_ZERO(&readfds);
+		FD_SET(fd, &readfds);
+
+		ret = select(fd + 1, &readfds, NULL, NULL, t);
+		if (ret == -1) {
+			if (errno == EINTR)
+				continue;
+			break;
+		}
+
+		/* select() returned without our fd being ready: timed out */
+		if (!FD_ISSET(fd, &readfds))
+			break;
+
+		/*
+		 * offset counts bytes, so the destination must be computed
+		 * on a byte pointer; arithmetic on 'message' itself would
+		 * step in units of whole messages and run out of bounds
+		 * after a short read.
+		 */
+		ret = read(fd, (char *)message + offset, len - offset);
+		if (ret == -1 && errno == EINTR)
+			continue;
+		if (ret <= 0)
+			break;	/* hard error, or EOF before a full message */
+
+		offset += ret;
+	}
+
+	if (offset != len) {
+		EPRINTF("failure reading message\n");
+		return -EIO;
+	}
+
+	DBG("received '%s' message (uuid = %u)\n",
+	    tapdisk_message_name(message->type), message->cookie);
+
+	return 0;
+}
+
+/*
+ * Write exactly one tapdisk_message_t from @message to @fd.
+ *
+ * @timeout is in seconds and is handed to select(); 0 means wait without
+ * a timeout.
+ *
+ * Returns 0 once the whole message has been sent, or -EIO if the transfer
+ * ended early (select/write error or timeout).
+ */
+int
+tap_ctl_write_message(int fd, tapdisk_message_t *message, int timeout)
+{
+	fd_set writefds;
+	int ret, len, offset;
+	struct timeval tv, *t;
+
+	t = NULL;
+	offset = 0;
+	len = sizeof(tapdisk_message_t);
+
+	if (timeout) {
+		tv.tv_sec = timeout;
+		tv.tv_usec = 0;
+		t = &tv;
+	}
+
+	DBG("sending '%s' message (uuid = %u)\n",
+	    tapdisk_message_name(message->type), message->cookie);
+
+	while (offset < len) {
+		FD_ZERO(&writefds);
+		FD_SET(fd, &writefds);
+
+		/* we don't bother reinitializing tv. at worst, it will wait a
+		 * bit more time than expected. */
+
+		ret = select(fd + 1, NULL, &writefds, NULL, t);
+		if (ret == -1) {
+			if (errno == EINTR)
+				continue;
+			break;
+		}
+
+		/* select() returned without our fd being ready: timed out */
+		if (!FD_ISSET(fd, &writefds))
+			break;
+
+		/*
+		 * offset counts bytes, so the source must be computed on a
+		 * byte pointer; arithmetic on 'message' itself would step in
+		 * units of whole messages after a short write.
+		 */
+		ret = write(fd, (const char *)message + offset, len - offset);
+		if (ret == -1 && errno == EINTR)
+			continue;
+		if (ret <= 0)
+			break;
+
+		offset += ret;
+	}
+
+	if (offset != len) {
+		EPRINTF("failure writing message\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Send @message over @sfd, then read the reply back into the same buffer.
+ * @timeout is passed to both the write and the read.  Returns 0 on
+ * success or the error reported by whichever half failed.
+ */
+int
+tap_ctl_send_and_receive(int sfd, tapdisk_message_t *message, int timeout)
+{
+	int rc = tap_ctl_write_message(sfd, message, timeout);
+
+	if (rc) {
+		EPRINTF("failed to send '%s' message\n",
+			tapdisk_message_name(message->type));
+		return rc;
+	}
+
+	rc = tap_ctl_read_message(sfd, message, timeout);
+	if (rc)
+		EPRINTF("failed to receive '%s' message\n",
+			tapdisk_message_name(message->type));
+
+	return rc;
+}
+
+/*
+ * Build "<BLKTAP2_CONTROL_DIR>/<BLKTAP2_CONTROL_SOCKET><id>" in a freshly
+ * allocated string.  The caller owns the result and must free() it.
+ * Returns NULL if asprintf() fails.
+ */
+char *
+tap_ctl_socket_name(int id)
+{
+	char *path;
+	int n;
+
+	n = asprintf(&path, "%s/%s%d",
+		     BLKTAP2_CONTROL_DIR, BLKTAP2_CONTROL_SOCKET, id);
+	if (n == -1)
+		return NULL;
+
+	return path;
+}
+
+/*
+ * Connect to the UNIX stream socket at path @name.
+ *
+ * On success stores the connected descriptor in *@sfd and returns 0.
+ * On failure *@sfd is -1 and a negative errno value is returned
+ * (-ENAMETOOLONG if @name does not fit in sockaddr_un.sun_path).
+ */
+int
+tap_ctl_connect(const char *name, int *sfd)
+{
+	int fd, err;
+	struct sockaddr_un saddr;
+
+	*sfd = -1;
+
+	/* sun_path is a fixed-size array; refuse names that would overflow it */
+	if (strlen(name) >= sizeof(saddr.sun_path)) {
+		EPRINTF("socket name too long: %s\n", name);
+		return -ENAMETOOLONG;
+	}
+
+	fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fd == -1) {
+		/* save errno before logging, which may overwrite it */
+		err = errno;
+		EPRINTF("couldn't create socket for %s: %d\n", name, err);
+		return -err;
+	}
+
+	memset(&saddr, 0, sizeof(saddr));
+	saddr.sun_family = AF_UNIX;
+	strcpy(saddr.sun_path, name);	/* length verified above */
+
+	if (connect(fd, (const struct sockaddr *)&saddr, sizeof(saddr))) {
+		/* save errno before logging/close(), which may overwrite it */
+		err = errno;
+		EPRINTF("couldn't connect to %s: %d\n", name, err);
+		close(fd);
+		return -err;
+	}
+
+	*sfd = fd;
+	return 0;
+}
+
+/*
+ * Connect to the control socket of tapdisk @id.
+ *
+ * *@sfd is set to -1 up front and only replaced by tap_ctl_connect() on
+ * success.  Returns -EINVAL for a negative id, -ENOMEM if the socket name
+ * cannot be built, otherwise the result of tap_ctl_connect().
+ */
+int
+tap_ctl_connect_id(int id, int *sfd)
+{
+	char *path;
+	int rc;
+
+	*sfd = -1;
+
+	if (id < 0) {
+		EPRINTF("invalid id %d\n", id);
+		return -EINVAL;
+	}
+
+	path = tap_ctl_socket_name(id);
+	if (path == NULL) {
+		EPRINTF("couldn't name socket for %d\n", id);
+		return -ENOMEM;
+	}
+
+	rc = tap_ctl_connect(path, sfd);
+
+	free(path);
+	return rc;
+}
+
+/*
+ * One-shot request/response: connect to tapdisk @id, exchange @message
+ * with the given @timeout, and close the connection again.  Returns the
+ * connect error, or else the result of tap_ctl_send_and_receive().
+ */
+int
+tap_ctl_connect_send_and_receive(int id, tapdisk_message_t *message, int timeout)
+{
+	int sfd, rc;
+
+	rc = tap_ctl_connect_id(id, &sfd);
+	if (!rc) {
+		rc = tap_ctl_send_and_receive(sfd, message, timeout);
+		close(sfd);
+	}
+
+	return rc;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <glob.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+#include "list.h"
+
+/*
+ * Release one list entry together with the type and path strings it owns.
+ */
+static void
+free_list(tap_list_t *entry)
+{
+	/* free(NULL) is a no-op, so unset strings need no special casing */
+	free(entry->type);
+	entry->type = NULL;
+
+	free(entry->path);
+	entry->path = NULL;
+
+	free(entry);
+}
+
+/*
+ * Split a "type:path" string at its first ':'.
+ *
+ * On success *@type and *@path receive newly allocated copies of the two
+ * halves (caller frees) and 0 is returned.  Returns -EINVAL if @params
+ * contains no ':'.  If either copy cannot be allocated, both outputs are
+ * freed and set to NULL and -errno is returned.
+ */
+int
+_parse_params(const char *params, char **type, char **path)
+{
+	const char *colon = strchr(params, ':');
+
+	if (colon == NULL)
+		return -EINVAL;
+
+	*type = strndup(params, (size_t)(colon - params));
+	*path = strdup(colon + 1);
+
+	if (*type && *path)
+		return 0;
+
+	free(*type);
+	*type = NULL;
+
+	free(*path);
+	*path = NULL;
+
+	return -errno;
+}
+
+/*
+ * Fill in a list entry.  The numeric fields are copied as given; when
+ * @params is non-NULL it is split into entry->type / entry->path via
+ * _parse_params(), whose result is returned.  With @params == NULL the
+ * string fields are left untouched and 0 is returned.
+ */
+static int
+init_list(tap_list_t *entry,
+	  int tap_id, pid_t tap_pid, int vbd_minor, int vbd_state,
+	  const char *params)
+{
+	entry->id    = tap_id;
+	entry->pid   = tap_pid;
+	entry->minor = vbd_minor;
+	entry->state = vbd_state;
+
+	if (!params)
+		return 0;
+
+	return _parse_params(params, &entry->type, &entry->path);
+}
+
+/*
+ * Free a NULL-terminated array of list entries: every entry up to the
+ * terminator is released with free_list(), then the array itself.
+ */
+void
+tap_ctl_free_list(tap_list_t **list)
+{
+	size_t i;
+
+	for (i = 0; list[i] != NULL; i++)
+		free_list(list[i]);
+
+	free(list);
+}
+
+/*
+ * Allocate a NULL-terminated array holding @n zero-initialised entries.
+ * Returns NULL if any allocation fails; entries allocated up to that
+ * point are released again.
+ */
+static tap_list_t**
+tap_ctl_alloc_list(int n)
+{
+	tap_list_t **list;
+	int i;
+
+	/* n slots plus the terminating NULL, all zeroed */
+	list = calloc((size_t)n + 1, sizeof(tap_list_t*));
+	if (!list)
+		return NULL;
+
+	for (i = 0; i < n; ++i) {
+		list[i] = calloc(1, sizeof(tap_list_t));
+		if (!list[i]) {
+			/* the zeroed tail keeps the partial array terminated */
+			tap_ctl_free_list(list);
+			return NULL;
+		}
+	}
+
+	return list;
+}
+
+/*
+ * Count the entries of a NULL-terminated list (terminator excluded).
+ */
+static int
+tap_ctl_list_length(const tap_list_t **list)
+{
+	int count = 0;
+
+	while (list[count] != NULL)
+		count++;
+
+	return count;
+}
+
+/*
+ * qsort() comparator ordering ints ascending.
+ *
+ * Uses comparisons rather than subtraction: a - b overflows (undefined
+ * behaviour) when the operands are far apart, e.g. INT_MIN and 1.
+ * Returns -1, 0 or 1.
+ */
+static int
+_tap_minor_cmp(const void *a, const void *b)
+{
+	const int lhs = *(const int *)a;
+	const int rhs = *(const int *)b;
+
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Collect the minor numbers of all BLKTAP2_SYSFS_DIR "/blktap<N>" entries.
+ *
+ * On success returns the number of minors found (possibly 0) and stores a
+ * malloc()ed, ascending array in *@_minorv (NULL when nothing matched the
+ * glob); the caller frees it.  On failure returns a negative errno value
+ * and *@_minorv is NULL.
+ */
+int
+_tap_ctl_find_minors(int **_minorv)
+{
+	glob_t glbuf = { 0 };
+	const char *pattern, *format;
+	int *minorv = NULL, n_minors = 0;
+	int err;
+	size_t i;
+
+	pattern = BLKTAP2_SYSFS_DIR"/blktap*";
+	format  = BLKTAP2_SYSFS_DIR"/blktap%d";
+
+	*_minorv = NULL;
+
+	err = glob(pattern, 0, NULL, &glbuf);
+	switch (err) {
+	case 0:
+		break;
+
+	case GLOB_NOMATCH:
+		/* no devices at all is not an error */
+		err = 0;
+		goto out;
+
+	case GLOB_NOSPACE:
+		/*
+		 * glob() reports failures through its return value, not
+		 * errno, so map them to fixed codes; -errno could be 0 and
+		 * be mistaken for success.
+		 */
+		err = -ENOMEM;
+		EPRINTF("%s: glob failed, err %d", pattern, err);
+		goto out;
+
+	default: /* GLOB_ABORTED or anything unexpected */
+		err = -EIO;
+		EPRINTF("%s: glob failed, err %d", pattern, err);
+		goto out;
+	}
+
+	minorv = malloc(sizeof(int) * glbuf.gl_pathc);
+	if (!minorv) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* keep only the paths whose suffix parses as a number */
+	for (i = 0; i < glbuf.gl_pathc; ++i)
+		if (sscanf(glbuf.gl_pathv[i], format, &minorv[n_minors]) == 1)
+			n_minors++;
+
+	qsort(minorv, n_minors, sizeof(int), _tap_minor_cmp);
+
+	*_minorv = minorv;
+	err = 0;
+
+out:
+	if (glbuf.gl_pathv)
+		globfree(&glbuf);
+
+	return err ? err : n_minors;
+}
+
+/* One tapdisk process as discovered by _tap_ctl_find_tapdisks(). */
+struct tapdisk {
+ int id; /* number parsed from the control socket path */
+ pid_t pid; /* value returned by tap_ctl_get_pid(id) */
+ struct list_head list; /* head of this tapdisk's struct tapdisk_list entries */
+};
+
+/*
+ * qsort() comparator ordering struct tapdisk elements by ascending id.
+ *
+ * Uses comparisons rather than subtraction so that widely separated ids
+ * cannot overflow the int result.  Returns -1, 0 or 1.
+ */
+static int
+_tap_tapdisk_cmp(const void *a, const void *b)
+{
+	const int lhs = ((const struct tapdisk *)a)->id;
+	const int rhs = ((const struct tapdisk *)b)->id;
+
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Discover tapdisk processes by globbing for their control sockets
+ * (BLKTAP2_CONTROL_DIR "/" BLKTAP2_CONTROL_SOCKET "<id>") and asking each
+ * one for its pid.  Sockets whose id does not parse, or whose pid query
+ * returns a negative value, are skipped.
+ *
+ * On success returns the number of tapdisks found (possibly 0) and stores
+ * a malloc()ed array, sorted by id and with every list head initialised,
+ * in *@_tapv (NULL when nothing matched the glob); the caller frees it.
+ * On failure returns a negative errno value and *@_tapv is NULL.
+ */
+int
+_tap_ctl_find_tapdisks(struct tapdisk **_tapv)
+{
+	glob_t glbuf = { 0 };
+	const char *pattern, *format;
+	struct tapdisk *tapv = NULL;
+	int err, k, n_taps = 0;
+	size_t i;
+
+	pattern = BLKTAP2_CONTROL_DIR"/"BLKTAP2_CONTROL_SOCKET"*";
+	format  = BLKTAP2_CONTROL_DIR"/"BLKTAP2_CONTROL_SOCKET"%d";
+
+	*_tapv = NULL;
+
+	err = glob(pattern, 0, NULL, &glbuf);
+	switch (err) {
+	case 0:
+		break;
+
+	case GLOB_NOMATCH:
+		/* no tapdisk running is not an error */
+		err = 0;
+		goto out;
+
+	case GLOB_NOSPACE:
+		/*
+		 * glob() reports failures through its return value, not
+		 * errno, so map them to fixed codes; -errno could be 0 and
+		 * be mistaken for success.
+		 */
+		err = -ENOMEM;
+		EPRINTF("%s: glob failed, err %d", pattern, err);
+		goto out;
+
+	default: /* GLOB_ABORTED or anything unexpected */
+		err = -EIO;
+		EPRINTF("%s: glob failed, err %d", pattern, err);
+		goto out;
+	}
+
+	tapv = malloc(sizeof(struct tapdisk) * glbuf.gl_pathc);
+	if (!tapv) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < glbuf.gl_pathc; ++i) {
+		struct tapdisk *tap = &tapv[n_taps];
+
+		if (sscanf(glbuf.gl_pathv[i], format, &tap->id) != 1)
+			continue;
+
+		tap->pid = tap_ctl_get_pid(tap->id);
+		if (tap->pid < 0)
+			continue;
+
+		n_taps++;
+	}
+
+	qsort(tapv, n_taps, sizeof(struct tapdisk), _tap_tapdisk_cmp);
+
+	/* list heads are set up only after sorting has moved the elements */
+	for (k = 0; k < n_taps; ++k)
+		INIT_LIST_HEAD(&tapv[k].list);
+
+	*_tapv = tapv;
+	err = 0;
+
+out:
+	if (glbuf.gl_pathv)
+		globfree(&glbuf);
+
+	return err ? err : n_taps;
+}
+
+/* One reply to a TAPDISK_MESSAGE_LIST request, queued on a tapdisk's list. */
+struct tapdisk_list {
+ int minor; /* copied from message.u.list.minor */
+ int state; /* copied from message.u.list.state */
+ char *params; /* heap copy of message.u.list.path, or NULL if it was empty */
+ struct list_head entry; /* linkage into the owning list */
+};
+
+/*
+ * Ask tapdisk @id for its list of VBDs and append one struct tapdisk_list
+ * per reply to @_list.
+ *
+ * A TAPDISK_MESSAGE_LIST request is sent, then replies are read (2 second
+ * timeout each) until one arrives with u.list.count == 0.
+ *
+ * Returns 0 on success.  On failure returns a negative error (-EPROTO for
+ * a failed read, -ENOMEM for allocation failure, or the connect/write
+ * error); nothing is added to @_list and the connection is always closed.
+ */
+int
+_tap_ctl_list_tapdisk(int id, struct list_head *_list)
+{
+	tapdisk_message_t message;
+	struct list_head list;
+	struct tapdisk_list *tl, *next;
+	int err, sfd;
+
+	err = tap_ctl_connect_id(id, &sfd);
+	if (err)
+		return err;
+
+	memset(&message, 0, sizeof(message));
+	message.type = TAPDISK_MESSAGE_LIST;
+	message.cookie = -1;
+
+	err = tap_ctl_write_message(sfd, &message, 2);
+	if (err) {
+		/* don't leak the socket when the request cannot be sent */
+		close(sfd);
+		return err;
+	}
+
+	INIT_LIST_HEAD(&list);
+	do {
+		err = tap_ctl_read_message(sfd, &message, 2);
+		if (err) {
+			err = -EPROTO;
+			break;
+		}
+
+		/* a reply with count 0 terminates the listing */
+		if (message.u.list.count == 0)
+			break;
+
+		tl = malloc(sizeof(struct tapdisk_list));
+		if (!tl) {
+			err = -ENOMEM;
+			break;
+		}
+
+		tl->minor = message.u.list.minor;
+		tl->state = message.u.list.state;
+		tl->params = NULL;
+
+		if (message.u.list.path[0] != 0) {
+			tl->params = strndup(message.u.list.path,
+					     sizeof(message.u.list.path));
+			if (!tl->params) {
+				/* tl is not on the list yet: free it here */
+				free(tl);
+				err = -ENOMEM;
+				break;
+			}
+		}
+
+		list_add(&tl->entry, &list);
+	} while (1);
+
+	/* on error, drop whatever was collected so far */
+	if (err)
+		list_for_each_entry_safe(tl, next, &list, entry) {
+			list_del(&tl->entry);
+			free(tl->params);
+			free(tl);
+		}
+
+	close(sfd);
+	list_splice(&list, _list);
+	return err;
+}
+
+/*
+ * Free an array of @n_taps tapdisk records: every struct tapdisk_list
+ * queued on each record (and its params string), then the array itself.
+ */
+void
+_tap_ctl_free_tapdisks(struct tapdisk *tapv, int n_taps)
+{
+	int i;
+
+	for (i = 0; i < n_taps; ++i) {
+		struct tapdisk_list *cur, *tmp;
+
+		list_for_each_entry_safe(cur, tmp, &tapv[i].list, entry) {
+			free(cur->params);
+			free(cur);
+		}
+	}
+
+	free(tapv);
+}
+
+/*
+ * Merge the sysfs minors (@minorv) and the tapdisks with their VBD lists
+ * (@tapv) into one NULL-terminated array of tap_list_t entries:
+ *
+ *  - a tapdisk with an empty VBD list yields one entry with minor -1;
+ *  - every VBD of a tapdisk yields one entry, and its minor is marked as
+ *    claimed by overwriting the matching slot in @minorv with -1;
+ *  - every minor left unclaimed yields one entry with id and pid -1.
+ *
+ * Note that @minorv is modified.  On success the array is stored in
+ * *@_list (release with tap_ctl_free_list()) and 0 is returned; on
+ * failure a negative error is returned and *@_list is not written.
+ */
+int
+_tap_list_join3(int n_minors, int *minorv, int n_taps, struct tapdisk *tapv,
+		tap_list_t ***_list)
+{
+	tap_list_t **list, **_entry;
+	int i, _m, err, n_entries;
+
+	/*
+	 * Size the array for the worst case: every minor orphaned, plus one
+	 * entry per VBD (or one per tapdisk without VBDs).  Counting only
+	 * n_minors + n_taps is too small when a tapdisk reports VBDs whose
+	 * minors are not in minorv, and the fill loops below would then run
+	 * past the end of the array.
+	 */
+	n_entries = n_minors;
+	for (i = 0; i < n_taps; ++i) {
+		struct tapdisk_list *tl;
+		int n_vbds = 0;
+
+		list_for_each_entry(tl, &tapv[i].list, entry)
+			n_vbds++;
+
+		n_entries += n_vbds ? n_vbds : 1;
+	}
+
+	list = tap_ctl_alloc_list(n_entries);
+	if (!list) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	_entry = list;
+
+	for (i = 0; i < n_taps; ++i) {
+		struct tapdisk *tap = &tapv[i];
+		struct tapdisk_list *tl;
+
+		/* orphaned tapdisk */
+		if (list_empty(&tap->list)) {
+			err = init_list(*_entry++, tap->id, tap->pid, -1, -1, NULL);
+			if (err)
+				goto fail;
+			continue;
+		}
+
+		list_for_each_entry(tl, &tap->list, entry) {
+
+			err = init_list(*_entry++,
+					tap->id, tap->pid,
+					tl->minor, tl->state, tl->params);
+			if (err)
+				goto fail;
+
+			if (tl->minor >= 0) {
+				/* clear minor */
+				for (_m = 0; _m < n_minors; ++_m) {
+					if (minorv[_m] == tl->minor) {
+						minorv[_m] = -1;
+						break;
+					}
+				}
+			}
+		}
+	}
+
+	/* orphaned minors */
+	for (_m = 0; _m < n_minors; ++_m) {
+		int minor = minorv[_m];
+		if (minor >= 0) {
+			err = init_list(*_entry++, -1, -1, minor, -1, NULL);
+			if (err)
+				goto fail;
+		}
+	}
+
+	/* free extraneous list entries */
+	for (; *_entry != NULL; ++_entry) {
+		free_list(*_entry);
+		*_entry = NULL;
+	}
+
+	*_list = list;
+
+	return 0;
+
+fail:
+	if (list)
+		tap_ctl_free_list(list);
+
+	return err;
+}
+
+/*
+ * Build the combined list of tapdisks and blktap minors.
+ *
+ * Gathers the sysfs minors, the running tapdisks and each tapdisk's VBD
+ * list, then joins them with _tap_list_join3().  On success returns 0 and
+ * stores a NULL-terminated array in *@list (release it with
+ * tap_ctl_free_list()).  On failure returns the error of the step that
+ * failed.
+ */
+int
+tap_ctl_list(tap_list_t ***list)
+{
+	int n_taps, n_minors, err, *minorv;
+	struct tapdisk *tapv, *tap;
+
+	/* a negative count means "that lookup did not succeed" */
+	n_taps = -1;
+	n_minors = -1;
+	minorv = NULL;
+	tapv = NULL;
+
+	err = n_minors = _tap_ctl_find_minors(&minorv);
+	if (err < 0)
+		goto out;
+
+	err = n_taps = _tap_ctl_find_tapdisks(&tapv);
+	if (err < 0)
+		goto out;
+
+	for (tap = tapv; tap < &tapv[n_taps]; ++tap) {
+		err = _tap_ctl_list_tapdisk(tap->id, &tap->list);
+		if (err)
+			goto out;
+	}
+
+	err = _tap_list_join3(n_minors, minorv, n_taps, tapv, list);
+
+out:
+	/*
+	 * Release the arrays whenever the lookup succeeded, including the
+	 * "zero usable entries" case where an array may still have been
+	 * allocated.  Both helpers accept a NULL array.
+	 */
+	if (n_taps >= 0)
+		_tap_ctl_free_tapdisks(tapv, n_taps);
+
+	if (n_minors >= 0)
+		free(minorv);
+
+	return err;
+}
+
+/*
+ * Look up the first listed entry matching @type and @path.  A NULL
+ * filter matches anything; a non-NULL filter only matches entries whose
+ * corresponding string is set and equal.
+ *
+ * On a match, the entry is copied into *@tap with its type and path
+ * pointers cleared (the strings belong to the temporary list) and 0 is
+ * returned.  Returns -ENOENT if nothing matches, or the tap_ctl_list()
+ * error.
+ */
+int
+tap_ctl_find(const char *type, const char *path, tap_list_t *tap)
+{
+	tap_list_t **list;
+	int i, rc;
+
+	rc = tap_ctl_list(&list);
+	if (rc)
+		return rc;
+
+	rc = -ENOENT;
+
+	for (i = 0; list[i] != NULL; ++i) {
+		const tap_list_t *cand = list[i];
+		int type_ok = !type || (cand->type && !strcmp(cand->type, type));
+		int path_ok = !path || (cand->path && !strcmp(cand->path, path));
+
+		if (!type_ok || !path_ok)
+			continue;
+
+		*tap = *cand;
+		tap->type = NULL;
+		tap->path = NULL;
+		rc = 0;
+		break;
+	}
+
+	tap_ctl_free_list(list);
+
+	return rc;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Find the major number of the "tapdev" driver in /proc/devices.
+ *
+ * Returns the major number from the first line of the form
+ * "<number> tapdev...", -ENODEV if no such line exists, or -errno if
+ * /proc/devices cannot be opened.
+ */
+int
+tap_ctl_blk_major(void)
+{
+	FILE *devices;
+	char buf[32];
+	int major = -ENODEV;
+
+	devices = fopen("/proc/devices", "r");
+	if (!devices)
+		return -errno;
+
+	while (fgets(buf, sizeof(buf), devices)) {
+		int candidate, offset = 0;
+
+		/*
+		 * %n is only stored once the literal "tapdev" has matched,
+		 * so a non-zero offset identifies the right line.  The
+		 * number is parsed into a scratch variable so that lines of
+		 * other drivers can never leak into the result.
+		 */
+		if (sscanf(buf, "%d tapdev%n", &candidate, &offset) == 1 &&
+		    offset) {
+			major = candidate;
+			break;
+		}
+	}
+
+	fclose(devices);
+
+	return major;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+#include "blktaplib.h"
+
+/*
+ * Send a TAPDISK_MESSAGE_OPEN request for @params to tapdisk @id, using
+ * @minor as both cookie and device number, and wait up to 5 seconds for
+ * the reply.
+ *
+ * Returns 0 on an OPEN_RSP reply; ENAMETOOLONG if @params does not fit in
+ * the message's path field; the transport error if the exchange failed;
+ * the negated error field of a TAPDISK_MESSAGE_ERROR reply; or EINVAL for
+ * any other reply type.
+ */
+int
+tap_ctl_open(const int id, const int minor, const char *params)
+{
+	int err;
+	tapdisk_message_t message;
+
+	memset(&message, 0, sizeof(message));
+	message.type = TAPDISK_MESSAGE_OPEN;
+	message.cookie = minor;
+	message.u.params.storage = TAPDISK_STORAGE_TYPE_DEFAULT;
+	message.u.params.devnum = minor;
+
+	/*
+	 * Give snprintf() the real buffer size and test against that same
+	 * size, so every truncation is detected.  With a size of
+	 * sizeof - 1 but a check against sizeof, a name of exactly
+	 * sizeof - 1 characters would lose its last character unnoticed.
+	 */
+	err = snprintf(message.u.params.path,
+		       sizeof(message.u.params.path), "%s", params);
+	if (err < 0 || (size_t)err >= sizeof(message.u.params.path)) {
+		EPRINTF("name too long\n");
+		return ENAMETOOLONG;
+	}
+
+	err = tap_ctl_connect_send_and_receive(id, &message, 5);
+	if (err)
+		return err;
+
+	switch (message.type) {
+	case TAPDISK_MESSAGE_OPEN_RSP:
+		break;
+	case TAPDISK_MESSAGE_ERROR:
+		err = -message.u.response.error;
+		EPRINTF("open failed, err %d\n", err);
+		break;
+	default:
+		EPRINTF("got unexpected result '%s' from %d\n",
+			tapdisk_message_name(message.type), id);
+		err = EINVAL;
+		break;
+	}
+
+	return err;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Send a TAPDISK_MESSAGE_PAUSE request (cookie = minor) to tapdisk @id
+ * and wait up to 5 seconds for the reply.
+ *
+ * Returns the transport error if the exchange failed, the error field of
+ * the PAUSE_RSP reply otherwise, or EINVAL when a different message type
+ * comes back.
+ */
+int
+tap_ctl_pause(const int id, const int minor)
+{
+	tapdisk_message_t msg;
+	int rc;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.type = TAPDISK_MESSAGE_PAUSE;
+	msg.cookie = minor;
+
+	rc = tap_ctl_connect_send_and_receive(id, &msg, 5);
+	if (rc)
+		return rc;
+
+	if (msg.type != TAPDISK_MESSAGE_PAUSE_RSP) {
+		EPRINTF("got unexpected result '%s' from %d\n",
+			tapdisk_message_name(msg.type), id);
+		return EINVAL;
+	}
+
+	return msg.u.response.error;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/wait.h>
+
+#include "tap-ctl.h"
+#include "blktap2.h"
+
+/*
+ * Fork and exec a tapdisk2 process whose stdout and stderr are redirected
+ * into a pipe.
+ *
+ * The program is taken from the TAPDISK2 environment variable, falling
+ * back to "tapdisk2" looked up via PATH.
+ *
+ * In the parent: returns the child's pid and stores the read end of the
+ * pipe in *@readfd (the caller closes it), or returns a negative errno
+ * value if pipe() or fork() failed.  The child never returns from here.
+ */
+static pid_t
+__tap_ctl_spawn(int *readfd)
+{
+	int err, child, channel[2];
+	char *tapdisk;
+
+	if (pipe(channel)) {
+		/* save errno before logging, which may overwrite it */
+		err = errno;
+		EPRINTF("pipe failed: %d\n", err);
+		return -err;
+	}
+
+	if ((child = fork()) == -1) {
+		err = errno;
+		EPRINTF("fork failed: %d\n", err);
+		/* nobody will use the pipe: don't leak its descriptors */
+		close(channel[0]);
+		close(channel[1]);
+		return -err;
+	}
+
+	if (child) {
+		/* parent: keep only the read end */
+		close(channel[1]);
+		*readfd = channel[0];
+		return child;
+	}
+
+	/* child: route stdout and stderr into the pipe */
+	if (dup2(channel[1], STDOUT_FILENO) == -1) {
+		err = errno;
+		EPRINTF("dup2 failed: %d\n", err);
+		exit(err);
+	}
+
+	if (dup2(channel[1], STDERR_FILENO) == -1) {
+		err = errno;
+		EPRINTF("dup2 failed: %d\n", err);
+		exit(err);
+	}
+
+	close(channel[0]);
+	close(channel[1]);
+
+	tapdisk = getenv("TAPDISK2");
+	if (!tapdisk)
+		tapdisk = "tapdisk2";
+
+	/* the variadic terminator must be a typed null pointer */
+	execlp(tapdisk, tapdisk, (char *)NULL);
+
+	/* only reached if exec failed */
+	EPRINTF("exec failed\n");
+	exit(1);
+}
+
+/*
+ * Query tapdisk @id for its process id with a TAPDISK_MESSAGE_PID request
+ * (2 second timeout).  Returns the u.tapdisk_pid field of the reply, or
+ * the transport error if the exchange failed.
+ */
+pid_t
+tap_ctl_get_pid(const int id)
+{
+	tapdisk_message_t msg;
+	int rc;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.type = TAPDISK_MESSAGE_PID;
+
+	rc = tap_ctl_connect_send_and_receive(id, &msg, 2);
+	if (!rc)
+		return msg.u.tapdisk_pid;
+
+	return rc;
+}
+
+/*
+ * Wait for @child to terminate and translate its status.
+ *
+ * Returns 0 if the child exited with status 0, the negated exit code if
+ * it exited with a non-zero status, -EINTR if it was killed by a signal,
+ * -EAGAIN for any other status, or -errno if waitpid() itself failed.
+ */
+static int
+tap_ctl_wait(pid_t child)
+{
+	pid_t pid;
+	int status;
+
+	pid = waitpid(child, &status, 0);
+	if (pid < 0) {
+		/* save errno before logging, which may overwrite it */
+		int err = errno;
+
+		EPRINTF("wait(%d) failed, err %d\n", child, err);
+		return -err;
+	}
+
+	if (WIFEXITED(status)) {
+		int code = WEXITSTATUS(status);
+		if (code)
+			EPRINTF("tapdisk2[%d] failed, status %d\n", child, code);
+		return -code;
+	}
+
+	if (WIFSIGNALED(status)) {
+		int signo = WTERMSIG(status);
+		EPRINTF("tapdisk2[%d] killed by signal %d\n", child, signo);
+		return -EINTR;
+	}
+
+	EPRINTF("tapdisk2[%d]: unexpected status %#x\n", child, status);
+	return -EAGAIN;
+}
+
+/*
+ * Parse the tapdisk id from the text available on @readfd, which is
+ * expected to be a control socket path of the form
+ * BLKTAP2_CONTROL_DIR "/" BLKTAP2_CONTROL_SOCKET "<id>".
+ *
+ * Returns the id, or -1 on failure with errno describing the problem
+ * (EINVAL if the text simply did not match).  When fdopen() succeeds the
+ * descriptor is closed together with the stream.
+ */
+static int
+tap_ctl_get_child_id(int readfd)
+{
+	FILE *stream;
+	int id, matched;
+
+	stream = fdopen(readfd, "r");
+	if (stream == NULL) {
+		EPRINTF("fdopen failed: %d\n", errno);
+		return -1;
+	}
+
+	errno = 0;
+	matched = fscanf(stream, BLKTAP2_CONTROL_DIR"/"
+			 BLKTAP2_CONTROL_SOCKET"%d", &id);
+	if (matched != 1) {
+		/* a plain mismatch leaves errno at 0: report it as EINVAL */
+		if (!errno)
+			errno = EINVAL;
+		EPRINTF("parsing id failed: %d\n", errno);
+		id = -1;
+	}
+
+	fclose(stream);
+	return id;
+}
+
+/*
+ * Start a new tapdisk2 process and return its id.
+ *
+ * The spawned child is waited for until it exits, then the id is parsed
+ * from what it wrote to its stdout/stderr pipe.
+ * NOTE(review): this relies on tapdisk2 forking into the background and
+ * printing its control socket path before the foreground process exits --
+ * confirm against tapdisk2.
+ *
+ * Returns the id (>= 0) on success, the negative error from spawning or
+ * waiting, or -1 if the id could not be parsed.
+ */
+int
+tap_ctl_spawn(void)
+{
+	pid_t child;
+	int err, id, readfd;
+
+	readfd = -1;
+
+	child = __tap_ctl_spawn(&readfd);
+	if (child < 0)
+		return child;
+
+	err = tap_ctl_wait(child);
+	if (err) {
+		/* the pipe will never be read: don't leak its descriptor */
+		close(readfd);
+		return err;
+	}
+
+	id = tap_ctl_get_child_id(readfd);
+	if (id < 0)
+		EPRINTF("get_id failed, child %d err %d\n", child, errno);
+
+	return id;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Send a TAPDISK_MESSAGE_RESUME request for device @minor to the tapdisk
+ * identified by @id.
+ *
+ * @params is optional; when given it is copied (truncated if necessary)
+ * into the message's path field.
+ *
+ * Returns the transport error if the exchange fails, the error field of
+ * the reply when the reply is a TAPDISK_MESSAGE_RESUME_RSP, and EINVAL
+ * for any other reply type.
+ */
+int
+tap_ctl_unpause(const int id, const int minor, const char *params)
+{
+	tapdisk_message_t msg;
+	int rc;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.type   = TAPDISK_MESSAGE_RESUME;
+	msg.cookie = minor;
+
+	/*
+	 * The buffer was zeroed above and at most size - 1 bytes are
+	 * copied, so the path always stays NUL-terminated.
+	 */
+	if (params != NULL)
+		strncpy(msg.u.params.path, params,
+			sizeof(msg.u.params.path) - 1);
+
+	/* 15 is passed as the timeout argument of the exchange. */
+	rc = tap_ctl_connect_send_and_receive(id, &msg, 15);
+	if (rc)
+		return rc;
+
+	if (msg.type != TAPDISK_MESSAGE_RESUME_RSP) {
+		EPRINTF("got unexpected result '%s' from %d\n",
+			tapdisk_message_name(msg.type), id);
+		return EINVAL;
+	}
+
+	return msg.u.response.error;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <getopt.h>
+
+#include "tap-ctl.h"
+
+/* Signature shared by all sub-command handlers: (argc, argv) -> status. */
+typedef int (*tap_ctl_func_t) (int, char **);
+
+/* One entry of the sub-command dispatch table. */
+struct command {
+ char *name; /* sub-command name as typed on the command line */
+ tap_ctl_func_t func; /* handler invoked for that name */
+};
+
+/* Write the option summary of the "list" sub-command to @stream. */
+static void
+tap_cli_list_usage(FILE *stream)
+{
+	fputs("usage: list [-h] [-p pid] [-m minor] [-t type] [-f file]\n",
+	      stream);
+}
+
+/*
+ * Print one list entry as a fixed-width row on stdout:
+ * pid, minor, state (hexadecimal), type, path.
+ *
+ * Numeric fields equal to -1 and NULL strings are shown as "-".
+ */
+static void
+tap_ctl_list_row(tap_list_t *entry)
+{
+	/*
+	 * A 32-bit int needs up to 11 characters plus the terminating NUL
+	 * in decimal ("-2147483648"); 16 bytes leaves headroom, and
+	 * snprintf() bounds the write regardless of the value.
+	 */
+	char minor_str[16] = "-";
+	char state_str[16] = "-";
+	char pid_str[16] = "-";
+
+	if (entry->pid != -1)
+		snprintf(pid_str, sizeof(pid_str), "%d", entry->pid);
+
+	if (entry->minor != -1)
+		snprintf(minor_str, sizeof(minor_str), "%d", entry->minor);
+
+	if (entry->state != -1)
+		snprintf(state_str, sizeof(state_str), "%x", entry->state);
+
+	printf("%8s %2s %4s %10s %s\n",
+	       pid_str, minor_str, state_str,
+	       entry->type ? : "-", entry->path ? : "-");
+}
+
+/*
+ * Print one list entry on stdout as space-separated key=value pairs
+ * followed by a newline.
+ *
+ * pid, minor and state are omitted when they are -1; "args=type:path"
+ * is emitted only when both strings are present.
+ */
+static void
+tap_ctl_list_dict(tap_list_t *entry)
+{
+	/* Becomes non-zero once a field was printed and a separator is due. */
+	int printed = 0;
+
+	if (entry->pid != -1) {
+		if (printed)
+			putc(' ', stdout);
+		printed = printf("pid=%d", entry->pid);
+	}
+
+	if (entry->minor != -1) {
+		if (printed)
+			putc(' ', stdout);
+		printed = printf("minor=%d", entry->minor);
+	}
+
+	if (entry->state != -1) {
+		if (printed)
+			putc(' ', stdout);
+		printed = printf("state=%d", entry->state);
+	}
+
+	if (entry->type != NULL && entry->path != NULL) {
+		if (printed)
+			putc(' ', stdout);
+		printf("args=%s:%s", entry->type, entry->path);
+	}
+
+	putc('\n', stdout);
+}
+
+/*
+ * "list" sub-command: print the entries returned by tap_ctl_list(),
+ * optionally filtered by minor (-m), pid (-p), type (-t) and file (-f).
+ *
+ * Rows are printed when stdout is a terminal, key=value pairs otherwise.
+ *
+ * Returns 0 on success (and for -h), EINVAL on an unknown option, or the
+ * negated result of tap_ctl_list() when the list cannot be obtained.
+ */
+int
+tap_cli_list(int argc, char **argv)
+{
+	tap_list_t **list, **_entry;
+	int c, minor, tty, err;
+	const char *type, *file;
+	pid_t pid;
+
+	pid = -1;
+	minor = -1;
+	type = NULL;
+	file = NULL;
+
+	/*
+	 * Parse options before fetching the list, so the early returns for
+	 * -h and for bad options cannot leak it.
+	 */
+	while ((c = getopt(argc, argv, "m:p:t:f:h")) != -1) {
+		switch (c) {
+		case 'm':
+			minor = atoi(optarg);
+			break;
+		case 'p':
+			pid = atoi(optarg);
+			break;
+		case 't':
+			type = optarg;
+			break;
+		case 'f':
+			file = optarg;
+			break;
+		case '?':
+			goto usage;
+		case 'h':
+			tap_cli_list_usage(stdout);
+			return 0;
+		}
+	}
+
+	err = tap_ctl_list(&list);
+	if (err)
+		return -err;
+
+	tty = isatty(STDOUT_FILENO);
+
+	/* The list is iterated until its NULL terminator. */
+	for (_entry = list; *_entry != NULL; ++_entry) {
+		tap_list_t *entry = *_entry;
+
+		if (minor >= 0 && entry->minor != minor)
+			continue;
+
+		if (pid >= 0 && entry->pid != pid)
+			continue;
+
+		if (type && (!entry->type || strcmp(entry->type, type)))
+			continue;
+
+		if (file && (!entry->path || strcmp(entry->path, file)))
+			continue;
+
+		if (tty)
+			tap_ctl_list_row(entry);
+		else
+			tap_ctl_list_dict(entry);
+	}
+
+	tap_ctl_free_list(list);
+
+	return 0;
+
+usage:
+	tap_cli_list_usage(stderr);
+	return EINVAL;
+}
+
+/* Write the option summary of the "allocate" sub-command to @stream. */
+static void
+tap_cli_allocate_usage(FILE *stream)
+{
+	/* The original text ended in a stray '>' after the closing bracket. */
+	fprintf(stream, "usage: allocate [-d device name]\n");
+}
+
+/*
+ * "allocate" sub-command: call tap_ctl_allocate() and, on success, print
+ * the resulting device name on stdout.
+ *
+ * -d passes an initial device name; -h prints usage and returns 0.
+ * Returns EINVAL on an unknown option, otherwise the result of
+ * tap_ctl_allocate().
+ */
+static int
+tap_cli_allocate(int argc, char **argv)
+{
+	char *devname = NULL;
+	int opt, minor, rc;
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "d:h")) != -1) {
+		if (opt == 'd') {
+			devname = optarg;
+		} else if (opt == 'h') {
+			tap_cli_allocate_usage(stdout);
+			return 0;
+		} else if (opt == '?') {
+			tap_cli_allocate_usage(stderr);
+			return EINVAL;
+		}
+	}
+
+	rc = tap_ctl_allocate(&minor, &devname);
+	if (rc == 0)
+		printf("%s\n", devname);
+
+	return rc;
+}
+
+/* Write the option summary of the "free" sub-command to @stream. */
+static void
+tap_cli_free_usage(FILE *stream)
+{
+	fputs("usage: free <-m minor>\n", stream);
+}
+
+/*
+ * "free" sub-command: pass the minor given with -m to tap_ctl_free().
+ *
+ * -h prints usage and returns 0.  Returns EINVAL when -m is missing or an
+ * unknown option is seen, otherwise the result of tap_ctl_free().
+ */
+static int
+tap_cli_free(int argc, char **argv)
+{
+	int minor = -1;
+	int opt;
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "m:h")) != -1) {
+		if (opt == 'm') {
+			minor = atoi(optarg);
+		} else if (opt == 'h') {
+			tap_cli_free_usage(stdout);
+			return 0;
+		} else if (opt == '?') {
+			tap_cli_free_usage(stderr);
+			return EINVAL;
+		}
+	}
+
+	if (minor == -1) {
+		tap_cli_free_usage(stderr);
+		return EINVAL;
+	}
+
+	return tap_ctl_free(minor);
+}
+
+/* Write the option summary of the "create" sub-command to @stream. */
+static void
+tap_cli_create_usage(FILE *stream)
+{
+	fputs("usage: create <-a args> [-d device name]\n", stream);
+}
+
+/*
+ * "create" sub-command: call tap_ctl_create() with the -a argument string
+ * and, on success, print the resulting device name on stdout.
+ *
+ * -d passes an initial device name; -h prints usage and returns 0.
+ * Returns EINVAL when -a is missing or an unknown option is seen,
+ * otherwise the result of tap_ctl_create().
+ */
+static int
+tap_cli_create(int argc, char **argv)
+{
+	char *args = NULL;
+	char *devname = NULL;
+	int opt, rc;
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "a:d:h")) != -1) {
+		if (opt == 'a') {
+			args = optarg;
+		} else if (opt == 'd') {
+			devname = optarg;
+		} else if (opt == 'h') {
+			tap_cli_create_usage(stdout);
+			return 0;
+		} else if (opt == '?') {
+			tap_cli_create_usage(stderr);
+			return EINVAL;
+		}
+	}
+
+	if (args == NULL) {
+		tap_cli_create_usage(stderr);
+		return EINVAL;
+	}
+
+	rc = tap_ctl_create(args, &devname);
+	if (rc == 0)
+		printf("%s\n", devname);
+
+	return rc;
+}
+
+/* Write the option summary of the "destroy" sub-command to @stream. */
+static void
+tap_cli_destroy_usage(FILE *stream)
+{
+	fputs("usage: destroy <-p pid> <-m minor>\n", stream);
+}
+
+/*
+ * "destroy" sub-command: pass the -p and -m values to tap_ctl_destroy().
+ *
+ * -h prints usage and returns 0.  Returns EINVAL when either option is
+ * missing or an unknown option is seen, otherwise the result of
+ * tap_ctl_destroy().
+ */
+static int
+tap_cli_destroy(int argc, char **argv)
+{
+	int pid = -1;
+	int minor = -1;
+	int opt;
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "p:m:h")) != -1) {
+		if (opt == 'p') {
+			pid = atoi(optarg);
+		} else if (opt == 'm') {
+			minor = atoi(optarg);
+		} else if (opt == 'h') {
+			tap_cli_destroy_usage(stdout);
+			return 0;
+		} else if (opt == '?') {
+			tap_cli_destroy_usage(stderr);
+			return EINVAL;
+		}
+	}
+
+	if (pid == -1 || minor == -1) {
+		tap_cli_destroy_usage(stderr);
+		return EINVAL;
+	}
+
+	return tap_ctl_destroy(pid, minor);
+}
+
+/* Write the usage line of the "spawn" sub-command to @stream. */
+static void
+tap_cli_spawn_usage(FILE *stream)
+{
+	fputs("usage: spawn\n", stream);
+}
+
+/*
+ * "spawn" sub-command: call tap_ctl_spawn() and report the outcome on
+ * stdout.
+ *
+ * -h prints usage and returns 0.  Returns EINVAL on an unknown option,
+ * the negative value from tap_ctl_spawn() on failure, and 0 on success.
+ */
+static int
+tap_cli_spawn(int argc, char **argv)
+{
+	pid_t spawned;
+	int opt;
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "h")) != -1) {
+		if (opt == 'h') {
+			tap_cli_spawn_usage(stdout);
+			return 0;
+		}
+		if (opt == '?') {
+			tap_cli_spawn_usage(stderr);
+			return EINVAL;
+		}
+	}
+
+	spawned = tap_ctl_spawn();
+	if (spawned < 0) {
+		printf("spawn failed: %d\n", errno);
+		return spawned;
+	}
+
+	printf("tapdisk spawned with pid %d\n", spawned);
+	return 0;
+}
+
+/* Write the option summary of the "attach" sub-command to @stream. */
+static void
+tap_cli_attach_usage(FILE *stream)
+{
+	fputs("usage: attach <-p pid> <-m minor>\n", stream);
+}
+
+/*
+ * "attach" sub-command: pass the -p and -m values to tap_ctl_attach().
+ *
+ * -h prints usage on stdout and returns 0.  Returns EINVAL (with usage on
+ * stderr) when either option is missing or an unknown option is seen,
+ * otherwise the result of tap_ctl_attach().
+ */
+static int
+tap_cli_attach(int argc, char **argv)
+{
+	int c, pid, minor;
+
+	pid = -1;
+	minor = -1;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, "p:m:h")) != -1) {
+		switch (c) {
+		case 'p':
+			pid = atoi(optarg);
+			break;
+		case 'm':
+			minor = atoi(optarg);
+			break;
+		case '?':
+			goto usage;
+		case 'h':
+			/*
+			 * Requested help is regular output: use stdout, as
+			 * every other sub-command in this file does.
+			 */
+			tap_cli_attach_usage(stdout);
+			return 0;
+		}
+	}
+
+	if (pid == -1 || minor == -1)
+		goto usage;
+
+	return tap_ctl_attach(pid, minor);
+
+usage:
+	tap_cli_attach_usage(stderr);
+	return EINVAL;
+}
+
+/* Write the option summary of the "detach" sub-command to @stream. */
+static void
+tap_cli_detach_usage(FILE *stream)
+{
+	fputs("usage: detach <-p pid> <-m minor>\n", stream);
+}
+
+/*
+ * "detach" sub-command: pass the -p and -m values to tap_ctl_detach().
+ *
+ * -h prints usage and returns 0.  Returns EINVAL when either option is
+ * missing or an unknown option is seen, otherwise the result of
+ * tap_ctl_detach().
+ */
+static int
+tap_cli_detach(int argc, char **argv)
+{
+	int pid = -1;
+	int minor = -1;
+	int opt;
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "p:m:h")) != -1) {
+		if (opt == 'p') {
+			pid = atoi(optarg);
+		} else if (opt == 'm') {
+			minor = atoi(optarg);
+		} else if (opt == 'h') {
+			tap_cli_detach_usage(stdout);
+			return 0;
+		} else if (opt == '?') {
+			tap_cli_detach_usage(stderr);
+			return EINVAL;
+		}
+	}
+
+	if (pid == -1 || minor == -1) {
+		tap_cli_detach_usage(stderr);
+		return EINVAL;
+	}
+
+	return tap_ctl_detach(pid, minor);
+}
+
+/* Write the option summary of the "close" sub-command to @stream. */
+static void
+tap_cli_close_usage(FILE *stream)
+{
+	fputs("usage: close <-p pid> <-m minor> [-f force]\n", stream);
+}
+
+/*
+ * "close" sub-command: pass the -p and -m values, plus the force flag,
+ * to tap_ctl_close().
+ *
+ * -f takes no argument and sets the force value to -1 (0 otherwise);
+ * -h prints usage and returns 0.  Returns EINVAL when -p or -m is
+ * missing or an unknown option is seen, otherwise the result of
+ * tap_ctl_close().
+ */
+static int
+tap_cli_close(int argc, char **argv)
+{
+	int pid = -1;
+	int minor = -1;
+	int force = 0;
+	int opt;
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "p:m:fh")) != -1) {
+		if (opt == 'p') {
+			pid = atoi(optarg);
+		} else if (opt == 'm') {
+			minor = atoi(optarg);
+		} else if (opt == 'f') {
+			force = -1;
+		} else if (opt == 'h') {
+			tap_cli_close_usage(stdout);
+			return 0;
+		} else if (opt == '?') {
+			tap_cli_close_usage(stderr);
+			return EINVAL;
+		}
+	}
+
+	if (pid == -1 || minor == -1) {
+		tap_cli_close_usage(stderr);
+		return EINVAL;
+	}
+
+	return tap_ctl_close(pid, minor, force);
+}
+
+/* Write the option summary of the "pause" sub-command to @stream. */
+static void
+tap_cli_pause_usage(FILE *stream)
+{
+	fputs("usage: pause <-p pid> <-m minor>\n", stream);
+}
+
+/*
+ * "pause" sub-command: pass the -p and -m values to tap_ctl_pause().
+ *
+ * -h prints usage and returns 0.  Returns EINVAL when either option is
+ * missing or an unknown option is seen, otherwise the result of
+ * tap_ctl_pause().
+ */
+static int
+tap_cli_pause(int argc, char **argv)
+{
+	int pid = -1;
+	int minor = -1;
+	int opt;
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "p:m:h")) != -1) {
+		if (opt == 'p') {
+			pid = atoi(optarg);
+		} else if (opt == 'm') {
+			minor = atoi(optarg);
+		} else if (opt == 'h') {
+			tap_cli_pause_usage(stdout);
+			return 0;
+		} else if (opt == '?') {
+			tap_cli_pause_usage(stderr);
+			return EINVAL;
+		}
+	}
+
+	if (pid == -1 || minor == -1) {
+		tap_cli_pause_usage(stderr);
+		return EINVAL;
+	}
+
+	return tap_ctl_pause(pid, minor);
+}
+
+/* Write the option summary of the "unpause" sub-command to @stream. */
+static void
+tap_cli_unpause_usage(FILE *stream)
+{
+	fputs("usage: unpause <-p pid> <-m minor> [-a args]\n", stream);
+}
+
+/*
+ * "unpause" sub-command: pass the -p and -m values and the optional -a
+ * argument string to tap_ctl_unpause().
+ *
+ * -h prints usage and returns 0.  Returns EINVAL when -p or -m is
+ * missing or an unknown option is seen, otherwise the result of
+ * tap_ctl_unpause().
+ */
+int
+tap_cli_unpause(int argc, char **argv)
+{
+	const char *args = NULL;
+	int pid = -1;
+	int minor = -1;
+	int opt;
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "p:m:a:h")) != -1) {
+		if (opt == 'p') {
+			pid = atoi(optarg);
+		} else if (opt == 'm') {
+			minor = atoi(optarg);
+		} else if (opt == 'a') {
+			args = optarg;
+		} else if (opt == 'h') {
+			tap_cli_unpause_usage(stdout);
+			return 0;
+		} else if (opt == '?') {
+			tap_cli_unpause_usage(stderr);
+			return EINVAL;
+		}
+	}
+
+	if (pid == -1 || minor == -1) {
+		tap_cli_unpause_usage(stderr);
+		return EINVAL;
+	}
+
+	return tap_ctl_unpause(pid, minor, args);
+}
+
+/* Write the usage line of the "major" sub-command to @stream. */
+static void
+tap_cli_major_usage(FILE *stream)
+{
+	fputs("usage: major [-h]\n", stream);
+}
+
+/*
+ * "major" sub-command: print the number returned by tap_ctl_blk_major().
+ *
+ * -b selects the block device (the default); -c selects the character
+ * device, for which no lookup exists here, so the command fails with
+ * EINVAL.  -h prints usage and returns 0.
+ *
+ * Returns 0 after printing the number, EINVAL on any unrecognised
+ * option, or the negated failure value of the lookup.
+ */
+static int
+tap_cli_major(int argc, char **argv)
+{
+	int want_chr = 0;
+	int opt, major;
+
+	while ((opt = getopt(argc, argv, "bch")) != -1) {
+		if (opt == 'b') {
+			want_chr = 0;
+		} else if (opt == 'c') {
+			want_chr = 1;
+		} else if (opt == 'h') {
+			tap_cli_major_usage(stdout);
+			return 0;
+		} else {
+			/* '?' and anything else getopt might hand back */
+			tap_cli_major_usage(stderr);
+			return EINVAL;
+		}
+	}
+
+	major = want_chr ? -EINVAL : tap_ctl_blk_major();
+	if (major < 0)
+		return -major;
+
+	printf("%d\n", major);
+
+	return 0;
+}
+
+/* Write the option summary of the "open" sub-command to @stream. */
+static void
+tap_cli_open_usage(FILE *stream)
+{
+	fputs("usage: open <-p pid> <-m minor> <-a args>\n", stream);
+}
+
+/*
+ * "open" sub-command: pass the -p, -m and -a values to tap_ctl_open().
+ *
+ * -h prints usage and returns 0.  Returns EINVAL when any of the three
+ * options is missing or an unknown option is seen, otherwise the result
+ * of tap_ctl_open().
+ */
+static int
+tap_cli_open(int argc, char **argv)
+{
+	const char *args = NULL;
+	int pid = -1;
+	int minor = -1;
+	int opt;
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "a:m:p:h")) != -1) {
+		if (opt == 'p') {
+			pid = atoi(optarg);
+		} else if (opt == 'm') {
+			minor = atoi(optarg);
+		} else if (opt == 'a') {
+			args = optarg;
+		} else if (opt == 'h') {
+			tap_cli_open_usage(stdout);
+			return 0;
+		} else if (opt == '?') {
+			tap_cli_open_usage(stderr);
+			return EINVAL;
+		}
+	}
+
+	if (pid == -1 || minor == -1 || args == NULL) {
+		tap_cli_open_usage(stderr);
+		return EINVAL;
+	}
+
+	return tap_ctl_open(pid, minor, args);
+}
+
+/* Write the usage text of the "check" sub-command to @stream. */
+static void
+tap_cli_check_usage(FILE *stream)
+{
+	fputs("usage: check\n"
+	      "(checks whether environment is suitable for tapdisk2)\n",
+	      stream);
+}
+
+/*
+ * "check" sub-command: run tap_ctl_check() and print the message it
+ * hands back.
+ *
+ * Takes no arguments; anything beyond the command name yields the usage
+ * text on stderr and EINVAL.  Otherwise returns the result of
+ * tap_ctl_check().
+ */
+static int
+tap_cli_check(int argc, char **argv)
+{
+	const char *msg;
+	int rc;
+
+	if (argc != 1) {
+		tap_cli_check_usage(stderr);
+		return EINVAL;
+	}
+
+	rc = tap_ctl_check(&msg);
+	printf("%s\n", msg);
+
+	return rc;
+}
+
+/*
+ * Dispatch table mapping each sub-command name to its handler.
+ * get_command() searches it by name and print_commands() lists it in
+ * this order.
+ */
+struct command commands[] = {
+ { .name = "list", .func = tap_cli_list },
+ { .name = "allocate", .func = tap_cli_allocate },
+ { .name = "free", .func = tap_cli_free },
+ { .name = "create", .func = tap_cli_create },
+ { .name = "destroy", .func = tap_cli_destroy },
+ { .name = "spawn", .func = tap_cli_spawn },
+ { .name = "attach", .func = tap_cli_attach },
+ { .name = "detach", .func = tap_cli_detach },
+ { .name = "open", .func = tap_cli_open },
+ { .name = "close", .func = tap_cli_close },
+ { .name = "pause", .func = tap_cli_pause },
+ { .name = "unpause", .func = tap_cli_unpause },
+ { .name = "major", .func = tap_cli_major },
+ { .name = "check", .func = tap_cli_check },
+};
+
+/*
+ * Print "COMMAND := { a | b | ... }" on stdout, listing every name in
+ * the commands[] table in table order.
+ */
+#define print_commands()						\
+	do {								\
+		int _idx;						\
+		int _count = sizeof(commands) / sizeof(commands[0]);	\
+		printf("COMMAND := { %s", commands[0].name);		\
+		for (_idx = 1; _idx < _count; _idx++)			\
+			printf(" | %s", commands[_idx].name);		\
+		printf(" }\n");						\
+	} while (0)
+
+/*
+ * Print the top-level usage line and the list of sub-commands on stdout,
+ * then terminate the process with exit status 0.  Does not return.
+ */
+void
+help(void)
+{
+	puts("usage: tap-ctl COMMAND [OPTIONS]");
+	print_commands();
+	exit(0);
+}
+
+/*
+ * Look up @command in the commands[] table by exact name.
+ *
+ * Returns a pointer to the matching table entry, or NULL when there is
+ * no match or the name is 25 characters or longer.
+ */
+struct command *
+get_command(char *command)
+{
+	struct command *cur;
+	struct command *end =
+		commands + sizeof(commands) / sizeof(commands[0]);
+
+	/* Reject over-long names before comparing against the table. */
+	if (strnlen(command, 25) >= 25)
+		return NULL;
+
+	for (cur = commands; cur != end; cur++) {
+		if (strcmp(command, cur->name) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Entry point: "tap-ctl COMMAND [OPTIONS]".
+ *
+ * Resolves COMMAND through get_command(), runs tap_ctl_check() as a
+ * precondition, then calls the handler with an argument vector whose
+ * element 0 is the command name and whose remaining elements are the
+ * original arguments minus every literal "--debug" (which instead sets
+ * tap_ctl_debug).
+ *
+ * With no command or an unknown one, help() prints usage and exits.
+ * The process exit status is the absolute value of the handler's result,
+ * or the unmodified tap_ctl_check() result if that check fails.
+ */
+int
+main(int argc, char *argv[])
+{
+	struct command *cmd;
+	const char *msg;
+	char **sub_argv;
+	int sub_argc, idx, ret;
+
+#ifdef CORE_DUMP
+	#include <sys/resource.h>
+	struct rlimit rlim;
+	rlim.rlim_cur = RLIM_INFINITY;
+	rlim.rlim_max = RLIM_INFINITY;
+	if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+		PERROR("setrlimit failed");
+#endif
+
+	if (argc < 2)
+		help();
+
+	cmd = get_command(argv[1]);
+	if (cmd == NULL) {
+		EPRINTF("invalid COMMAND %s", argv[1]);
+		help();
+	}
+
+	ret = tap_ctl_check(&msg);
+	if (ret != 0) {
+		printf("%s\n", msg);
+		return ret;
+	}
+
+	/* One slot per original argument after the program name. */
+	sub_argv = malloc(sizeof(char *) * (argc - 1));
+	if (sub_argv == NULL)
+		exit(ENOMEM);
+
+	sub_argv[0] = cmd->name;
+	sub_argc = 1;
+
+	/* argv[0] is the program and argv[1] the command; copy the rest. */
+	for (idx = 2; idx < argc; idx++) {
+		if (strcmp(argv[idx], "--debug") == 0) {
+			tap_ctl_debug = 1;
+			continue;
+		}
+
+		sub_argv[sub_argc++] = argv[idx];
+	}
+
+	ret = cmd->func(sub_argc, sub_argv);
+
+	free(sub_argv);
+
+	if (ret < 0)
+		ret = -ret;
+
+	return ret;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __TAP_CTL_H__
+#define __TAP_CTL_H__
+
+#include <syslog.h>
+#include <errno.h>
+#include <tapdisk-message.h>
+
+extern int tap_ctl_debug;
+
+/*
+ * Logging helpers, available only when TAPCTL is defined.
+ *
+ * DBG      - printf() to stdout, but only while tap_ctl_debug is non-zero.
+ * DPRINTF  - syslog() at LOG_INFO.
+ * EPRINTF  - syslog() at LOG_ERR, prefixed with "tap-err:<function>: ".
+ * PERROR   - like EPRINTF, with ": <strerror(errno)>" appended.
+ *
+ * NOTE(review): DBG uses printf() and PERROR uses strerror(), but this
+ * header itself includes neither <stdio.h> nor <string.h>; presumably
+ * the including file (or tapdisk-message.h) provides them -- confirm.
+ */
+#ifdef TAPCTL
+#define DBG(_f, _a...) \
+ do { \
+ if (tap_ctl_debug) \
+ printf(_f, ##_a); \
+ } while (0)
+
+#define DPRINTF(_f, _a...) syslog(LOG_INFO, _f, ##_a)
+#define EPRINTF(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a)
+#define PERROR(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f ": %s", __func__, ##_a, \
+ strerror(errno))
+#endif
+
+void tap_ctl_version(int *major, int *minor);
+int tap_ctl_kernel_version(int *major, int *minor);
+
+int tap_ctl_check_blktap(const char **message);
+int tap_ctl_check_version(const char **message);
+int tap_ctl_check(const char **message);
+
+int tap_ctl_connect(const char *path, int *socket);
+int tap_ctl_connect_id(int id, int *socket);
+int tap_ctl_read_message(int fd, tapdisk_message_t *message, int timeout);
+int tap_ctl_write_message(int fd, tapdisk_message_t *message, int timeout);
+int tap_ctl_send_and_receive(int fd, tapdisk_message_t *message, int timeout);
+int tap_ctl_connect_send_and_receive(int id,
+ tapdisk_message_t *message, int timeout);
+char *tap_ctl_socket_name(int id);
+
+/*
+ * One entry of the list produced by tap_ctl_list().
+ *
+ * The tap-ctl "list" command treats -1 in pid, minor or state and NULL
+ * in type or path as "not available".
+ */
+typedef struct {
+ int id; /* presumably the tapdisk control id -- TODO confirm */
+ pid_t pid; /* process id, or -1 */
+ int minor; /* device minor number, or -1 */
+ int state; /* state value (shown in hex by the row listing), or -1 */
+ char *type; /* type string, may be NULL */
+ char *path; /* path string, may be NULL */
+} tap_list_t;
+
+int tap_ctl_get_driver_id(const char *handle);
+
+int tap_ctl_list(tap_list_t ***list);
+void tap_ctl_free_list(tap_list_t **list);
+int tap_ctl_find(const char *type, const char *path, tap_list_t *tap);
+
+int tap_ctl_allocate(int *minor, char **devname);
+int tap_ctl_free(const int minor);
+
+int tap_ctl_create(const char *params, char **devname);
+int tap_ctl_destroy(const int id, const int minor);
+
+int tap_ctl_spawn(void);
+pid_t tap_ctl_get_pid(const int id);
+
+int tap_ctl_attach(const int id, const int minor);
+int tap_ctl_detach(const int id, const int minor);
+
+int tap_ctl_open(const int id, const int minor, const char *params);
+int tap_ctl_close(const int id, const int minor, const int force);
+
+int tap_ctl_pause(const int id, const int minor);
+int tap_ctl_unpause(const int id, const int minor, const char *params);
+
+int tap_ctl_blk_major(void);
+
+#endif
--- /dev/null
+# Locate the top of the Xen tree and pull in the shared tools build rules.
+XEN_ROOT=$(CURDIR)/../../..
+BLKTAP_ROOT= ..
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Directory searched for libvhd at link time (see VHDLIBS).
+LIBVHDDIR = $(BLKTAP_ROOT)/vhd/lib
+
+# Programs built by "all" and copied to INST_DIR by "install".
+IBIN = tapdisk2 td-util tapdisk-client tapdisk-stream tapdisk-diff
+QCOW_UTIL = img2qcow qcow-create qcow2raw
+LOCK_UTIL = lock-util
+INST_DIR = $(sbindir)
+
+# Compiler flags: warnings are errors (unused-entity warnings are silenced),
+# strict-aliasing optimisations are off, and the blktap include and drivers
+# directories are on the include path.
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+CFLAGS += -fno-strict-aliasing
+CFLAGS += -I$(BLKTAP_ROOT)/include -I$(BLKTAP_ROOT)/drivers
+CFLAGS += $(CFLAGS_libxenctrl)
+CFLAGS += -D_GNU_SOURCE
+CFLAGS += -DUSE_NFS_LOCKS
+# drivers/block-log.c incorrectly uses libxc internals
+CFLAGS += -I$(XEN_ROOT)/tools/libxc
+
+# Build position-independent code on x86_64.
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS += -fPIC
+endif
+
+# Link line for libvhd.
+VHDLIBS := -L$(LIBVHDDIR) -lvhd
+
+# Objects for the remus block driver and the hashtable code listed with it.
+REMUS-OBJS := block-remus.o
+REMUS-OBJS += hashtable.o
+REMUS-OBJS += hashtable_itr.o
+REMUS-OBJS += hashtable_utility.o
+
+# Target-specific variable: only these programs link against libaio.
+tapdisk2 tapdisk-stream tapdisk-diff $(QCOW_UTIL): AIOLIBS := -laio
+
+# Memory-sharing support.  The condition compares CONFIG_Linux against the
+# literal "__fixme__", so this section is presumably disabled on purpose
+# until someone revisits it -- confirm before relying on MEMSHR.
+MEMSHRLIBS :=
+ifeq ($(CONFIG_Linux), __fixme__)
+MEMSHR_DIR = $(XEN_ROOT)/tools/memshr
+CFLAGS += -DMEMSHR
+CFLAGS += -I $(MEMSHR_DIR)
+MEMSHRLIBS += -L$(XEN_ROOT)/tools/libxc -lxenctrl $(MEMSHR_DIR)/libmemshr.a
+endif
+
+# Optionally compile td-util with -static.
+ifeq ($(VHD_STATIC),y)
+td-util: CFLAGS += -static
+endif
+
+# OS-specific object, selected by the CONFIG_<os> variables.
+PORTABLE-OBJS-y :=
+PORTABLE-OBJS-$(CONFIG_Linux) += blk_linux.o
+PORTABLE-OBJS-$(CONFIG_NetBSD) += blk_netbsd.o
+
+# Core tapdisk objects linked into tapdisk2 and the stream/diff/qcow tools.
+TAP-OBJS-y := scheduler.o
+TAP-OBJS-y += tapdisk-vbd.o
+TAP-OBJS-y += tapdisk-control.o
+TAP-OBJS-y += tapdisk-image.o
+TAP-OBJS-y += tapdisk-driver.o
+TAP-OBJS-y += tapdisk-disktype.o
+TAP-OBJS-y += tapdisk-interface.o
+TAP-OBJS-y += tapdisk-server.o
+TAP-OBJS-y += tapdisk-queue.o
+TAP-OBJS-y += tapdisk-filter.o
+TAP-OBJS-y += tapdisk-log.o
+TAP-OBJS-y += tapdisk-utils.o
+TAP-OBJS-y += io-optimize.o
+TAP-OBJS-y += lock.o
+TAP-OBJS-y += $(PORTABLE-OBJS-y)
+
+# Extra object linked only into tapdisk2.
+MISC-OBJS-y := atomicio.o
+
+# Block drivers and the objects listed alongside them.
+# NOTE(review): $(PORTABLE-OBJS-y) is also part of TAP-OBJS-y, so rules
+# that list both variables name that object twice; make drops duplicates
+# from $^, so this looks harmless -- confirm.
+BLK-OBJS-y := block-aio.o
+BLK-OBJS-y += block-ram.o
+BLK-OBJS-y += block-cache.o
+BLK-OBJS-y += block-vhd.o
+BLK-OBJS-y += block-log.o
+BLK-OBJS-y += block-qcow.o
+BLK-OBJS-y += aes.o
+BLK-OBJS-y += md5.o
+BLK-OBJS-y += $(PORTABLE-OBJS-y)
+BLK-OBJS-y += $(REMUS-OBJS)
+
+# Default target: every installable program plus the lock and qcow tools.
+all: $(IBIN) lock-util qcow-util
+
+
+# The tapdisk2 daemon.
+tapdisk2: $(TAP-OBJS-y) $(BLK-OBJS-y) $(MISC-OBJS-y) tapdisk2.o
+	$(CC) -o $@ $^ $(LDFLAGS) -lrt -lz $(VHDLIBS) $(AIOLIBS) $(MEMSHRLIBS) -lm $(APPEND_LDFLAGS)
+
+tapdisk-client: tapdisk-client.o
+	$(CC) -o $@ $^ $(LDFLAGS) -lrt $(APPEND_LDFLAGS)
+
+# Static pattern rule: each tool is its own object plus the shared ones.
+tapdisk-stream tapdisk-diff: %: %.o $(TAP-OBJS-y) $(BLK-OBJS-y)
+	$(CC) -o $@ $^ $(LDFLAGS) -lrt -lz $(VHDLIBS) $(AIOLIBS) $(MEMSHRLIBS) -lm $(APPEND_LDFLAGS)
+
+td-util: td.o tapdisk-utils.o tapdisk-log.o $(PORTABLE-OBJS-y)
+	$(CC) -o $@ $^ $(LDFLAGS) $(VHDLIBS) $(APPEND_LDFLAGS)
+
+# lock-util is lock.c compiled directly with -DUTIL.
+lock-util: lock.c
+	$(CC) $(CFLAGS) -DUTIL -o lock-util lock.c $(LDFLAGS) $(APPEND_LDFLAGS)
+
+# Convenience alias that builds the three qcow tools.
+.PHONY: qcow-util
+qcow-util: img2qcow qcow2raw qcow-create
+
+img2qcow qcow2raw qcow-create: %: %.o $(TAP-OBJS-y) $(BLK-OBJS-y)
+	$(CC) -o $@ $^ $(LDFLAGS) -lrt -lz $(VHDLIBS) $(AIOLIBS) $(MEMSHRLIBS) -lm $(APPEND_LDFLAGS)
+
+# Install every built program into $(DESTDIR)$(INST_DIR).
+install: all
+	$(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+	$(INSTALL_PROG) $(IBIN) $(LOCK_UTIL) $(QCOW_UTIL) $(DESTDIR)$(INST_DIR)
+
+# Remove dependency files, objects, editor backups and built programs.
+clean:
+	rm -rf .*.d *.o *~ xen TAGS $(IBIN) $(LIB) $(LOCK_UTIL) $(QCOW_UTIL)
+
+distclean: clean
+
+# "all" is declared phony too, so a stray file named "all" cannot make the
+# default target look up to date.
+.PHONY: all clean install distclean
--- /dev/null
+/**\r
+ * \r
+ * aes.c - integrated in QEMU by Fabrice Bellard from the OpenSSL project.\r
+ */\r
+/*\r
+ * rijndael-alg-fst.c\r
+ *\r
+ * @version 3.0 (December 2000)\r
+ *\r
+ * Optimised ANSI C code for the Rijndael cipher (now AES)\r
+ *\r
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>\r
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>\r
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>\r
+ *\r
+ * This code is hereby placed in the public domain.\r
+ *\r
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS\r
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\r
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE\r
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\r
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\r
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR\r
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\r
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE\r
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,\r
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+ */\r
+//#include "vl.h"\r
+#include <inttypes.h>\r
+#include <string.h>\r
+#include "aes.h"\r
+\r
+//#define NDEBUG\r
+#include <assert.h>\r
+\r
+/* Short fixed-width aliases used throughout this file. */\r
+typedef uint32_t u32;\r
+typedef uint16_t u16;\r
+typedef uint8_t u8;\r
+\r
+/*\r
+ * Size limits for a 256-bit key: 8 32-bit words, 32 bytes.\r
+ * MAXNR is presumably the maximum number of cipher rounds -- TODO confirm.\r
+ */\r
+#define MAXKC (256/32)\r
+#define MAXKB (256/8)\r
+#define MAXNR 14\r
+\r
+/* This controls loop-unrolling in aes_core.c */\r
+#undef FULL_UNROLL\r
+/*\r
+ * GETU32 assembles a u32 from four consecutive bytes, first byte most\r
+ * significant; PUTU32 stores a u32 back in the same byte order.\r
+ */\r
+# define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3]))\r
+# define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); }\r
+\r
+/*\r
+Te0[x] = S [x].[02, 01, 01, 03];\r
+Te1[x] = S [x].[03, 02, 01, 01];\r
+Te2[x] = S [x].[01, 03, 02, 01];\r
+Te3[x] = S [x].[01, 01, 03, 02];\r
+Te4[x] = S [x].[01, 01, 01, 01];\r
+\r
+Td0[x] = Si[x].[0e, 09, 0d, 0b];\r
+Td1[x] = Si[x].[0b, 0e, 09, 0d];\r
+Td2[x] = Si[x].[0d, 0b, 0e, 09];\r
+Td3[x] = Si[x].[09, 0d, 0b, 0e];\r
+Td4[x] = Si[x].[01, 01, 01, 01];\r
+*/\r
+\r
+/*\r
+ * Encryption lookup table 0: entry x packs the S-box value S[x] multiplied\r
+ * by the coefficients 02, 01, 01, 03, most significant byte first (for\r
+ * example S[0] = 0x63 yields 0xc66363a5).\r
+ */\r
+static const u32 Te0[256] = {\r
+ 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,\r
+ 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,\r
+ 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,\r
+ 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,\r
+ 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,\r
+ 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,\r
+ 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,\r
+ 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,\r
+ 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,\r
+ 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,\r
+ 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,\r
+ 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,\r
+ 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,\r
+ 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,\r
+ 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,\r
+ 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,\r
+ 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,\r
+ 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,\r
+ 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,\r
+ 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,\r
+ 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,\r
+ 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,\r
+ 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,\r
+ 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,\r
+ 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,\r
+ 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,\r
+ 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,\r
+ 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,\r
+ 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,\r
+ 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,\r
+ 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,\r
+ 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,\r
+ 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,\r
+ 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,\r
+ 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,\r
+ 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,\r
+ 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,\r
+ 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,\r
+ 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,\r
+ 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,\r
+ 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,\r
+ 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,\r
+ 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,\r
+ 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,\r
+ 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,\r
+ 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,\r
+ 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,\r
+ 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,\r
+ 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,\r
+ 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,\r
+ 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,\r
+ 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,\r
+ 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,\r
+ 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,\r
+ 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,\r
+ 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,\r
+ 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,\r
+ 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,\r
+ 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,\r
+ 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,\r
+ 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,\r
+ 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,\r
+ 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,\r
+ 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,\r
+};\r
+/*\r
+ * Encryption lookup table 1: entry x packs S[x] multiplied by the\r
+ * coefficients 03, 02, 01, 01, most significant byte first; each entry is\r
+ * the matching Te0 entry rotated right by 8 bits.\r
+ */\r
+static const u32 Te1[256] = {\r
+ 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,\r
+ 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,\r
+ 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,\r
+ 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,\r
+ 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,\r
+ 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,\r
+ 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,\r
+ 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,\r
+ 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,\r
+ 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,\r
+ 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,\r
+ 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,\r
+ 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,\r
+ 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,\r
+ 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,\r
+ 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,\r
+ 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,\r
+ 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,\r
+ 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,\r
+ 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,\r
+ 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,\r
+ 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,\r
+ 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,\r
+ 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,\r
+ 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,\r
+ 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,\r
+ 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,\r
+ 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,\r
+ 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,\r
+ 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,\r
+ 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,\r
+ 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,\r
+ 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,\r
+ 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,\r
+ 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,\r
+ 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,\r
+ 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,\r
+ 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,\r
+ 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,\r
+ 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,\r
+ 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,\r
+ 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,\r
+ 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,\r
+ 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,\r
+ 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,\r
+ 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,\r
+ 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,\r
+ 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,\r
+ 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,\r
+ 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,\r
+ 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,\r
+ 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,\r
+ 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,\r
+ 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,\r
+ 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,\r
+ 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,\r
+ 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,\r
+ 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,\r
+ 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,\r
+ 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,\r
+ 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,\r
+ 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,\r
+ 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,\r
+ 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,\r
+};\r
+static const u32 Te2[256] = { /* Te2[x] = S[x] times (01, 03, 02, 01) in GF(2^8), most-significant byte first; S is the AES S-box */\r
+ 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,\r
+ 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,\r
+ 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,\r
+ 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,\r
+ 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,\r
+ 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,\r
+ 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,\r
+ 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,\r
+ 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,\r
+ 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,\r
+ 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,\r
+ 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,\r
+ 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,\r
+ 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,\r
+ 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,\r
+ 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,\r
+ 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,\r
+ 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,\r
+ 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,\r
+ 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,\r
+ 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,\r
+ 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,\r
+ 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,\r
+ 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,\r
+ 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,\r
+ 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,\r
+ 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,\r
+ 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,\r
+ 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,\r
+ 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,\r
+ 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,\r
+ 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,\r
+ 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,\r
+ 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,\r
+ 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,\r
+ 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,\r
+ 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,\r
+ 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,\r
+ 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,\r
+ 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,\r
+ 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,\r
+ 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,\r
+ 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,\r
+ 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,\r
+ 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,\r
+ 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,\r
+ 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,\r
+ 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,\r
+ 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,\r
+ 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,\r
+ 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,\r
+ 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,\r
+ 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,\r
+ 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,\r
+ 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,\r
+ 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,\r
+ 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,\r
+ 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,\r
+ 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,\r
+ 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,\r
+ 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,\r
+ 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,\r
+ 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,\r
+ 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,\r
+};\r
+static const u32 Te3[256] = { /* Te3[x] = S[x] times (01, 01, 03, 02) in GF(2^8), most-significant byte first; S is the AES S-box */\r
+\r
+ 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,\r
+ 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,\r
+ 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,\r
+ 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,\r
+ 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,\r
+ 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,\r
+ 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,\r
+ 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,\r
+ 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,\r
+ 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,\r
+ 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,\r
+ 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,\r
+ 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,\r
+ 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,\r
+ 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,\r
+ 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,\r
+ 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,\r
+ 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,\r
+ 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,\r
+ 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,\r
+ 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,\r
+ 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,\r
+ 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,\r
+ 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,\r
+ 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,\r
+ 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,\r
+ 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,\r
+ 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,\r
+ 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,\r
+ 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,\r
+ 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,\r
+ 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,\r
+ 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,\r
+ 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,\r
+ 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,\r
+ 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,\r
+ 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,\r
+ 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,\r
+ 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,\r
+ 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,\r
+ 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,\r
+ 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,\r
+ 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,\r
+ 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,\r
+ 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,\r
+ 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,\r
+ 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,\r
+ 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,\r
+ 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,\r
+ 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,\r
+ 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,\r
+ 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,\r
+ 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,\r
+ 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,\r
+ 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,\r
+ 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,\r
+ 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,\r
+ 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,\r
+ 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,\r
+ 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,\r
+ 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,\r
+ 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,\r
+ 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,\r
+ 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,\r
+};\r
+static const u32 Te4[256] = { /* Te4[x] = AES S-box byte S[x] replicated into all four byte lanes; callers mask out the lane they need */\r
+ 0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,\r
+ 0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,\r
+ 0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,\r
+ 0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,\r
+ 0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,\r
+ 0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,\r
+ 0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,\r
+ 0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,\r
+ 0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,\r
+ 0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,\r
+ 0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,\r
+ 0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,\r
+ 0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,\r
+ 0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,\r
+ 0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,\r
+ 0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,\r
+ 0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,\r
+ 0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,\r
+ 0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,\r
+ 0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,\r
+ 0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,\r
+ 0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,\r
+ 0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,\r
+ 0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,\r
+ 0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,\r
+ 0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,\r
+ 0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,\r
+ 0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,\r
+ 0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,\r
+ 0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,\r
+ 0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,\r
+ 0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,\r
+ 0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,\r
+ 0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,\r
+ 0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,\r
+ 0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,\r
+ 0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,\r
+ 0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,\r
+ 0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,\r
+ 0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,\r
+ 0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,\r
+ 0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,\r
+ 0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,\r
+ 0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,\r
+ 0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,\r
+ 0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,\r
+ 0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,\r
+ 0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,\r
+ 0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,\r
+ 0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,\r
+ 0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,\r
+ 0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,\r
+ 0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,\r
+ 0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,\r
+ 0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,\r
+ 0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,\r
+ 0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,\r
+ 0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,\r
+ 0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,\r
+ 0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,\r
+ 0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,\r
+ 0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,\r
+ 0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,\r
+ 0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,\r
+};\r
+static const u32 Td0[256] = { /* Td0[x] = Si[x] times (0e, 09, 0d, 0b) in GF(2^8), most-significant byte first; Si is the inverse AES S-box */\r
+ 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,\r
+ 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,\r
+ 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,\r
+ 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,\r
+ 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,\r
+ 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,\r
+ 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,\r
+ 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,\r
+ 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,\r
+ 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,\r
+ 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,\r
+ 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,\r
+ 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,\r
+ 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,\r
+ 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,\r
+ 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,\r
+ 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,\r
+ 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,\r
+ 0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,\r
+ 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,\r
+ 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,\r
+ 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,\r
+ 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,\r
+ 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,\r
+ 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,\r
+ 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,\r
+ 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,\r
+ 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,\r
+ 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,\r
+ 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,\r
+ 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,\r
+ 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,\r
+ 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,\r
+ 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,\r
+ 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,\r
+ 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,\r
+ 0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,\r
+ 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,\r
+ 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,\r
+ 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,\r
+ 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,\r
+ 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,\r
+ 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,\r
+ 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,\r
+ 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,\r
+ 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,\r
+ 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,\r
+ 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,\r
+ 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,\r
+ 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,\r
+ 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,\r
+ 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,\r
+ 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,\r
+ 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,\r
+ 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,\r
+ 0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,\r
+ 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,\r
+ 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,\r
+ 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,\r
+ 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,\r
+ 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,\r
+ 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,\r
+ 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,\r
+ 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,\r
+};\r
+static const u32 Td1[256] = { /* Td1[x] = Si[x] times (0b, 0e, 09, 0d) in GF(2^8), most-significant byte first; Si is the inverse AES S-box */\r
+ 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,\r
+ 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,\r
+ 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,\r
+ 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,\r
+ 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,\r
+ 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,\r
+ 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,\r
+ 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,\r
+ 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,\r
+ 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,\r
+ 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,\r
+ 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,\r
+ 0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,\r
+ 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,\r
+ 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,\r
+ 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,\r
+ 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,\r
+ 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,\r
+ 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,\r
+ 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,\r
+ 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,\r
+ 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,\r
+ 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,\r
+ 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,\r
+ 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,\r
+ 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,\r
+ 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,\r
+ 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,\r
+ 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,\r
+ 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,\r
+ 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,\r
+ 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,\r
+ 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,\r
+ 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,\r
+ 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,\r
+ 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,\r
+ 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,\r
+ 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,\r
+ 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,\r
+ 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,\r
+ 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,\r
+ 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,\r
+ 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,\r
+ 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,\r
+ 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,\r
+ 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,\r
+ 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,\r
+ 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,\r
+ 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,\r
+ 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,\r
+ 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,\r
+ 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,\r
+ 0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,\r
+ 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,\r
+ 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,\r
+ 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,\r
+ 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,\r
+ 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,\r
+ 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,\r
+ 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,\r
+ 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,\r
+ 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,\r
+ 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,\r
+ 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,\r
+};\r
+static const u32 Td2[256] = { /* Td2[x] = Si[x] times (0d, 0b, 0e, 09) in GF(2^8), most-significant byte first; Si is the inverse AES S-box */\r
+ 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,\r
+ 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,\r
+ 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,\r
+ 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,\r
+ 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,\r
+ 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,\r
+ 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,\r
+ 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,\r
+ 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,\r
+ 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,\r
+ 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,\r
+ 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,\r
+ 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,\r
+ 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,\r
+ 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,\r
+ 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,\r
+ 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,\r
+ 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,\r
+ 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,\r
+ 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,\r
+\r
+ 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,\r
+ 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,\r
+ 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,\r
+ 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,\r
+ 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,\r
+ 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,\r
+ 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,\r
+ 0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,\r
+ 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,\r
+ 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,\r
+ 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,\r
+ 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,\r
+ 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,\r
+ 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,\r
+ 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,\r
+ 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,\r
+ 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,\r
+ 0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,\r
+ 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,\r
+ 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,\r
+ 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,\r
+ 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,\r
+ 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,\r
+ 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,\r
+ 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,\r
+ 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,\r
+ 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,\r
+ 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,\r
+ 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,\r
+ 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,\r
+ 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,\r
+ 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,\r
+ 0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,\r
+ 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,\r
+ 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,\r
+ 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,\r
+ 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,\r
+ 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,\r
+ 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,\r
+ 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,\r
+ 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,\r
+ 0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,\r
+ 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,\r
+ 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,\r
+};\r
+static const u32 Td3[256] = { /* Td3[x] = Si[x] times (09, 0d, 0b, 0e) in GF(2^8), most-significant byte first; Si is the inverse AES S-box */\r
+ 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,\r
+ 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,\r
+ 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,\r
+ 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,\r
+ 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,\r
+ 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,\r
+ 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,\r
+ 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,\r
+ 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,\r
+ 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,\r
+ 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,\r
+ 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,\r
+ 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,\r
+ 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,\r
+ 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,\r
+ 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,\r
+ 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,\r
+ 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,\r
+ 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,\r
+ 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,\r
+ 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,\r
+ 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,\r
+ 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,\r
+ 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,\r
+ 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,\r
+ 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,\r
+ 0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,\r
+ 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,\r
+ 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,\r
+ 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,\r
+ 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,\r
+ 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,\r
+ 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,\r
+ 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,\r
+ 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,\r
+ 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,\r
+ 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,\r
+ 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,\r
+ 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,\r
+ 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,\r
+ 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,\r
+ 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,\r
+ 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,\r
+ 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,\r
+ 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,\r
+ 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,\r
+ 0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,\r
+ 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,\r
+ 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,\r
+ 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,\r
+ 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,\r
+ 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,\r
+ 0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,\r
+ 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,\r
+ 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,\r
+ 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,\r
+ 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,\r
+ 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,\r
+ 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,\r
+ 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,\r
+ 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,\r
+ 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,\r
+ 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,\r
+ 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,\r
+};\r
+static const u32 Td4[256] = { /* Td4[x] = inverse AES S-box byte Si[x] replicated into all four byte lanes */\r
+ 0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U,\r
+ 0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U,\r
+ 0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU,\r
+ 0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU,\r
+ 0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U,\r
+ 0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U,\r
+ 0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U,\r
+ 0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU,\r
+ 0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U,\r
+ 0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU,\r
+ 0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU,\r
+ 0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU,\r
+ 0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U,\r
+ 0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U,\r
+ 0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U,\r
+ 0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U,\r
+ 0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U,\r
+ 0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U,\r
+ 0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU,\r
+ 0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U,\r
+ 0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U,\r
+ 0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU,\r
+ 0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U,\r
+ 0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U,\r
+ 0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U,\r
+ 0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU,\r
+ 0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U,\r
+ 0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U,\r
+ 0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU,\r
+ 0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U,\r
+ 0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U,\r
+ 0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU,\r
+ 0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U,\r
+ 0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU,\r
+ 0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU,\r
+ 0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U,\r
+ 0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U,\r
+ 0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U,\r
+ 0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U,\r
+ 0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU,\r
+ 0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U,\r
+ 0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U,\r
+ 0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU,\r
+ 0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU,\r
+ 0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU,\r
+ 0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U,\r
+ 0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU,\r
+ 0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U,\r
+ 0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U,\r
+ 0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U,\r
+ 0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U,\r
+ 0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU,\r
+ 0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U,\r
+ 0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU,\r
+ 0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU,\r
+ 0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU,\r
+ 0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU,\r
+ 0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U,\r
+ 0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU,\r
+ 0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U,\r
+ 0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU,\r
+ 0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U,\r
+ 0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U,\r
+ 0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU,\r
+};\r
+static const u32 rcon[] = { /* key-schedule round constants, XORed into the first word of each expansion step; only the top byte is non-zero */\r
+ 0x01000000, 0x02000000, 0x04000000, 0x08000000,\r
+ 0x10000000, 0x20000000, 0x40000000, 0x80000000,\r
+ 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */\r
+};\r
+\r
+/**\r
+ * Expand the cipher key into the encryption key schedule.\r
+ *\r
+ * userKey  raw key bytes; words are loaded with GETU32 at offsets 0..12\r
+ *          (128-bit), 0..20 (192-bit) or 0..28 (256-bit), so the buffer\r
+ *          must hold bits/8 bytes (assumes GETU32 reads one 32-bit word;\r
+ *          the macro is defined outside this section - confirm there).\r
+ * bits     key size; only 128, 192 and 256 are accepted.\r
+ * key      receives the schedule in key->rd_key and the round count\r
+ *          (10, 12 or 14) in key->rounds.  This routine writes\r
+ *          4 * (rounds + 1) words, i.e. 44, 52 or 60, so rd_key must be\r
+ *          at least that large (AES_KEY is declared elsewhere).\r
+ *\r
+ * Returns 0 on success, -1 if userKey or key is NULL, -2 if bits is not\r
+ * one of the supported sizes.\r
+ *\r
+ * How the expansion works: rk is a sliding window over rd_key.  Each\r
+ * pass takes the last word produced so far, rotates it left by one byte\r
+ * and substitutes every byte through the S-box, XORs in rcon[i], and\r
+ * chains the result through the words of the previous key-sized group.\r
+ * Te4 holds the S-box byte in all four lanes, so a single lookup plus a\r
+ * lane mask both substitutes the byte and places it in its rotated\r
+ * position.  The 256-bit schedule additionally substitutes (without\r
+ * rotation or round constant) half-way through every pass.\r
+ */\r
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,\r
+ AES_KEY *key) {\r
+\r
+ u32 *rk;\r
+ int i = 0;\r
+ u32 temp;\r
+\r
+ if (!userKey || !key)\r
+ return -1;\r
+ if (bits != 128 && bits != 192 && bits != 256)\r
+ return -2;\r
+\r
+ rk = key->rd_key;\r
+\r
+ if (bits==128)\r
+ key->rounds = 10;\r
+ else if (bits==192)\r
+ key->rounds = 12;\r
+ else\r
+ key->rounds = 14;\r
+\r
+ rk[0] = GETU32(userKey );\r
+ rk[1] = GETU32(userKey + 4);\r
+ rk[2] = GETU32(userKey + 8);\r
+ rk[3] = GETU32(userKey + 12);\r
+ if (bits == 128) {\r
+ while (1) {\r
+ temp = rk[3]; /* last word of the previous round key */\r
+ rk[4] = rk[0] ^\r
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^\r
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^\r
+ (Te4[(temp ) & 0xff] & 0x0000ff00) ^\r
+ (Te4[(temp >> 24) ] & 0x000000ff) ^\r
+ rcon[i];\r
+ rk[5] = rk[1] ^ rk[4];\r
+ rk[6] = rk[2] ^ rk[5];\r
+ rk[7] = rk[3] ^ rk[6];\r
+ if (++i == 10) { /* all 44 schedule words have been written */\r
+ return 0;\r
+ }\r
+ rk += 4; /* slide the window forward by one round key */\r
+ }\r
+ }\r
+ rk[4] = GETU32(userKey + 16);\r
+ rk[5] = GETU32(userKey + 20);\r
+ if (bits == 192) {\r
+ while (1) {\r
+ temp = rk[ 5];\r
+ rk[ 6] = rk[ 0] ^\r
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^\r
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^\r
+ (Te4[(temp ) & 0xff] & 0x0000ff00) ^\r
+ (Te4[(temp >> 24) ] & 0x000000ff) ^\r
+ rcon[i];\r
+ rk[ 7] = rk[ 1] ^ rk[ 6];\r
+ rk[ 8] = rk[ 2] ^ rk[ 7];\r
+ rk[ 9] = rk[ 3] ^ rk[ 8];\r
+ if (++i == 8) { /* final pass emits only four words: 52 in total */\r
+ return 0;\r
+ }\r
+ rk[10] = rk[ 4] ^ rk[ 9];\r
+ rk[11] = rk[ 5] ^ rk[10];\r
+ rk += 6;\r
+ }\r
+ }\r
+ rk[6] = GETU32(userKey + 24);\r
+ rk[7] = GETU32(userKey + 28);\r
+ if (bits == 256) {\r
+ while (1) {\r
+ temp = rk[ 7];\r
+ rk[ 8] = rk[ 0] ^\r
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^\r
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^\r
+ (Te4[(temp ) & 0xff] & 0x0000ff00) ^\r
+ (Te4[(temp >> 24) ] & 0x000000ff) ^\r
+ rcon[i];\r
+ rk[ 9] = rk[ 1] ^ rk[ 8];\r
+ rk[10] = rk[ 2] ^ rk[ 9];\r
+ rk[11] = rk[ 3] ^ rk[10];\r
+ if (++i == 7) { /* final pass emits only four words: 60 in total */\r
+ return 0;\r
+ }\r
+ temp = rk[11]; /* second half: S-box substitution only, no rotation and no rcon */\r
+ rk[12] = rk[ 4] ^\r
+ (Te4[(temp >> 24) ] & 0xff000000) ^\r
+ (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^\r
+ (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^\r
+ (Te4[(temp ) & 0xff] & 0x000000ff);\r
+ rk[13] = rk[ 5] ^ rk[12];\r
+ rk[14] = rk[ 6] ^ rk[13];\r
+ rk[15] = rk[ 7] ^ rk[14];\r
+\r
+ rk += 8;\r
+ }\r
+ }\r
+ return 0; /* not reached: every accepted key size returns from its loop above */\r
+}\r
+\r
+/**\r
+ * Expand the cipher key into the decryption key schedule.\r
+ *\r
+ * Takes the same arguments as AES_set_encrypt_key and starts by calling\r
+ * it; a negative status from that call (-1 for a NULL pointer, -2 for an\r
+ * unsupported key size) is returned unchanged.  Returns 0 on success.\r
+ *\r
+ * The encryption schedule in key->rd_key is then converted in place:\r
+ *   1. The round keys (groups of four words) are reversed, so the key\r
+ *      used last for encryption comes first.\r
+ *   2. Every round key except the first and the last is passed through\r
+ *      the inverse MixColumn transform.  Td0..Td3 have the inverse S-box\r
+ *      folded in, so each byte is first sent through the forward S-box\r
+ *      (low byte of Te4) to cancel it, leaving only the column mix.\r
+ */\r
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,\r
+ AES_KEY *key) {\r
+\r
+ u32 *rk;\r
+ int i, j, status;\r
+ u32 temp;\r
+\r
+ /* first, start with an encryption schedule */\r
+ status = AES_set_encrypt_key(userKey, bits, key);\r
+ if (status < 0)\r
+ return status;\r
+\r
+ rk = key->rd_key;\r
+\r
+ /* invert the order of the round keys: */\r
+ for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) { /* i walks up from the first round key, j down from the last */\r
+ temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;\r
+ temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;\r
+ temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;\r
+ temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;\r
+ }\r
+ /* apply the inverse MixColumn transform to all round keys but the first and the last: */\r
+ for (i = 1; i < (key->rounds); i++) {\r
+ rk += 4; /* advance before transforming, so round key 0 is left untouched */\r
+ rk[0] =\r
+ Td0[Te4[(rk[0] >> 24) ] & 0xff] ^\r
+ Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^\r
+ Td2[Te4[(rk[0] >> 8) & 0xff] & 0xff] ^\r
+ Td3[Te4[(rk[0] ) & 0xff] & 0xff];\r
+ rk[1] =\r
+ Td0[Te4[(rk[1] >> 24) ] & 0xff] ^\r
+ Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^\r
+ Td2[Te4[(rk[1] >> 8) & 0xff] & 0xff] ^\r
+ Td3[Te4[(rk[1] ) & 0xff] & 0xff];\r
+ rk[2] =\r
+ Td0[Te4[(rk[2] >> 24) ] & 0xff] ^\r
+ Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^\r
+ Td2[Te4[(rk[2] >> 8) & 0xff] & 0xff] ^\r
+ Td3[Te4[(rk[2] ) & 0xff] & 0xff];\r
+ rk[3] =\r
+ Td0[Te4[(rk[3] >> 24) ] & 0xff] ^\r
+ Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^\r
+ Td2[Te4[(rk[3] >> 8) & 0xff] & 0xff] ^\r
+ Td3[Te4[(rk[3] ) & 0xff] & 0xff];\r
+ }\r
+ return 0;\r
+}\r
+\r
+#ifndef AES_ASM\r
+/*\r
+ * Encrypt a single block\r
+ * in and out can overlap\r
+ *\r
+ * Loads four 32-bit words from in (offsets 0, 4, 8, 12) with GETU32,\r
+ * XORs them with the first four words of key->rd_key, runs the middle\r
+ * rounds through the Te0..Te3 lookup tables and the last round through\r
+ * Te4, then stores four words to out with PUTU32.\r
+ *\r
+ * The whole input is read into s0..s3 before anything is stored to out,\r
+ * which is what makes overlapping in and out safe.\r
+ *\r
+ * in, out and key must be non-NULL (checked with assert only).\r
+ * The number of rounds is taken from key->rounds.\r
+ */\r
+void AES_encrypt(const unsigned char *in, unsigned char *out,\r
+ const AES_KEY *key) {\r
+\r
+ const u32 *rk;\r
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;\r
+#ifndef FULL_UNROLL\r
+ int r;\r
+#endif /* ?FULL_UNROLL */\r
+\r
+ assert(in && out && key);\r
+ rk = key->rd_key;\r
+\r
+ /*\r
+ * map byte array block to cipher state\r
+ * and add initial round key:\r
+ */\r
+ s0 = GETU32(in ) ^ rk[0];\r
+ s1 = GETU32(in + 4) ^ rk[1];\r
+ s2 = GETU32(in + 8) ^ rk[2];\r
+ s3 = GETU32(in + 12) ^ rk[3];\r
+#ifdef FULL_UNROLL\r
+ /* round 1: */\r
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];\r
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];\r
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];\r
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];\r
+ /* round 2: */\r
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];\r
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];\r
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];\r
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];\r
+ /* round 3: */\r
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];\r
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];\r
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];\r
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];\r
+ /* round 4: */\r
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];\r
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];\r
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];\r
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];\r
+ /* round 5: */\r
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];\r
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];\r
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];\r
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];\r
+ /* round 6: */\r
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];\r
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];\r
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];\r
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];\r
+ /* round 7: */\r
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];\r
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];\r
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];\r
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];\r
+ /* round 8: */\r
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];\r
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];\r
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];\r
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];\r
+ /* round 9: */\r
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];\r
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];\r
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];\r
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];\r
+ if (key->rounds > 10) {\r
+ /* round 10: */\r
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];\r
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];\r
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];\r
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];\r
+ /* round 11: */\r
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];\r
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];\r
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];\r
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];\r
+ if (key->rounds > 12) {\r
+ /* round 12: */\r
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];\r
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];\r
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];\r
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];\r
+ /* round 13: */\r
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];\r
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];\r
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];\r
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];\r
+ }\r
+ }\r
+ rk += key->rounds << 2;\r
+#else /* !FULL_UNROLL */\r
+ /*\r
+ * Nr - 1 full rounds:\r
+ *\r
+ * Each pass of the loop performs two rounds (s -> t, then t -> s)\r
+ * and advances rk by eight words, so r starts at rounds / 2. The\r
+ * loop is left after the first half of the final pass, with the\r
+ * state in t0..t3 and rk pointing at the last round key.\r
+ */\r
+ r = key->rounds >> 1;\r
+ for (;;) {\r
+ t0 =\r
+ Te0[(s0 >> 24) ] ^\r
+ Te1[(s1 >> 16) & 0xff] ^\r
+ Te2[(s2 >> 8) & 0xff] ^\r
+ Te3[(s3 ) & 0xff] ^\r
+ rk[4];\r
+ t1 =\r
+ Te0[(s1 >> 24) ] ^\r
+ Te1[(s2 >> 16) & 0xff] ^\r
+ Te2[(s3 >> 8) & 0xff] ^\r
+ Te3[(s0 ) & 0xff] ^\r
+ rk[5];\r
+ t2 =\r
+ Te0[(s2 >> 24) ] ^\r
+ Te1[(s3 >> 16) & 0xff] ^\r
+ Te2[(s0 >> 8) & 0xff] ^\r
+ Te3[(s1 ) & 0xff] ^\r
+ rk[6];\r
+ t3 =\r
+ Te0[(s3 >> 24) ] ^\r
+ Te1[(s0 >> 16) & 0xff] ^\r
+ Te2[(s1 >> 8) & 0xff] ^\r
+ Te3[(s2 ) & 0xff] ^\r
+ rk[7];\r
+\r
+ rk += 8;\r
+ if (--r == 0) {\r
+ break;\r
+ }\r
+\r
+ s0 =\r
+ Te0[(t0 >> 24) ] ^\r
+ Te1[(t1 >> 16) & 0xff] ^\r
+ Te2[(t2 >> 8) & 0xff] ^\r
+ Te3[(t3 ) & 0xff] ^\r
+ rk[0];\r
+ s1 =\r
+ Te0[(t1 >> 24) ] ^\r
+ Te1[(t2 >> 16) & 0xff] ^\r
+ Te2[(t3 >> 8) & 0xff] ^\r
+ Te3[(t0 ) & 0xff] ^\r
+ rk[1];\r
+ s2 =\r
+ Te0[(t2 >> 24) ] ^\r
+ Te1[(t3 >> 16) & 0xff] ^\r
+ Te2[(t0 >> 8) & 0xff] ^\r
+ Te3[(t1 ) & 0xff] ^\r
+ rk[2];\r
+ s3 =\r
+ Te0[(t3 >> 24) ] ^\r
+ Te1[(t0 >> 16) & 0xff] ^\r
+ Te2[(t1 >> 8) & 0xff] ^\r
+ Te3[(t2 ) & 0xff] ^\r
+ rk[3];\r
+ }\r
+#endif /* ?FULL_UNROLL */\r
+ /*\r
+ * apply last round and\r
+ * map cipher state to byte array block:\r
+ *\r
+ * (both variants above arrive here with the state in t0..t3 and\r
+ * rk pointing at the four words of the final round key)\r
+ */\r
+ s0 =\r
+ (Te4[(t0 >> 24) ] & 0xff000000) ^\r
+ (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^\r
+ (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^\r
+ (Te4[(t3 ) & 0xff] & 0x000000ff) ^\r
+ rk[0];\r
+ PUTU32(out , s0);\r
+ s1 =\r
+ (Te4[(t1 >> 24) ] & 0xff000000) ^\r
+ (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^\r
+ (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^\r
+ (Te4[(t0 ) & 0xff] & 0x000000ff) ^\r
+ rk[1];\r
+ PUTU32(out + 4, s1);\r
+ s2 =\r
+ (Te4[(t2 >> 24) ] & 0xff000000) ^\r
+ (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^\r
+ (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^\r
+ (Te4[(t1 ) & 0xff] & 0x000000ff) ^\r
+ rk[2];\r
+ PUTU32(out + 8, s2);\r
+ s3 =\r
+ (Te4[(t3 >> 24) ] & 0xff000000) ^\r
+ (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^\r
+ (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^\r
+ (Te4[(t2 ) & 0xff] & 0x000000ff) ^\r
+ rk[3];\r
+ PUTU32(out + 12, s3);\r
+}\r
+\r
+/*\r
+ * Decrypt a single block\r
+ * in and out can overlap\r
+ *\r
+ * Mirror image of AES_encrypt: loads four 32-bit words from in with\r
+ * GETU32, XORs them with the first four words of key->rd_key, runs the\r
+ * middle rounds through the Td0..Td3 lookup tables and the last round\r
+ * through Td4, then stores four words to out with PUTU32.\r
+ *\r
+ * The whole input is read into s0..s3 before anything is stored to out,\r
+ * which is what makes overlapping in and out safe.\r
+ *\r
+ * in, out and key must be non-NULL (checked with assert only).\r
+ * key is presumably expected to hold a schedule prepared by\r
+ * AES_set_decrypt_key() -- nothing in this function verifies that.\r
+ */\r
+void AES_decrypt(const unsigned char *in, unsigned char *out,\r
+ const AES_KEY *key) {\r
+\r
+ const u32 *rk;\r
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;\r
+#ifndef FULL_UNROLL\r
+ int r;\r
+#endif /* ?FULL_UNROLL */\r
+\r
+ assert(in && out && key);\r
+ rk = key->rd_key;\r
+\r
+ /*\r
+ * map byte array block to cipher state\r
+ * and add initial round key:\r
+ */\r
+ s0 = GETU32(in ) ^ rk[0];\r
+ s1 = GETU32(in + 4) ^ rk[1];\r
+ s2 = GETU32(in + 8) ^ rk[2];\r
+ s3 = GETU32(in + 12) ^ rk[3];\r
+#ifdef FULL_UNROLL\r
+ /* round 1: */\r
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];\r
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];\r
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];\r
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];\r
+ /* round 2: */\r
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];\r
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];\r
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];\r
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];\r
+ /* round 3: */\r
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];\r
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];\r
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];\r
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];\r
+ /* round 4: */\r
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];\r
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];\r
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];\r
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];\r
+ /* round 5: */\r
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];\r
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];\r
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];\r
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];\r
+ /* round 6: */\r
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];\r
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];\r
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];\r
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];\r
+ /* round 7: */\r
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];\r
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];\r
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];\r
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];\r
+ /* round 8: */\r
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];\r
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];\r
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];\r
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];\r
+ /* round 9: */\r
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];\r
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];\r
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];\r
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];\r
+ if (key->rounds > 10) {\r
+ /* round 10: */\r
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];\r
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];\r
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];\r
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];\r
+ /* round 11: */\r
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];\r
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];\r
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];\r
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];\r
+ if (key->rounds > 12) {\r
+ /* round 12: */\r
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];\r
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];\r
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];\r
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];\r
+ /* round 13: */\r
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];\r
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];\r
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];\r
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];\r
+ }\r
+ }\r
+ rk += key->rounds << 2;\r
+#else /* !FULL_UNROLL */\r
+ /*\r
+ * Nr - 1 full rounds:\r
+ *\r
+ * Each pass of the loop performs two rounds (s -> t, then t -> s)\r
+ * and advances rk by eight words, so r starts at rounds / 2. The\r
+ * loop is left after the first half of the final pass, with the\r
+ * state in t0..t3 and rk pointing at the last round key.\r
+ */\r
+ r = key->rounds >> 1;\r
+ for (;;) {\r
+ t0 =\r
+ Td0[(s0 >> 24) ] ^\r
+ Td1[(s3 >> 16) & 0xff] ^\r
+ Td2[(s2 >> 8) & 0xff] ^\r
+ Td3[(s1 ) & 0xff] ^\r
+ rk[4];\r
+ t1 =\r
+ Td0[(s1 >> 24) ] ^\r
+ Td1[(s0 >> 16) & 0xff] ^\r
+ Td2[(s3 >> 8) & 0xff] ^\r
+ Td3[(s2 ) & 0xff] ^\r
+ rk[5];\r
+ t2 =\r
+ Td0[(s2 >> 24) ] ^\r
+ Td1[(s1 >> 16) & 0xff] ^\r
+ Td2[(s0 >> 8) & 0xff] ^\r
+ Td3[(s3 ) & 0xff] ^\r
+ rk[6];\r
+ t3 =\r
+ Td0[(s3 >> 24) ] ^\r
+ Td1[(s2 >> 16) & 0xff] ^\r
+ Td2[(s1 >> 8) & 0xff] ^\r
+ Td3[(s0 ) & 0xff] ^\r
+ rk[7];\r
+\r
+ rk += 8;\r
+ if (--r == 0) {\r
+ break;\r
+ }\r
+\r
+ s0 =\r
+ Td0[(t0 >> 24) ] ^\r
+ Td1[(t3 >> 16) & 0xff] ^\r
+ Td2[(t2 >> 8) & 0xff] ^\r
+ Td3[(t1 ) & 0xff] ^\r
+ rk[0];\r
+ s1 =\r
+ Td0[(t1 >> 24) ] ^\r
+ Td1[(t0 >> 16) & 0xff] ^\r
+ Td2[(t3 >> 8) & 0xff] ^\r
+ Td3[(t2 ) & 0xff] ^\r
+ rk[1];\r
+ s2 =\r
+ Td0[(t2 >> 24) ] ^\r
+ Td1[(t1 >> 16) & 0xff] ^\r
+ Td2[(t0 >> 8) & 0xff] ^\r
+ Td3[(t3 ) & 0xff] ^\r
+ rk[2];\r
+ s3 =\r
+ Td0[(t3 >> 24) ] ^\r
+ Td1[(t2 >> 16) & 0xff] ^\r
+ Td2[(t1 >> 8) & 0xff] ^\r
+ Td3[(t0 ) & 0xff] ^\r
+ rk[3];\r
+ }\r
+#endif /* ?FULL_UNROLL */\r
+ /*\r
+ * apply last round and\r
+ * map cipher state to byte array block:\r
+ *\r
+ * (both variants above arrive here with the state in t0..t3 and\r
+ * rk pointing at the four words of the final round key)\r
+ */\r
+ s0 =\r
+ (Td4[(t0 >> 24) ] & 0xff000000) ^\r
+ (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^\r
+ (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^\r
+ (Td4[(t1 ) & 0xff] & 0x000000ff) ^\r
+ rk[0];\r
+ PUTU32(out , s0);\r
+ s1 =\r
+ (Td4[(t1 >> 24) ] & 0xff000000) ^\r
+ (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^\r
+ (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^\r
+ (Td4[(t2 ) & 0xff] & 0x000000ff) ^\r
+ rk[1];\r
+ PUTU32(out + 4, s1);\r
+ s2 =\r
+ (Td4[(t2 >> 24) ] & 0xff000000) ^\r
+ (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^\r
+ (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^\r
+ (Td4[(t3 ) & 0xff] & 0x000000ff) ^\r
+ rk[2];\r
+ PUTU32(out + 8, s2);\r
+ s3 =\r
+ (Td4[(t3 >> 24) ] & 0xff000000) ^\r
+ (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^\r
+ (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^\r
+ (Td4[(t0 ) & 0xff] & 0x000000ff) ^\r
+ rk[3];\r
+ PUTU32(out + 12, s3);\r
+}\r
+\r
+#endif /* AES_ASM */\r
+\r
+/*\r
+ * CBC-mode encryption (enc != 0) or decryption (enc == 0) of length\r
+ * bytes from in to out, chaining through ivec.\r
+ *\r
+ * ivec is updated in place: on return it holds the last ciphertext\r
+ * block processed, so consecutive calls continue the same chain.\r
+ *\r
+ * Whole blocks are handled one at a time; in and out may be the same\r
+ * buffer. If length is not a multiple of AES_BLOCK_SIZE:\r
+ *  - encryption fills the tail with the bytes of ivec and writes one\r
+ *    full AES_BLOCK_SIZE block to out;\r
+ *  - decryption reads one full AES_BLOCK_SIZE block from in and writes\r
+ *    only the remaining bytes to out.\r
+ *\r
+ * in, out, key and ivec must be non-NULL (checked with assert only).\r
+ */\r
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,\r
+                     const unsigned long length, const AES_KEY *key,\r
+                     unsigned char *ivec, const int enc)\r
+{\r
+\r
+    unsigned long n;\r
+    unsigned long len = length;\r
+    unsigned char tmp[AES_BLOCK_SIZE];\r
+    unsigned char saved[AES_BLOCK_SIZE];\r
+\r
+    assert(in && out && key && ivec);\r
+\r
+    if (enc) {\r
+        while (len >= AES_BLOCK_SIZE) {\r
+            for(n=0; n < AES_BLOCK_SIZE; ++n)\r
+                tmp[n] = in[n] ^ ivec[n];\r
+            AES_encrypt(tmp, out, key);\r
+            memcpy(ivec, out, AES_BLOCK_SIZE);\r
+            len -= AES_BLOCK_SIZE;\r
+            in += AES_BLOCK_SIZE;\r
+            out += AES_BLOCK_SIZE;\r
+        }\r
+        if (len) {\r
+            for(n=0; n < len; ++n)\r
+                tmp[n] = in[n] ^ ivec[n];\r
+            for(n=len; n < AES_BLOCK_SIZE; ++n)\r
+                tmp[n] = ivec[n];\r
+            AES_encrypt(tmp, tmp, key);\r
+            memcpy(out, tmp, AES_BLOCK_SIZE);\r
+            memcpy(ivec, tmp, AES_BLOCK_SIZE);\r
+        }\r
+    } else {\r
+        while (len >= AES_BLOCK_SIZE) {\r
+            /* keep the ciphertext: out may alias in */\r
+            memcpy(tmp, in, AES_BLOCK_SIZE);\r
+            AES_decrypt(in, out, key);\r
+            for(n=0; n < AES_BLOCK_SIZE; ++n)\r
+                out[n] ^= ivec[n];\r
+            memcpy(ivec, tmp, AES_BLOCK_SIZE);\r
+            len -= AES_BLOCK_SIZE;\r
+            in += AES_BLOCK_SIZE;\r
+            out += AES_BLOCK_SIZE;\r
+        }\r
+        if (len) {\r
+            /*\r
+             * The next IV must be the ciphertext block, not its\r
+             * decryption, so the ciphertext is kept in its own buffer\r
+             * (out may alias in and is about to be overwritten).\r
+             */\r
+            memcpy(saved, in, AES_BLOCK_SIZE);\r
+            AES_decrypt(saved, tmp, key);\r
+            for(n=0; n < len; ++n)\r
+                out[n] = tmp[n] ^ ivec[n];\r
+            memcpy(ivec, saved, AES_BLOCK_SIZE);\r
+        }\r
+    }\r
+}\r
--- /dev/null
+#ifndef QEMU_AES_H
+#define QEMU_AES_H
+
+#include <stdint.h>
+
+#define AES_MAXNR 14
+#define AES_BLOCK_SIZE 16
+
+/*
+ * Expanded AES key schedule.
+ *
+ * rd_key has room for AES_MAXNR + 1 round keys of four 32-bit words
+ * each; rounds is the round count recorded when the key was set up and
+ * tells the cipher routines how much of rd_key is in use.
+ */
+struct aes_key_st {
+ uint32_t rd_key[4 *(AES_MAXNR + 1)];
+ int rounds;
+};
+typedef struct aes_key_st AES_KEY;
+
+/*
+ * Key setup: expand a user key of the given size in bits into key.
+ * Both return 0 on success; AES_set_decrypt_key passes on any negative
+ * status from AES_set_encrypt_key.
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+
+/*
+ * Single-block primitives: process AES_BLOCK_SIZE bytes from in to out
+ * (in and out can overlap).
+ */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+/*
+ * CBC mode over length bytes; enc selects encryption (non-zero) or
+ * decryption (zero). ivec is read as the initial chaining value and is
+ * overwritten so that a later call can continue the chain.
+ */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ const unsigned long length, const AES_KEY *key,
+ unsigned char *ivec, const int enc);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved.
+ * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include "atomicio.h"
+
+/*
+ * ensure all of data on socket comes through. f==read || f==vwrite
+ *
+ * Calls f(fd, buf, remaining) repeatedly until n bytes have been
+ * transferred to or from _s.
+ *
+ * Returns n on success. If f reports end-of-file (returns 0), errno is
+ * set to EPIPE and the number of bytes transferred so far is returned.
+ * If f fails with any error other than EINTR or EAGAIN, 0 is returned
+ * and errno is left as f set it; EINTR and EAGAIN are retried
+ * immediately (note that this spins on a non-blocking descriptor).
+ */
+size_t
+atomicio(ssize_t (*f) (int, void *, size_t), int fd, void *_s, size_t n)
+{
+    char *s = _s;
+    size_t pos = 0;
+    ssize_t res;
+
+    while (n > pos) {
+        res = (f) (fd, s + pos, n - pos);
+        switch (res) {
+        case -1:
+            if (errno == EINTR || errno == EAGAIN)
+                continue;
+            return 0;
+        case 0:
+            errno = EPIPE;
+            return pos;
+        default:
+            pos += (size_t)res;
+        }
+    }
+    return (pos);
+}
+
+
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <inttypes.h>
+
+/*
+ * Platform-specific block-device queries on an open descriptor.
+ *
+ * blk_getimagesize stores the device size in *size and returns 0, or
+ * returns -EINVAL (with *size set to 0) when the device ioctl fails.
+ * NOTE(review): the unit of *size is whatever the platform ioctl
+ * reports; callers treat it as a sector count -- confirm per platform.
+ *
+ * blk_getsectorsize stores the sector size in bytes in *sector_size,
+ * falling back to a platform default when the query fails; it always
+ * returns 0.
+ */
+int blk_getimagesize(int fd, uint64_t *size);
+int blk_getsectorsize(int fd, uint64_t *sector_size);
+
+/* Allow O_LARGEFILE to be OR-ed into open() flags where it is not defined. */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
--- /dev/null
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include "tapdisk.h"
+#include "blk.h"
+
+/*
+ * Linux: query the size of the block device open on fd.
+ *
+ * On success stores the value reported by BLKGETSIZE in *size and
+ * returns 0. On failure *size is 0 and -EINVAL is returned.
+ */
+int blk_getimagesize(int fd, uint64_t *size)
+{
+    /*
+     * BLKGETSIZE writes an unsigned long. Handing it the uint64_t
+     * directly only works where unsigned long is 64 bits wide or the
+     * machine is little-endian, so go through a correctly typed local.
+     */
+    unsigned long nr_sectors = 0;
+
+    *size = 0;
+    if (ioctl(fd, BLKGETSIZE, &nr_sectors)) {
+        DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+        return -EINVAL;
+    }
+
+    *size = (uint64_t)nr_sectors;
+    return 0;
+}
+
+/*
+ * Linux: query the sector size, in bytes, of the block device open on
+ * fd and store it in *sector_size.
+ *
+ * Falls back to DEFAULT_SECTOR_SIZE when BLKSSZGET fails or is not
+ * available at build time. Always returns 0.
+ */
+int blk_getsectorsize(int fd, uint64_t *sector_size)
+{
+#if defined(BLKSSZGET)
+    /*
+     * BLKSSZGET writes an int. Handing it the uint64_t directly leaves
+     * the wrong half filled in on big-endian machines, so go through a
+     * correctly typed local.
+     */
+    int ssz = DEFAULT_SECTOR_SIZE;
+
+    if (ioctl(fd, BLKSSZGET, &ssz)) {
+        DPRINTF("ERR: BLKSSZGET failed. Falling back to use default sector size");
+        ssz = DEFAULT_SECTOR_SIZE;
+    }
+    *sector_size = (uint64_t)ssz;
+
+    if (*sector_size != DEFAULT_SECTOR_SIZE)
+        DPRINTF("Note: sector size is %"PRIu64" (not %u)\n",
+                *sector_size, DEFAULT_SECTOR_SIZE);
+#else
+    *sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+    return 0;
+}
+
+
--- /dev/null
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/disklabel.h>
+#include <errno.h>
+#include <inttypes.h>
+#include "tapdisk.h"
+#include "blk.h"
+
+/*
+ * NetBSD: query the size, in sectors, of the disk open on fd.
+ *
+ * On success stores the sector count from the disklabel in *size and
+ * returns 0. On failure *size is 0 and -EINVAL is returned.
+ */
+int blk_getimagesize(int fd, uint64_t *size)
+{
+    int rc;
+    struct disklabel dl;
+
+    *size = 0;
+    rc = ioctl(fd, DIOCGDINFO, &dl);
+    if (rc) {
+        DPRINTF("ERR: DIOCGDINFO failed, couldn't stat image");
+        return -EINVAL;
+    }
+
+    /*
+     * Report the number of data sectors on the unit. The previous
+     * d_secsize * d_secpercyl was the byte size of one cylinder (and
+     * was computed in 32 bits), not the size of the disk.
+     *
+     * NOTE(review): the label describes the whole unit; if fd refers
+     * to a single partition its own size should be looked up instead
+     * -- confirm against how callers open NetBSD devices.
+     */
+    *size = (uint64_t)dl.d_secperunit;
+
+    return 0;
+}
+
+/*
+ * NetBSD: store the sector size, in bytes, of the disk open on fd in
+ * *sector_size. DEV_BSIZE is used when the disklabel cannot be read.
+ * Always returns 0.
+ */
+int blk_getsectorsize(int fd, uint64_t *sector_size)
+{
+    struct disklabel label;
+    int failed;
+
+    /* start from the fallback; overwritten below on success */
+    *sector_size = DEV_BSIZE;
+
+    failed = ioctl(fd, DIOCGDINFO, &label);
+    if (!failed)
+        *sector_size = label.d_secsize;
+    else
+        DPRINTF("ERR: DIOCGDINFO failed, couldn't stat image");
+
+    return 0;
+}
+
+
--- /dev/null
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+#include "blk.h"
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+#define MAX_AIO_REQS TAPDISK_DATA_REQUESTS
+
+struct tdaio_state;
+
+/*
+ * One in-flight asynchronous I/O: the tapdisk request being served, the
+ * tiocb handed to the I/O queue for it, and a back-pointer to the
+ * driver state whose pool this entry belongs to.
+ */
+struct aio_request {
+ td_request_t treq;
+ struct tiocb tiocb;
+ struct tdaio_state *state;
+};
+
+/*
+ * Per-image private state of the aio driver.
+ *
+ * aio_requests is a fixed pool of MAX_AIO_REQS entries. aio_free_list
+ * is used as a stack of pointers to the unused entries, with
+ * aio_free_count giving the number of valid pointers on it.
+ */
+struct tdaio_state {
+ int fd;
+ td_driver_t *driver;
+
+ int aio_free_count;
+ struct aio_request aio_requests[MAX_AIO_REQS];
+ struct aio_request *aio_free_list[MAX_AIO_REQS];
+};
+
+/*
+ * Fill in info->size, info->sector_size and info->info for the image
+ * open on fd.
+ *
+ * Block devices are queried through blk_getimagesize() and
+ * blk_getsectorsize(); anything else is sized from st_size with
+ * DEFAULT_SECTOR_SIZE sectors.
+ *
+ * Returns 0 on success, -EINVAL if fstat() or the block-device size
+ * query fails.
+ */
+static int tdaio_get_image_info(int fd, td_disk_info_t *info)
+{
+    struct stat st;
+
+    if (fstat(fd, &st) != 0) {
+        DPRINTF("ERROR: fstat failed, Couldn't stat image");
+        return -EINVAL;
+    }
+
+    if (S_ISBLK(st.st_mode)) {
+        /*Accessing block device directly*/
+        info->size = 0;
+        if (blk_getimagesize(fd, &info->size) != 0)
+            return -EINVAL;
+
+        DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                "sector_shift [%llu]\n",
+                (long long unsigned)(info->size << SECTOR_SHIFT),
+                (long long unsigned)info->size);
+
+        /*Get the sector size*/
+        if (blk_getsectorsize(fd, &info->sector_size) != 0)
+            info->sector_size = DEFAULT_SECTOR_SIZE;
+
+    } else {
+        /*Local file? try fstat instead*/
+        info->size = (st.st_size >> SECTOR_SHIFT);
+        info->sector_size = DEFAULT_SECTOR_SIZE;
+        DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                "sector_shift [%llu]\n",
+                (long long unsigned)(info->size << SECTOR_SHIFT),
+                (long long unsigned)info->size);
+    }
+
+    if (info->size == 0) {
+        /*
+         * NOTE(review): hard-coded fallback size for images that
+         * report zero sectors; where this value comes from is not
+         * evident here.
+         */
+        info->size =((uint64_t) 16836057);
+        info->sector_size = DEFAULT_SECTOR_SIZE;
+    }
+    info->info = 0;
+
+    return 0;
+}
+
+/*
+ * Open the disk file and initialize aio state.
+ *
+ * The driver's private state is zeroed and its request pool marked
+ * entirely free. The image is opened with O_DIRECT; if that is refused
+ * with EINVAL the open is retried without it. flags selects read-only
+ * (TD_OPEN_RDONLY) or read-write access.
+ *
+ * Returns 0 on success, -errno if the image cannot be opened, or the
+ * error from tdaio_get_image_info() (the descriptor is closed again in
+ * that case).
+ */
+int tdaio_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+    int i, fd, ret, err, o_flags;
+    struct tdaio_state *prv;
+
+    prv = (struct tdaio_state *)driver->data;
+
+    DPRINTF("block-aio open('%s')", name);
+
+    memset(prv, 0, sizeof(struct tdaio_state));
+
+    prv->aio_free_count = MAX_AIO_REQS;
+    for (i = 0; i < MAX_AIO_REQS; i++)
+        prv->aio_free_list[i] = &prv->aio_requests[i];
+
+    /* Open the file */
+    o_flags = O_DIRECT | O_LARGEFILE |
+        ((flags & TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+    fd = open(name, o_flags);
+
+    if ((fd == -1) && (errno == EINVAL)) {
+        /* Maybe O_DIRECT isn't supported. */
+        o_flags &= ~O_DIRECT;
+        fd = open(name, o_flags);
+        if (fd != -1)
+            DPRINTF("WARNING: Accessing image without "
+                    "O_DIRECT! (%s)\n", name);
+    } else if (fd != -1) {
+        DPRINTF("open(%s) with O_DIRECT\n", name);
+    }
+
+    if (fd == -1) {
+        /* capture errno before logging has a chance to change it */
+        err = errno;
+        DPRINTF("Unable to open [%s] (%d)!\n", name, 0 - err);
+        return 0 - err;
+    }
+
+    ret = tdaio_get_image_info(fd, &driver->info);
+    if (ret) {
+        close(fd);
+        return ret;
+    }
+
+    prv->fd = fd;
+    return 0;
+}
+
+/*
+ * Completion callback for a queued tiocb. arg is the aio_request that
+ * was submitted: the tapdisk request is completed with err first, and
+ * only then is the aio_request pushed back onto its owner's free list.
+ */
+void tdaio_complete(void *arg, struct tiocb *tiocb, int err)
+{
+    struct aio_request *req = (struct aio_request *)arg;
+    struct tdaio_state *owner = req->state;
+    int slot;
+
+    td_complete_request(req->treq, err);
+
+    slot = owner->aio_free_count;
+    owner->aio_free_list[slot] = req;
+    owner->aio_free_count = slot + 1;
+}
+
+/*
+ * Queue an asynchronous read for treq. The byte length and byte offset
+ * are derived from treq.secs / treq.sec and the image's sector size.
+ * If no aio_request is free the request is completed at once with
+ * -EBUSY.
+ */
+void tdaio_queue_read(td_driver_t *driver, td_request_t treq)
+{
+    struct tdaio_state *state = (struct tdaio_state *)driver->data;
+    struct aio_request *req;
+    int nbytes = treq.secs * driver->info.sector_size;
+    uint64_t start = treq.sec * (uint64_t)driver->info.sector_size;
+
+    if (state->aio_free_count == 0) {
+        td_complete_request(treq, -EBUSY);
+        return;
+    }
+
+    /* pop a request off the free stack */
+    state->aio_free_count--;
+    req = state->aio_free_list[state->aio_free_count];
+    req->treq = treq;
+    req->state = state;
+
+    td_prep_read(&req->tiocb, state->fd, treq.buf,
+                 nbytes, start, tdaio_complete, req);
+    td_queue_tiocb(driver, &req->tiocb);
+}
+
+/*
+ * Queue an asynchronous write for treq. The byte length and byte offset
+ * are derived from treq.secs / treq.sec and the image's sector size.
+ * If no aio_request is free the request is completed at once with
+ * -EBUSY.
+ */
+void tdaio_queue_write(td_driver_t *driver, td_request_t treq)
+{
+    struct tdaio_state *state = (struct tdaio_state *)driver->data;
+    struct aio_request *req;
+    int nbytes = treq.secs * driver->info.sector_size;
+    uint64_t start = treq.sec * (uint64_t)driver->info.sector_size;
+
+    if (state->aio_free_count == 0) {
+        td_complete_request(treq, -EBUSY);
+        return;
+    }
+
+    /* pop a request off the free stack */
+    state->aio_free_count--;
+    req = state->aio_free_list[state->aio_free_count];
+    req->treq = treq;
+    req->state = state;
+
+    td_prep_write(&req->tiocb, state->fd, treq.buf,
+                  nbytes, start, tdaio_complete, req);
+    td_queue_tiocb(driver, &req->tiocb);
+}
+
+/*
+ * Close the image descriptor held in the driver's private state.
+ * Always returns 0; the result of close() is not examined.
+ */
+int tdaio_close(td_driver_t *driver)
+{
+    struct tdaio_state *state;
+
+    state = (struct tdaio_state *)driver->data;
+    close(state->fd);
+
+    return 0;
+}
+
+/*
+ * Parent lookup for the aio driver: unconditionally reports
+ * TD_NO_PARENT and leaves id untouched.
+ */
+int tdaio_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+    (void)driver;
+    (void)id;
+
+    return TD_NO_PARENT;
+}
+
+/*
+ * Parent validation for the aio driver: every candidate parent is
+ * rejected with -EINVAL.
+ */
+int tdaio_validate_parent(td_driver_t *driver,
+                          td_driver_t *pdriver, td_flag_t flags)
+{
+    (void)driver;
+    (void)pdriver;
+    (void)flags;
+
+    return -EINVAL;
+}
+
+/*
+ * Driver descriptor for the aio backend: wires the tdaio_* entry points
+ * into the generic tap_disk interface.  private_data_size tells the
+ * framework how large struct tdaio_state is; no debug hook is provided.
+ */
+struct tap_disk tapdisk_aio = {
+ .disk_type = "tapdisk_aio",
+ .flags = 0,
+ .private_data_size = sizeof(struct tdaio_state),
+ .td_open = tdaio_open,
+ .td_close = tdaio_close,
+ .td_queue_read = tdaio_queue_read,
+ .td_queue_write = tdaio_queue_write,
+ .td_get_parent_id = tdaio_get_parent_id,
+ .td_validate_parent = tdaio_validate_parent,
+ .td_debug = NULL,
+};
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#include "tapdisk.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+
+/* Per-sector tracing; expands to a no-op unless DEBUG is defined. */
+#ifdef DEBUG
+#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
+#else
+#define DBG(_f, _a...) ((void)0)
+#endif
+
+#define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a)
+
+#define RADIX_TREE_PAGE_SHIFT 12 /* 4K pages */
+#define RADIX_TREE_PAGE_SIZE (1 << RADIX_TREE_PAGE_SHIFT)
+
+/*
+ * The node shift doubles as the sector shift: each tree node has
+ * RADIX_TREE_NODE_SIZE links, and each cached sector is
+ * RADIX_TREE_NODE_SIZE bytes.
+ */
+#define RADIX_TREE_NODE_SHIFT 9 /* 512B nodes */
+#define RADIX_TREE_NODE_SIZE (1 << RADIX_TREE_NODE_SHIFT)
+#define RADIX_TREE_NODE_MASK (RADIX_TREE_NODE_SIZE - 1)
+
+/* number of 512-byte sectors that fit in one 4K page */
+#define BLOCK_CACHE_NODES_PER_PAGE (1 << (RADIX_TREE_PAGE_SHIFT - RADIX_TREE_NODE_SHIFT))
+
+/*
+ * NOTE(review): this comment used to read "100MB cache", but the
+ * expression evaluates to 10 MiB -- confirm which value was intended.
+ */
+#define BLOCK_CACHE_MAX_SIZE (10 << 20) /* 10MB cache */
+#define BLOCK_CACHE_REQUESTS (TAPDISK_DATA_REQUESTS << 3)
+/* idle threshold in seconds (compared against tv_sec differences) */
+#define BLOCK_CACHE_PAGE_IDLETIME 60
+
+/* shorthand names for the radix tree structures defined below */
+typedef struct radix_tree radix_tree_t;
+typedef struct radix_tree_node radix_tree_node_t;
+typedef struct radix_tree_link radix_tree_link_t;
+typedef struct radix_tree_leaf radix_tree_leaf_t;
+typedef struct radix_tree_page radix_tree_page_t;
+
+/* shorthand names for the block cache structures defined below */
+typedef struct block_cache block_cache_t;
+typedef struct block_cache_request block_cache_request_t;
+typedef struct block_cache_stats block_cache_stats_t;
+
+/*
+ * One cached buffer shared by several consecutive sectors.
+ *   buf:    data buffer; freed together with the page
+ *   size:   buffer length in bytes
+ *   sec:    first sector held in buf
+ *   owners: leaf links currently pointing into this page; all of them
+ *           are cleared when the page is removed
+ */
+struct radix_tree_page {
+ char *buf;
+ size_t size;
+ uint64_t sec;
+ radix_tree_link_t *owners[BLOCK_CACHE_NODES_PER_PAGE];
+};
+
+/*
+ * Leaf payload: the owning page and the address of this sector's data
+ * inside that page's buffer.
+ */
+struct radix_tree_leaf {
+ radix_tree_page_t *page;
+ char *buf;
+};
+
+/*
+ * One slot of a node.  'time' is the tv_sec of the last lookup/insert
+ * that walked through the slot.  In height-0 nodes the union holds a
+ * leaf, in all other nodes a pointer to the child node.
+ */
+struct radix_tree_link {
+ uint32_t time;
+ union {
+ radix_tree_node_t *next;
+ radix_tree_leaf_t leaf;
+ } u;
+};
+
+/* A tree node; height 0 means the links are leaves. */
+struct radix_tree_node {
+ int height;
+ radix_tree_link_t links[RADIX_TREE_NODE_SIZE];
+};
+
+/*
+ * The tree itself.
+ *   height: height of the root node
+ *   size:   total bytes of page buffers currently accounted to the tree
+ *   nodes:  number of allocated nodes
+ *   cache:  back pointer used for the cache name and statistics
+ */
+struct radix_tree {
+ int height;
+ uint64_t size;
+ uint32_t nodes;
+ radix_tree_node_t *root;
+
+ block_cache_t *cache;
+};
+
+/*
+ * Bookkeeping for one read that missed the cache.
+ *   err:   first error reported by a completion, 0 if none
+ *   buf:   freshly allocated buffer the forwarded read fills
+ *   secs:  sectors still outstanding; the request finishes at 0
+ *   treq:  the caller's original request
+ */
+struct block_cache_request {
+ int err;
+ char *buf;
+ uint64_t secs;
+ td_request_t treq;
+ block_cache_t *cache;
+};
+
+/* Counters, in sectors, reported by block_cache_debug(). */
+struct block_cache_stats {
+ uint64_t reads;
+ uint64_t hits;
+ uint64_t misses;
+ uint64_t prunes;
+};
+
+/*
+ * Per-driver cache state.
+ *   name:              copy of the image name, freed on close
+ *   sectors:           image size taken from driver->info.size
+ *   requests:          fixed pool of miss descriptors
+ *   request_free_list: stack of unused descriptors, requests_free deep
+ *   timeout_id:        id of the periodic prune event
+ * NOTE(review): ptype is not referenced in the code visible here.
+ */
+struct block_cache {
+ int ptype;
+ char *name;
+
+ uint64_t sectors;
+
+ block_cache_request_t requests[BLOCK_CACHE_REQUESTS];
+ block_cache_request_t *request_free_list[BLOCK_CACHE_REQUESTS];
+ int requests_free;
+
+ event_id_t timeout_id;
+
+ radix_tree_t tree;
+
+ block_cache_stats_t stats;
+};
+
+/*
+ * Number of sectors addressable by a tree whose root sits at @height:
+ * RADIX_TREE_NODE_SIZE shifted left by RADIX_TREE_NODE_SHIFT per level.
+ */
+static inline uint64_t
+radix_tree_calculate_size(int height)
+{
+	uint64_t span = RADIX_TREE_NODE_SIZE;
+
+	return span << (height * RADIX_TREE_NODE_SHIFT);
+}
+
+/*
+ * Smallest root height (starting at 1, since a root node is always
+ * allocated) whose capacity is at least @sectors.
+ */
+static inline int
+radix_tree_calculate_height(uint64_t sectors)
+{
+	int height = 1;
+
+	while (sectors > radix_tree_calculate_size(height))
+		height++;
+
+	return height;
+}
+
+/*
+ * Slot within @node that @sector maps to: the RADIX_TREE_NODE_SHIFT
+ * bits of the sector number selected by the node's height.
+ */
+static inline int
+radix_tree_index(radix_tree_node_t *node, uint64_t sector)
+{
+	uint64_t shifted;
+
+	shifted = sector >> (node->height * RADIX_TREE_NODE_SHIFT);
+
+	return (shifted & RADIX_TREE_NODE_MASK);
+}
+
+/* Non-zero when @node is at height 0, i.e. its links hold leaves. */
+static inline int
+radix_tree_node_contains_leaves(radix_tree_t *tree, radix_tree_node_t *node)
+{
+	(void)tree;
+
+	return !node->height;
+}
+
+/* Non-zero when @node has the same height as the tree's root. */
+static inline int
+radix_tree_node_is_root(radix_tree_t *tree, radix_tree_node_t *node)
+{
+	return (tree->height == node->height);
+}
+
+/*
+ * Memory accounted to the tree: bytes of cached page buffers plus the
+ * storage taken by the allocated nodes.
+ */
+static inline uint64_t
+radix_tree_size(radix_tree_t *tree)
+{
+	uint64_t total = tree->size;
+
+	total += tree->nodes * sizeof(radix_tree_node_t);
+
+	return total;
+}
+
+/* Zero a link (timestamp and payload); a NULL @link is ignored. */
+static inline void
+radix_tree_clear_link(radix_tree_link_t *link)
+{
+	if (link == NULL)
+		return;
+
+	memset(link, 0, sizeof(*link));
+}
+
+/*
+ * Allocate a zeroed node at @height and count it in tree->nodes.
+ * Returns NULL when the allocation fails.
+ */
+static inline radix_tree_node_t *
+radix_tree_allocate_node(radix_tree_t *tree, int height)
+{
+	radix_tree_node_t *fresh = calloc(1, sizeof(*fresh));
+
+	if (fresh) {
+		fresh->height = height;
+		tree->nodes++;
+	}
+
+	return fresh;
+}
+
+/* Allocate a node one level below @parent; NULL on allocation failure. */
+static inline radix_tree_node_t *
+radix_tree_allocate_child_node(radix_tree_t *tree, radix_tree_node_t *parent)
+{
+	int child_height = parent->height - 1;
+
+	return radix_tree_allocate_node(tree, child_height);
+}
+
+/*
+ * Release @node and drop it from the tree's node count.  A NULL @node
+ * is ignored.  The node's links are not inspected here.
+ */
+void
+radix_tree_free_node(radix_tree_t *tree, radix_tree_node_t *node)
+{
+	if (node) {
+		free(node);
+		tree->nodes--;
+	}
+}
+
+/*
+ * Create the page descriptor for @buf, which holds @size bytes starting
+ * at sector @sec, and add @size to the tree's byte count.
+ * Returns NULL (tree untouched) when the allocation fails.
+ */
+static inline radix_tree_page_t *
+radix_tree_allocate_page(radix_tree_t *tree,
+			 char *buf, uint64_t sec, size_t size)
+{
+	radix_tree_page_t *fresh = calloc(1, sizeof(*fresh));
+
+	if (fresh) {
+		fresh->buf = buf;
+		fresh->sec = sec;
+		fresh->size = size;
+		tree->size += size;
+	}
+
+	return fresh;
+}
+
+/*
+ * Free @page and its buffer, subtracting its bytes from the tree size
+ * and adding its sector count to the cache's prune statistic.
+ * The links that pointed at the page are not touched here.
+ */
+static inline void
+radix_tree_free_page(radix_tree_t *tree, radix_tree_page_t *page)
+{
+	int i;
+
+	/* one trace line per sector being dropped (debug builds only) */
+	i = 0;
+	while (i < page->size >> RADIX_TREE_NODE_SHIFT) {
+		DBG("%s: ejecting sector 0x%llx\n",
+		    tree->cache->name, page->sec + i);
+		i++;
+	}
+
+	tree->cache->stats.prunes += (page->size >> RADIX_TREE_NODE_SHIFT);
+	tree->size -= page->size;
+
+	free(page->buf);
+	free(page);
+}
+
+/*
+ * Drop a cached page: every leaf link recorded in page->owners is
+ * zeroed, then the page and its buffer are freed.  The nodes holding
+ * those links stay allocated; pruning reclaims them later.
+ * A NULL @page is ignored.
+ */
+static void
+radix_tree_remove_page(radix_tree_t *tree, radix_tree_page_t *page)
+{
+	radix_tree_link_t **owner;
+
+	if (page == NULL)
+		return;
+
+	for (owner = page->owners;
+	     owner < page->owners + BLOCK_CACHE_NODES_PER_PAGE; owner++)
+		radix_tree_clear_link(*owner);
+
+	radix_tree_free_page(tree, page);
+}
+
+/*
+ * Point @link at the sector stored @off bytes into @page and record
+ * the link in the first unused page->owners slot.
+ *
+ * Nothing happens when the sector would extend past the page or when
+ * every owner slot is already taken.
+ */
+static void
+radix_tree_insert_leaf(radix_tree_t *tree, radix_tree_link_t *link,
+		       radix_tree_page_t *page, off_t off)
+{
+	int slot;
+
+	if (off + RADIX_TREE_NODE_SIZE > page->size)
+		return;
+
+	/* locate the first free owner slot */
+	for (slot = 0; slot < BLOCK_CACHE_NODES_PER_PAGE; slot++)
+		if (!page->owners[slot])
+			break;
+
+	if (slot == BLOCK_CACHE_NODES_PER_PAGE)
+		return;
+
+	page->owners[slot] = link;
+	link->u.leaf.page = page;
+	link->u.leaf.buf = page->buf + off;
+}
+
+/*
+ * Look up the cached data for @sector.
+ *
+ * Walks from the root towards the leaves, stamping every link visited
+ * with the current time (this also happens on a miss).  Returns the
+ * leaf's buffer, or NULL if the sector is not cached.
+ */
+static char *
+radix_tree_find_leaf(radix_tree_t *tree, uint64_t sector)
+{
+	struct timeval stamp;
+	radix_tree_node_t *cur;
+	radix_tree_link_t *slot;
+
+	cur = tree->root;
+	gettimeofday(&stamp, NULL);
+
+	for (;;) {
+		slot = &cur->links[radix_tree_index(cur, sector)];
+		slot->time = stamp.tv_sec;
+
+		if (radix_tree_node_contains_leaves(tree, cur))
+			return slot->u.leaf.buf;
+
+		cur = slot->u.next;
+		if (!cur)
+			return NULL;
+	}
+}
+
+/*
+ * Insert the sector found @off bytes into @page as the leaf for
+ * @sector, allocating interior nodes on the way down as needed.
+ *
+ * Any page already occupying the leaf is removed first.  Every link on
+ * the path is stamped with the current time.  Returns the leaf buffer,
+ * or NULL if a node could not be allocated or the leaf was not set.
+ */
+static char *
+radix_tree_add_leaf(radix_tree_t *tree, uint64_t sector,
+		    radix_tree_page_t *page, off_t off)
+{
+	struct timeval stamp;
+	radix_tree_node_t *cur;
+	radix_tree_link_t *slot;
+
+	cur = tree->root;
+	gettimeofday(&stamp, NULL);
+
+	for (;;) {
+		slot = &cur->links[radix_tree_index(cur, sector)];
+		slot->time = stamp.tv_sec;
+
+		if (radix_tree_node_contains_leaves(tree, cur))
+			break;
+
+		if (!slot->u.next) {
+			slot->u.next =
+				radix_tree_allocate_child_node(tree, cur);
+			if (!slot->u.next)
+				return NULL;
+		}
+
+		cur = slot->u.next;
+	}
+
+	/* evict whatever was cached here, then install the new leaf */
+	radix_tree_remove_page(tree, slot->u.leaf.page);
+	radix_tree_insert_leaf(tree, slot, page, off);
+
+	return slot->u.leaf.buf;
+}
+
+/*
+ * Cache @sectors consecutive sectors, starting at @sector, whose data
+ * lives in @buf.
+ *
+ * Returns 0 on success, after which the tree owns @buf.  Returns
+ * -ENOMEM on failure; @buf is then NOT freed here (the page is
+ * detached from it before being torn down), so the caller still owns
+ * it.
+ */
+static int
+radix_tree_add_leaves(radix_tree_t *tree, char *buf,
+		      uint64_t sector, uint64_t sectors)
+{
+	int i;
+	radix_tree_page_t *page;
+
+	page = radix_tree_allocate_page(tree, buf, sector,
+					sectors << RADIX_TREE_NODE_SHIFT);
+	if (!page)
+		return -ENOMEM;
+
+	for (i = 0; i < sectors; i++) {
+		if (radix_tree_add_leaf(tree, sector + i,
+					page, (i << RADIX_TREE_NODE_SHIFT)))
+			continue;
+
+		/* keep the caller's buffer alive while undoing the page */
+		page->buf = NULL;
+		radix_tree_remove_page(tree, page);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Recursively free @node and everything below it: cached pages for
+ * leaf nodes, child branches otherwise.  A NULL @node is ignored.
+ */
+static void
+radix_tree_delete_branch(radix_tree_t *tree, radix_tree_node_t *node)
+{
+	radix_tree_link_t *slot, *end;
+	int leaves;
+
+	if (!node)
+		return;
+
+	/* the node's height is fixed, so decide once what its links hold */
+	leaves = radix_tree_node_contains_leaves(tree, node);
+	end = node->links + RADIX_TREE_NODE_SIZE;
+
+	for (slot = node->links; slot < end; slot++) {
+		if (leaves)
+			radix_tree_remove_page(tree, slot->u.leaf.page);
+		else
+			radix_tree_delete_branch(tree, slot->u.next);
+
+		radix_tree_clear_link(slot);
+	}
+
+	radix_tree_free_node(tree, node);
+}
+
+/* Free the whole tree, root included, and forget the root pointer. */
+static inline void
+radix_tree_destroy(radix_tree_t *tree)
+{
+	radix_tree_node_t *root = tree->root;
+
+	radix_tree_delete_branch(tree, root);
+	tree->root = NULL;
+}
+
+/*
+ * Prune idle entries below @node.
+ *
+ * A link is idle when @now minus its timestamp is at least
+ * BLOCK_CACHE_PAGE_IDLETIME.  Idle links lose their page (leaf nodes)
+ * or their entire child branch (interior nodes).  Links that are still
+ * fresh keep a leaf node alive as they are; in interior nodes the
+ * child is pruned recursively and unlinked if it ended up empty.
+ *
+ * An empty non-root node is freed before returning.
+ *
+ * returns 1 if @node is empty after pruning, 0 otherwise
+ */
+static int
+radix_tree_prune_branch(radix_tree_t *tree,
+			radix_tree_node_t *node, uint32_t now)
+{
+	int i, empty, leaves;
+	radix_tree_link_t *slot;
+
+	if (!node)
+		return 1;
+
+	empty = 1;
+	leaves = radix_tree_node_contains_leaves(tree, node);
+
+	for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) {
+		slot = &node->links[i];
+
+		if (now - slot->time >= BLOCK_CACHE_PAGE_IDLETIME) {
+			/* idle: throw away whatever hangs off this link */
+			if (leaves)
+				radix_tree_remove_page(tree,
+						       slot->u.leaf.page);
+			else
+				radix_tree_delete_branch(tree, slot->u.next);
+
+			radix_tree_clear_link(slot);
+			continue;
+		}
+
+		if (leaves) {
+			empty = 0;
+			continue;
+		}
+
+		/* fresh interior link: the child decides if it survives */
+		if (!radix_tree_prune_branch(tree, slot->u.next, now)) {
+			empty = 0;
+			continue;
+		}
+
+		radix_tree_clear_link(slot);
+	}
+
+	if (empty && !radix_tree_node_is_root(tree, node))
+		radix_tree_free_node(tree, node);
+
+	return empty;
+}
+
+/*
+ * Walk the tree from the root and release everything that has been
+ * idle for too long, logging the cached byte count before and after.
+ * Does nothing when the tree has no root.
+ */
+static void
+radix_tree_prune(radix_tree_t *tree)
+{
+	struct timeval stamp;
+
+	if (tree->root == NULL)
+		return;
+
+	DPRINTF("tree %s has %"PRIu64" bytes\n",
+		tree->cache->name, tree->size);
+
+	gettimeofday(&stamp, NULL);
+	radix_tree_prune_branch(tree, tree->root, stamp.tv_sec);
+
+	DPRINTF("tree %s now has %"PRIu64" bytes\n",
+		tree->cache->name, tree->size);
+}
+
+/*
+ * Size the tree for @sectors sectors and allocate its root node.
+ * Returns 0 on success, -ENOMEM if the root cannot be allocated.
+ */
+static inline int
+radix_tree_initialize(radix_tree_t *tree, uint64_t sectors)
+{
+	int height = radix_tree_calculate_height(sectors);
+
+	tree->height = height;
+	tree->root = radix_tree_allocate_node(tree, height);
+
+	return tree->root ? 0 : -ENOMEM;
+}
+
+/* Release all nodes and pages of @tree (alias for radix_tree_destroy). */
+static inline void
+radix_tree_free(radix_tree_t *tree)
+{
+	radix_tree_destroy(tree);
+	return;
+}
+
+/*
+ * Timer callback registered by block_cache_open(): @private is the
+ * block_cache_t whose tree gets pruned.  @id and @mode are unused.
+ */
+static void
+block_cache_prune_event(event_id_t id, char mode, void *private)
+{
+	block_cache_t *cache = (block_cache_t *)private;
+
+	radix_tree_prune(&cache->tree);
+}
+
+/*
+ * Pop a miss descriptor off the free stack; NULL when none are left.
+ */
+static inline block_cache_request_t *
+block_cache_get_request(block_cache_t *cache)
+{
+	if (cache->requests_free == 0)
+		return NULL;
+
+	cache->requests_free--;
+	return cache->request_free_list[cache->requests_free];
+}
+
+/* Wipe @breq and push it back onto the cache's free stack. */
+static inline void
+block_cache_put_request(block_cache_t *cache, block_cache_request_t *breq)
+{
+	memset(breq, 0, sizeof(*breq));
+
+	cache->request_free_list[cache->requests_free] = breq;
+	cache->requests_free++;
+}
+
+/*
+ * Open a read-only block cache in front of image @name.
+ *
+ * Requires TD_OPEN_RDONLY in @flags and a sector size equal to
+ * RADIX_TREE_NODE_SIZE; otherwise returns -EINVAL.  Sets up the radix
+ * tree, the pool of miss descriptors and the periodic prune event.
+ *
+ * Returns 0 on success or a negative value on failure.  On failure the
+ * name copy and the tree are released again.
+ */
+static int
+block_cache_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+	int i, err;
+	radix_tree_t *tree;
+	block_cache_t *cache;
+
+	if (!td_flag_test(flags, TD_OPEN_RDONLY))
+		return -EINVAL;
+
+	if (driver->info.sector_size != RADIX_TREE_NODE_SIZE)
+		return -EINVAL;
+
+	cache = (block_cache_t *)driver->data;
+	err = tapdisk_namedup(&cache->name, (char *)name);
+	if (err)
+		return -ENOMEM;
+
+	cache->sectors = driver->info.size;
+
+	tree = &cache->tree;
+	err = radix_tree_initialize(tree, cache->sectors);
+	if (err)
+		goto fail;
+
+	tree->cache = cache;
+	cache->requests_free = BLOCK_CACHE_REQUESTS;
+	for (i = 0; i < BLOCK_CACHE_REQUESTS; i++)
+		cache->request_free_list[i] = cache->requests + i;
+
+	cache->timeout_id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT,
+							  -1, /* dummy fd */
+							  BLOCK_CACHE_PAGE_IDLETIME << 1,
+							  block_cache_prune_event,
+							  cache);
+	if (cache->timeout_id < 0) {
+		/*
+		 * err is still 0 from radix_tree_initialize(); without this
+		 * assignment the function would tear everything down and
+		 * then report success.  NOTE(review): the negative id is
+		 * presumably an errno-style code -- confirm.
+		 */
+		err = cache->timeout_id;
+		goto fail;
+	}
+
+	DPRINTF("opening cache for %s, sectors: %"PRIu64", "
+		"tree: %p, height: %d\n",
+		cache->name, cache->sectors, tree, tree->height);
+
+	/* best effort: failing to pin memory is only logged */
+	if (mlockall(MCL_CURRENT | MCL_FUTURE))
+		DPRINTF("mlockall failed: %d\n", -errno);
+
+	return 0;
+
+fail:
+	free(cache->name);
+	cache->name = NULL;	/* do not leave a dangling pointer behind */
+	radix_tree_free(&cache->tree);
+	return err;
+}
+
+/*
+ * Tear down the cache: cancel the prune event, free the tree with all
+ * cached pages, and release the name copy.  Always returns 0.
+ */
+static int
+block_cache_close(td_driver_t *driver)
+{
+	block_cache_t *cache = (block_cache_t *)driver->data;
+
+	DPRINTF("closing cache for %s\n", cache->name);
+
+	tapdisk_server_unregister_event(cache->timeout_id);
+	radix_tree_free(&cache->tree);
+	free(cache->name);
+
+	return 0;
+}
+
+static inline uint64_t
+block_cache_hash(block_cache_t *cache, char *buf)
+{
+ int i, n;
+ uint64_t cksm, *data;
+
+ return 0;
+
+ cksm = 0;
+ data = (uint64_t *)buf;
+ n = RADIX_TREE_NODE_SIZE / sizeof(uint64_t);
+
+ for (i = 0; i < n; i++)
+ cksm += data[i];
+
+ return ~cksm;
+}
+
+/*
+ * Serve @treq entirely from the cache.  @iov holds one cached sector
+ * buffer per requested sector; each is copied to its place in treq.buf
+ * and the request is completed with status 0.
+ */
+static void
+block_cache_hit(block_cache_t *cache, td_request_t treq, char *iov[])
+{
+	int n;
+
+	cache->stats.hits += treq.secs;
+
+	for (n = 0; n < treq.secs; n++) {
+		off_t dst = n << RADIX_TREE_NODE_SHIFT;
+
+		DBG("%s: block cache hit: sec 0x%08llx, hash: 0x%08llx\n",
+		    cache->name, treq.sec + n, block_cache_hash(cache, iov[n]));
+
+		memcpy(treq.buf + dst, iov[n], RADIX_TREE_NODE_SIZE);
+	}
+
+	td_complete_request(treq, 0);
+}
+
+/*
+ * Completion callback for the read forwarded by block_cache_miss().
+ *
+ * May fire several times for one miss; each call accounts for
+ * clone.secs sectors and remembers the first error seen.  Once all
+ * sectors have arrived, the data is copied to the caller's buffer and
+ * the private buffer is handed to the radix tree (or freed if there was
+ * an error or the tree refused it).  Finally the original request is
+ * completed and the descriptor recycled.
+ */
+static void
+block_cache_populate_cache(td_request_t clone, int err)
+{
+	int n;
+	block_cache_t *cache;
+	block_cache_request_t *breq;
+
+	breq = (block_cache_request_t *)clone.cb_data;
+	cache = breq->cache;
+
+	breq->secs -= clone.secs;
+	if (!breq->err)
+		breq->err = err;
+
+	/* more completions still to come */
+	if (breq->secs)
+		return;
+
+	if (breq->err) {
+		free(breq->buf);
+	} else {
+		for (n = 0; n < breq->treq.secs; n++) {
+			off_t off = n << RADIX_TREE_NODE_SHIFT;
+			DBG("%s: populating sec 0x%08llx\n",
+			    cache->name, breq->treq.sec + n);
+			memcpy(breq->treq.buf + off,
+			       breq->buf + off, RADIX_TREE_NODE_SIZE);
+		}
+
+		/* on failure the tree did not take ownership of the buffer */
+		if (radix_tree_add_leaves(&cache->tree, breq->buf,
+					  breq->treq.sec, breq->treq.secs))
+			free(breq->buf);
+	}
+
+	td_complete_request(breq->treq, breq->err);
+	block_cache_put_request(cache, breq);
+}
+
+/*
+ * Handle a read that is not fully cached.
+ *
+ * If the cache has room, a descriptor is free and a buffer can be
+ * allocated, the read is redirected into that private buffer with
+ * block_cache_populate_cache() as its callback.  In every other case
+ * the request is forwarded unchanged and bypasses the cache.
+ */
+static void
+block_cache_miss(block_cache_t *cache, td_request_t treq)
+{
+	char *buf;
+	size_t bytes;
+	td_request_t fwd;
+	block_cache_request_t *breq;
+
+	DBG("%s: block cache miss: sec 0x%08llx\n", cache->name, treq.sec);
+
+	fwd = treq;
+	bytes = treq.secs << RADIX_TREE_NODE_SHIFT;
+
+	cache->stats.misses += treq.secs;
+
+	if (radix_tree_size(&cache->tree) + bytes < BLOCK_CACHE_MAX_SIZE) {
+		breq = block_cache_get_request(cache);
+
+		if (breq) {
+			if (posix_memalign((void **)&buf,
+					   RADIX_TREE_NODE_SIZE, bytes)) {
+				/* no buffer: give the descriptor back */
+				block_cache_put_request(cache, breq);
+			} else {
+				breq->treq = treq;
+				breq->secs = treq.secs;
+				breq->err = 0;
+				breq->buf = buf;
+				breq->cache = cache;
+
+				fwd.buf = buf;
+				fwd.cb = block_cache_populate_cache;
+				fwd.cb_data = breq;
+			}
+		}
+	}
+
+	td_forward_request(fwd);
+}
+
+/*
+ * Read entry point.  Requests larger than BLOCK_CACHE_NODES_PER_PAGE
+ * sectors bypass the cache.  Otherwise every sector is looked up: if
+ * all are cached the request is served from memory, and the first
+ * missing sector sends the whole request down the miss path.
+ */
+static void
+block_cache_queue_read(td_driver_t *driver, td_request_t treq)
+{
+	block_cache_t *cache = (block_cache_t *)driver->data;
+	char *found[BLOCK_CACHE_NODES_PER_PAGE];
+	int n;
+
+	cache->stats.reads += treq.secs;
+
+	if (treq.secs > BLOCK_CACHE_NODES_PER_PAGE) {
+		td_forward_request(treq);
+		return;
+	}
+
+	for (n = 0; n < treq.secs; n++) {
+		found[n] = radix_tree_find_leaf(&cache->tree, treq.sec + n);
+		if (!found[n]) {
+			block_cache_miss(cache, treq);
+			return;
+		}
+	}
+
+	block_cache_hit(cache, treq, found);
+}
+
+/* The cache is read-only: every write is completed with -EPERM. */
+static void
+block_cache_queue_write(td_driver_t *driver, td_request_t treq)
+{
+	(void)driver;
+
+	td_complete_request(treq, -EPERM);
+}
+
+/* Parent lookup is not supported here; always returns -EINVAL. */
+static int
+block_cache_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+	(void)driver;
+	(void)id;
+
+	return -EINVAL;
+}
+
+/*
+ * Accept @pdriver as parent only if it is flagged TD_DRIVER_RDONLY and
+ * carries the same name as @driver.  Returns 0 if acceptable, -EINVAL
+ * otherwise.  @flags is not consulted.
+ */
+static int
+block_cache_validate_parent(td_driver_t *driver,
+			    td_driver_t *pdriver, td_flag_t flags)
+{
+	if (!td_flag_test(pdriver->state, TD_DRIVER_RDONLY))
+		return -EINVAL;
+
+	return strcmp(driver->name, pdriver->name) ? -EINVAL : 0;
+}
+
+/* Log the cache's name and its read/hit/miss/prune counters. */
+static void
+block_cache_debug(td_driver_t *driver)
+{
+	block_cache_t *cache = (block_cache_t *)driver->data;
+	block_cache_stats_t *st = &cache->stats;
+
+	WARN("BLOCK CACHE %s\n", cache->name);
+	WARN("reads: %"PRIu64", hits: %"PRIu64", misses: %"PRIu64", prunes: %"PRIu64"\n",
+	     st->reads, st->hits, st->misses, st->prunes);
+}
+
+/*
+ * Driver descriptor for the block cache: wires the block_cache_* entry
+ * points into the generic tap_disk interface.  private_data_size tells
+ * the framework how large block_cache_t is.
+ */
+struct tap_disk tapdisk_block_cache = {
+ .disk_type = "tapdisk_block_cache",
+ .flags = 0,
+ .private_data_size = sizeof(block_cache_t),
+ .td_open = block_cache_open,
+ .td_close = block_cache_close,
+ .td_queue_read = block_cache_queue_read,
+ .td_queue_write = block_cache_queue_write,
+ .td_get_parent_id = block_cache_get_parent_id,
+ .td_validate_parent = block_cache_validate_parent,
+ .td_debug = block_cache_debug,
+};
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver to sit on top of another disk and log writes, in order
+ * to synchronize two distinct disks
+ *
+ * On receipt of a control request it can export a list of dirty
+ * sectors in the following format:
+ * struct writerange {
+ * u64 sector;
+ * u32 count;
+ * }
+ * terminated by { 0, 0 }
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "xc_bitops.h"
+#include "log.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/* only one control session is served at a time (see ctl_accept) */
+#define MAX_CONNECTIONS 1
+
+/* a file descriptor together with the scheduler event watching it */
+typedef struct poll_fd {
+ int fd;
+ event_id_t id;
+} poll_fd_t;
+
+/*
+ * Private state of the log driver.
+ *   size:        disk size taken from driver->info.size; also the
+ *                number of bits in the dirty bitmap
+ *   writelog:    dirty bitmap, one bit per sector
+ *   ctlpath:     filesystem path of the listening control socket
+ *   ctl:         the listening control socket
+ *   connected:   number of entries in use in connections[]
+ *   shmpath/shm: name and mapping of the shared memory region that
+ *                dirty extents are exported into
+ *   sring/bring: request ring placed inside the shared region
+ */
+struct tdlog_state {
+ uint64_t size;
+
+ void* writelog;
+
+ char* ctlpath;
+ poll_fd_t ctl;
+
+ int connected;
+ poll_fd_t connections[MAX_CONNECTIONS];
+
+ char* shmpath;
+ void* shm;
+
+ log_sring_t* sring;
+ log_back_ring_t bring;
+};
+
+/* syslog helpers; both prefix the message with "log: " */
+#define BDPRINTF(_f, _a...) syslog (LOG_DEBUG, "log: " _f "\n", ## _a)
+
+#define BWPRINTF(_f, _a...) syslog (LOG_WARNING, "log: " _f "\n", ## _a)
+
+/* event callbacks, defined further down */
+static void ctl_accept(event_id_t, char, void *);
+static void ctl_request(event_id_t, char, void *);
+
+/* -- write log -- */
+
+/* large flat bitmaps don't scale particularly well either in size or scan
+ * time, but they'll do for now */
+
+/*
+ * Allocate the dirty bitmap with one bit per s->size.
+ * Returns 0 on success, -1 if the allocation fails.
+ */
+static int writelog_create(struct tdlog_state *s)
+{
+	uint64_t bytes = bitmap_size(s->size);
+
+	BDPRINTF("allocating %"PRIu64" bytes for dirty bitmap", bytes);
+
+	s->writelog = bitmap_alloc(s->size);
+	if (s->writelog)
+		return 0;
+
+	BWPRINTF("could not allocate dirty bitmap of size %"PRIu64, bytes);
+	return -1;
+}
+
+/*
+ * Release the dirty bitmap, if any.  The pointer in @s is left as it
+ * was.  Always returns 0.
+ */
+static int writelog_free(struct tdlog_state *s)
+{
+	/* free(NULL) is a no-op, so no guard is needed */
+	free(s->writelog);
+
+	return 0;
+}
+
+/*
+ * Mark @count sectors starting at @sector as dirty.  A non-positive
+ * @count marks nothing.  No range check against s->size is made.
+ * Always returns 0.
+ */
+static int writelog_set(struct tdlog_state* s, uint64_t sector, int count)
+{
+	uint64_t bit = sector;
+	int left = count;
+
+	while (left-- > 0)
+		set_bit(bit++, s->writelog);
+
+	return 0;
+}
+
+/*
+ * Clear the dirty bits of sectors [start, end).  @end is exclusive;
+ * if end is 0, clear to end of disk.  An @end beyond the disk is
+ * clamped to s->size, and an empty range is a no-op.
+ *
+ * Bits up to the first word boundary and back from the last one are
+ * cleared individually; the whole words in between are zeroed with a
+ * single memset.  Always returns 0.
+ */
+int writelog_clear(struct tdlog_state* s, uint64_t start, uint64_t end)
+{
+	if (!end || end > s->size)
+		end = s->size;
+
+	if (start >= end)
+		return 0;
+
+	/* leading partial word; stop early if the range ends inside it */
+	while (start < end && BITMAP_SHIFT(start))
+		clear_bit(start++, s->writelog);
+
+	/*
+	 * trailing partial word: pre-decrement so that bit @end itself is
+	 * left alone and the bit at the word boundary is cleared.
+	 */
+	while (end > start && BITMAP_SHIFT(end))
+		clear_bit(--end, s->writelog);
+
+	/*
+	 * start and end are now word aligned.  start >> 3 is the byte
+	 * offset of the first whole word, assuming the bitmap is an array
+	 * of words where bit n lives in word n / BITS_PER_LONG.
+	 * TODO confirm against xc_bitops.h.
+	 */
+	if (end > start)
+		memset((char *)s->writelog + (start >> 3), 0,
+		       (end - start) >> 3);
+
+	return 0;
+}
+
+/*
+ * Convert the dirty bitmap into a list of { sector, count } extents
+ * written to the start of the shared memory region.
+ *
+ * When the whole bitmap fits, the list is terminated by a { 0, 0 }
+ * entry and s->size is returned.  If the region fills up first, the
+ * function returns early with the first sector that was not examined;
+ * no terminator is written in that case.
+ */
+static uint64_t writelog_export(struct tdlog_state* s)
+{
+	struct disk_range* out = s->shm;
+	uint64_t sec = 0;
+
+	BDPRINTF("sector count: %"PRIu64, s->size);
+
+	while (sec < s->size) {
+		if (!test_bit(sec, s->writelog)) {
+			sec++;
+			continue;
+		}
+
+		/* start of a dirty extent; extend it while bits stay set */
+		out->sector = sec;
+		out->count = 1;
+		sec++;
+		while (sec < s->size && test_bit(sec, s->writelog)) {
+			out->count++;
+			sec++;
+		}
+
+		BDPRINTF("export: dirty extent %"PRIu64":%u",
+			 out->sector, out->count);
+		out++;
+
+		/* out of space in shared memory region */
+		if ((void*)out >= bmend(s->shm)) {
+			BDPRINTF("out of space in shm region at sector %"PRIu64, sec);
+			return sec;
+		}
+	}
+
+	/* NULL-terminate range list */
+	out->sector = 0;
+	out->count = 0;
+
+	return sec;
+}
+
+/* -- communication channel -- */
+
+/*
+ * Replace every ':' and '/' with '_' in the first @len bytes of @path,
+ * stopping early at the terminating NUL.
+ */
+static inline void path_escape(char* path, size_t len) {
+	size_t left;
+
+	for (left = len; left > 0 && *path; left--, path++)
+		if (*path == ':' || *path == '/')
+			*path = '_';
+}
+
+/*
+ * Build the control path BLKTAP_CTRL_DIR "/log_<file>.<ext>", where
+ * <file> is the tail of @name starting at its last '/' (the slash
+ * itself included, then escaped to '_').
+ *
+ * Returns a heap string the caller must free, or NULL if @name has no
+ * '/' or the allocation fails.
+ */
+static char* ctl_makepath(const char* name, const char* ext)
+{
+	/* offset of <file> in the result: directory plus "/log_" */
+	const size_t prefix = strlen(BLKTAP_CTRL_DIR) + 5;
+	char *path;
+	char *base;
+
+	base = strrchr(name, '/');
+	if (base == NULL) {
+		BWPRINTF("invalid name %s\n", name);
+		return NULL;
+	}
+
+	if (asprintf(&path, BLKTAP_CTRL_DIR "/log_%s.%s", base, ext) < 0) {
+		BWPRINTF("could not allocate path");
+		return NULL;
+	}
+
+	path_escape(path + prefix, strlen(base));
+
+	return path;
+}
+
+/*
+ * Create and map the shared memory region "/log_<name>.wlog" (with ':'
+ * and '/' in <name> replaced by '_'), SHMSIZE bytes long.
+ *
+ * On success s->shmpath and s->shm are set and 0 is returned.  On
+ * failure -1 is returned, both fields are NULL, and a shm object that
+ * was already created is unlinked again.
+ */
+static int shmem_open(struct tdlog_state* s, const char* name)
+{
+	int fd;
+
+	/* device name -> path */
+	if (asprintf(&s->shmpath, "/log_%s.wlog", name) < 0) {
+		BWPRINTF("could not allocate shm path");
+		/* the pointer is undefined after a failed asprintf */
+		s->shmpath = NULL;
+		return -1;
+	}
+
+	path_escape(s->shmpath + 5, strlen(name));
+
+	if ((fd = shm_open(s->shmpath, O_CREAT|O_RDWR, 0750)) < 0) {
+		BWPRINTF("could not open shared memory file %s: %s", s->shmpath,
+			 strerror(errno));
+		goto err;
+	}
+	if (ftruncate(fd, SHMSIZE) < 0) {
+		BWPRINTF("error truncating shmem to size %u", SHMSIZE);
+		close(fd);
+		goto err_unlink;
+	}
+
+	s->shm = mmap(NULL, SHMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+	/* the mapping stays valid after the descriptor is closed */
+	close(fd);
+	if (s->shm == MAP_FAILED) {
+		BWPRINTF("could not mmap write log shm: %s", strerror(errno));
+		goto err_unlink;
+	}
+	return 0;
+
+ err_unlink:
+	/* shmem_close() cannot do this later: the path is dropped below */
+	shm_unlink(s->shmpath);
+ err:
+	s->shm = NULL;
+	free(s->shmpath);
+	s->shmpath = NULL;
+	return -1;
+}
+
+/*
+ * Undo shmem_open(): unmap the region, unlink the shm object and free
+ * the path string.  Safe to call when nothing was opened (both fields
+ * NULL).  Always returns 0.
+ */
+static int shmem_close(struct tdlog_state* s)
+{
+	if (s->shm) {
+		munmap(s->shm, SHMSIZE);
+		s->shm = NULL;
+	}
+
+	if (s->shmpath) {
+		shm_unlink(s->shmpath);
+		/* the path was allocated by asprintf() in shmem_open() */
+		free(s->shmpath);
+		s->shmpath = NULL;
+	}
+
+	return 0;
+}
+
+/* control socket */
+
+/*
+ * Create the listening UNIX control socket for image @name and
+ * register ctl_accept() as its read handler.
+ *
+ * A stale socket file at the same path is removed first.  Returns 0 on
+ * success.  On failure returns -1 with s->ctlpath freed and set to
+ * NULL; if the socket had been created it is closed and s->ctl.fd is
+ * set to -1.
+ */
+static int ctl_open(struct tdlog_state* s, const char* name)
+{
+	struct sockaddr_un saddr;
+	size_t pathlen;
+
+	if (!(s->ctlpath = ctl_makepath(name, "ctl")))
+		return -1;
+
+	if ((s->ctl.fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+		BWPRINTF("error opening control socket: %s", strerror(errno));
+		goto err;
+	}
+
+	/* sun_path is a fixed-size array: refuse paths that do not fit */
+	pathlen = strlen(s->ctlpath);
+	if (pathlen >= sizeof(saddr.sun_path)) {
+		BWPRINTF("control socket path %s is too long", s->ctlpath);
+		goto err_sock;
+	}
+
+	memset(&saddr, 0, sizeof(saddr));
+	saddr.sun_family = AF_UNIX;
+	memcpy(saddr.sun_path, s->ctlpath, pathlen);
+	if (unlink(s->ctlpath) && errno != ENOENT) {
+		BWPRINTF("error unlinking old socket path %s: %s", s->ctlpath,
+			 strerror(errno));
+		goto err_sock;
+	}
+
+	if (bind(s->ctl.fd, (const struct sockaddr *)&saddr, sizeof(saddr)) < 0) {
+		BWPRINTF("error binding control socket to %s: %s", s->ctlpath,
+			 strerror(errno));
+		goto err_sock;
+	}
+
+	if (listen(s->ctl.fd, 1) < 0) {
+		BWPRINTF("error listening on control socket: %s", strerror(errno));
+		goto err_sock;
+	}
+
+	s->ctl.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+						  s->ctl.fd, 0, ctl_accept, s);
+	if (s->ctl.id < 0) {
+		/* NOTE(review): id is negative here; strerror() may need -id */
+		BWPRINTF("error register event handler: %s", strerror(s->ctl.id));
+		goto err_sock;
+	}
+
+	return 0;
+
+ err_sock:
+	close(s->ctl.fd);
+	s->ctl.fd = -1;
+ err:
+	free(s->ctlpath);
+	s->ctlpath = NULL;
+
+	return -1;
+}
+
+/*
+ * Shut down the control channel: drop every open connection, close the
+ * listening socket, remove its path and reset the shared ring if one
+ * was set up.  Always returns 0.
+ */
+static int ctl_close(struct tdlog_state* s)
+{
+	poll_fd_t *conn;
+
+	/* tear connections down from the highest slot to the lowest */
+	while (s->connected) {
+		s->connected--;
+		conn = &s->connections[s->connected];
+
+		tapdisk_server_unregister_event(conn->id);
+		close(conn->fd);
+		conn->fd = -1;
+		conn->id = 0;
+	}
+
+	if (s->ctl.fd >= 0) {
+		tapdisk_server_unregister_event(s->ctl.id);
+		close(s->ctl.fd);
+		s->ctl.fd = -1;
+		s->ctl.id = 0;
+	}
+
+	if (s->ctlpath) {
+		unlink(s->ctlpath);
+		free(s->ctlpath);
+		s->ctlpath = NULL;
+	}
+
+	/* XXX this must be fixed once requests are actually in flight */
+	/* could just drain the existing ring here first */
+	if (s->sring) {
+		SHARED_RING_INIT(s->sring);
+		BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
+	}
+
+	return 0;
+}
+
+/*
+ * Close the control connection that uses descriptor @fd.
+ * Returns 0 if it was found and closed, -1 (after logging a warning)
+ * if no open connection uses @fd.
+ */
+static int ctl_close_sock(struct tdlog_state* s, int fd)
+{
+	poll_fd_t *conn;
+	int n;
+
+	for (n = 0; n < s->connected; n++) {
+		conn = &s->connections[n];
+		if (conn->fd != fd)
+			continue;
+
+		tapdisk_server_unregister_event(conn->id);
+		close(conn->fd);
+		conn->fd = -1;
+		conn->id = 0;
+		s->connected--;
+		return 0;
+	}
+
+	BWPRINTF("requested to close unknown socket %d", fd);
+	return -1;
+}
+
+/*
+ * Read-event handler for the listening control socket.
+ *
+ * Accepts the pending connection.  If a session is already open the
+ * new connection is closed straight away; otherwise ctl_request() is
+ * registered for it and it is recorded in s->connections.
+ */
+static void ctl_accept(event_id_t id, char mode, void *private)
+{
+	struct tdlog_state* s = (struct tdlog_state *)private;
+	poll_fd_t *conn;
+	event_id_t cid;
+	int fd;
+
+	fd = accept(s->ctl.fd, NULL, NULL);
+	if (fd < 0) {
+		BWPRINTF("error accepting control connection: %s", strerror(errno));
+		return;
+	}
+
+	if (s->connected) {
+		BWPRINTF("control session in progress, closing new connection");
+		close(fd);
+		return;
+	}
+
+	cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+					    fd, 0, ctl_request, s);
+	if (cid < 0) {
+		BWPRINTF("error registering connection event handler: %s", strerror(cid));
+		close(fd);
+		return;
+	}
+
+	conn = &s->connections[s->connected];
+	conn->fd = fd;
+	conn->id = cid;
+	s->connected++;
+}
+
+/*
+ * Answer a "shared memory parameters" request on @fd.
+ *
+ * response format: 4 bytes shmsize, 0-terminated path, zero padded to
+ * CTLRSPLEN_SHMP bytes.
+ *
+ * Returns 0 on success, -1 if the write fails.
+ */
+static int ctl_get_shmpath(struct tdlog_state* s, int fd)
+{
+	char msg[CTLRSPLEN_SHMP + 1];
+	uint32_t sz;
+	int rc;
+
+	BDPRINTF("ctl: sending shared memory parameters (size: %u, path: %s)",
+		 SHMSIZE, s->shmpath);
+
+	/* TMP: sanity-check shm */
+	sz = 0xdeadbeef;
+	memcpy(s->shm, &sz, sizeof(sz));
+
+	/*
+	 * The whole fixed-size buffer goes out on the socket, so zero it
+	 * first: otherwise the bytes after the path would be whatever was
+	 * on the stack.
+	 */
+	memset(msg, 0, sizeof(msg));
+
+	sz = SHMSIZE;
+	memcpy(msg, &sz, sizeof(sz));
+	snprintf(msg + sizeof(sz), sizeof(msg) - sizeof(sz), "%s", s->shmpath);
+	if ((rc = write(fd, msg, CTLRSPLEN_SHMP)) < 0) {
+		BWPRINTF("error writing shmpath: %s", strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Export the dirty extents to shared memory without clearing the
+ * bitmap, then acknowledge with "done" on @fd.
+ * Returns 0 on success, -1 if the acknowledgement cannot be written.
+ */
+static int ctl_peek_writes(struct tdlog_state* s, int fd)
+{
+	int rc;
+
+	BDPRINTF("ctl: peeking bitmap");
+
+	writelog_export(s);
+
+	rc = write(fd, "done", CTLRSPLEN_PEEK);
+	if (rc >= 0)
+		return 0;
+
+	BWPRINTF("error writing peek ack: %s", strerror(errno));
+	return -1;
+}
+
+/*
+ * Clear the whole dirty bitmap, then acknowledge with "done" on @fd.
+ * Returns 0 on success, -1 if the acknowledgement cannot be written.
+ */
+static int ctl_clear_writes(struct tdlog_state* s, int fd)
+{
+	int rc;
+
+	BDPRINTF("ctl: clearing bitmap");
+
+	writelog_clear(s, 0, 0);
+
+	rc = write(fd, "done", CTLRSPLEN_CLEAR);
+	if (rc >= 0)
+		return 0;
+
+	BWPRINTF("error writing clear ack: %s", strerror(errno));
+	return -1;
+}
+
+/*
+ * Export the dirty extents to shared memory and immediately clear the
+ * whole bitmap, then acknowledge with "done" on @fd.
+ * Returns 0 on success, -1 if the acknowledgement cannot be written.
+ */
+static int ctl_get_writes(struct tdlog_state* s, int fd)
+{
+	int rc;
+
+	BDPRINTF("ctl: getting bitmap");
+
+	/* export first, then clear */
+	writelog_export(s);
+	writelog_clear(s, 0, 0);
+
+	rc = write(fd, "done", CTLRSPLEN_GET);
+	if (rc >= 0)
+		return 0;
+
+	BWPRINTF("error writing get ack: %s", strerror(errno));
+	return -1;
+}
+
+/*
+ * Drain the request ring after a kick from the peer.
+ *
+ * Every pending request is consumed and echoed straight back as a
+ * response carrying the same sector/count (requests are not actually
+ * submitted yet).  The responses are then pushed and a LOGCMD_KICK
+ * message is written to @fd.
+ *
+ * Returns 0 on success, -1 if the notification is not fully written.
+ */
+static int ctl_kick(struct tdlog_state* s, int fd)
+{
+	RING_IDX cons, prod;
+	log_request_t req;
+	log_response_t rsp;
+	struct log_ctlmsg msg;
+	int rc;
+
+	cons = s->bring.req_cons;
+	prod = s->sring->req_prod;
+
+	BDPRINTF("ctl: ring kicked (start = %u, end = %u)", cons, prod);
+
+	while (cons != prod) {
+		/* XXX actually submit these! */
+		memcpy(&req, RING_GET_REQUEST(&s->bring, cons), sizeof(req));
+		BDPRINTF("ctl: read request %"PRIu64":%u", req.sector, req.count);
+		cons++;
+		s->bring.req_cons = cons;
+
+		/* XXX testing: echo the request back as its own response */
+		rsp.sector = req.sector;
+		rsp.count = req.count;
+		memcpy(RING_GET_RESPONSE(&s->bring, s->bring.rsp_prod_pvt), &rsp,
+		       sizeof(rsp));
+		s->bring.rsp_prod_pvt++;
+	}
+
+	RING_PUSH_RESPONSES(&s->bring);
+
+	memset(&msg, 0, sizeof(msg));
+	memcpy(msg.msg, LOGCMD_KICK, 4);
+
+	rc = write(fd, &msg, sizeof(msg));
+	if (rc < 0) {
+		BWPRINTF("error sending notify: %s", strerror(errno));
+		return -1;
+	}
+	if (rc < sizeof(msg)) {
+		BWPRINTF("short notify write (%d/%zd)", rc, sizeof(msg));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Dispatch one control message to its handler, selected by the first
+ * four bytes of msg->msg.  Returns the handler's result, or -1 (after
+ * logging a warning) for an unknown command.
+ */
+static int ctl_do_request(struct tdlog_state* s, int fd, struct log_ctlmsg* msg)
+{
+	/* checked in this order; the first match wins */
+	const struct {
+		const char *cmd;
+		int (*handler)(struct tdlog_state *, int);
+	} table[] = {
+		{ LOGCMD_SHMP,  ctl_get_shmpath  },
+		{ LOGCMD_PEEK,  ctl_peek_writes  },
+		{ LOGCMD_CLEAR, ctl_clear_writes },
+		{ LOGCMD_GET,   ctl_get_writes   },
+		{ LOGCMD_KICK,  ctl_kick         },
+	};
+	unsigned int n;
+
+	for (n = 0; n < sizeof(table) / sizeof(table[0]); n++)
+		if (!strncmp(msg->msg, table[n].cmd, 4))
+			return table[n].handler(s, fd);
+
+	BWPRINTF("unknown control request %.4s", msg->msg);
+	return -1;
+}
+
+/*
+ * Map a scheduler event id back to the descriptor of the control
+ * connection it was registered for.  Returns that fd, or -1 (after
+ * logging a warning) if no open connection uses @id.
+ */
+static inline int ctl_find_connection(struct tdlog_state *s, event_id_t id)
+{
+	const poll_fd_t *conn = s->connections;
+	int left;
+
+	for (left = s->connected; left > 0; left--, conn++)
+		if (conn->id == id)
+			return conn->fd;
+
+	BWPRINTF("unrecognized event callback id %d", id);
+	return -1;
+}
+
+/*
+ * Read-event handler for an established control connection.
+ *
+ * Reads one struct log_ctlmsg and dispatches it.  A read error or EOF
+ * closes the connection; a short read is logged and ignored.
+ */
+static void ctl_request(event_id_t id, char mode, void *private)
+{
+	struct tdlog_state* s = (struct tdlog_state*)private;
+	struct log_ctlmsg msg;
+	int fd, rc;
+
+	fd = ctl_find_connection(s, id);
+	if (fd == -1)
+		return;
+
+	rc = read(fd, &msg, sizeof(msg));
+
+	if (rc < 0) {
+		BWPRINTF("error reading from ctl socket %d, closing: %s", fd,
+			 strerror(errno));
+		ctl_close_sock(s, fd);
+		return;
+	}
+
+	if (rc == 0) {
+		BDPRINTF("ctl_request: EOF, closing socket");
+		ctl_close_sock(s, fd);
+		return;
+	}
+
+	if (rc < sizeof(msg)) {
+		BWPRINTF("short request received (%d/%zd bytes), ignoring", rc,
+			 sizeof(msg));
+		return;
+	}
+
+	ctl_do_request(s, fd, &msg);
+}
+
+/* -- interface -- */
+
+static int tdlog_close(td_driver_t*);
+
+/*
+ * Open the write-logging driver.
+ *
+ * Clears the private state, records the disk size, then brings up the
+ * write log, the shared-memory region and the control socket in that
+ * order.  The first step that fails aborts the sequence: everything is
+ * torn down through tdlog_close() and that step's error code is returned.
+ * On success the shared ring inside the shm area is initialised and 0 is
+ * returned.  'flags' is not consulted.
+ */
+static int tdlog_open(td_driver_t* driver, const char* name, td_flag_t flags)
+{
+	struct tdlog_state *state = (struct tdlog_state *)driver->data;
+	int err;
+
+	memset(state, 0, sizeof(*state));
+	state->size = driver->info.size;
+
+	err = writelog_create(state);
+	if (!err)
+		err = shmem_open(state, name);
+	if (!err)
+		err = ctl_open(state, name);
+
+	if (err) {
+		tdlog_close(driver);
+		return err;
+	}
+
+	state->sring = (log_sring_t *)sringstart(state->shm);
+	SHARED_RING_INIT(state->sring);
+	BACK_RING_INIT(&state->bring, state->sring, SRINGSIZE);
+
+	BDPRINTF("opened ctl socket");
+
+	return 0;
+}
+
+/*
+ * Tear down the logging driver: control socket first, then the shared
+ * memory region, then the write log.  Always returns 0.
+ */
+static int tdlog_close(td_driver_t* driver)
+{
+	struct tdlog_state *state = driver->data;
+
+	ctl_close(state);
+	shmem_close(state);
+	writelog_free(state);
+
+	return 0;
+}
+
+/* Reads are not logged: pass the request on unchanged via td_forward_request(). */
+static void tdlog_queue_read(td_driver_t* driver, td_request_t treq)
+{
+	(void)driver;
+
+	td_forward_request(treq);
+}
+
+/*
+ * Record the sector range of a write in the write log (writelog_set), then
+ * forward the request unchanged via td_forward_request().
+ */
+static void tdlog_queue_write(td_driver_t* driver, td_request_t treq)
+{
+	struct tdlog_state* s = (struct tdlog_state*)driver->data;
+
+	writelog_set(s, treq.sec, treq.secs);
+	td_forward_request(treq);
+}
+
+/* This driver never reports a parent id: always answers -EINVAL. */
+static int tdlog_get_parent_id(td_driver_t* driver, td_disk_id_t* id)
+{
+	(void)driver;
+	(void)id;
+
+	return -EINVAL;
+}
+
+/* No parent checks are performed: every parent is accepted (returns 0). */
+static int tdlog_validate_parent(td_driver_t *driver,
+				 td_driver_t *parent, td_flag_t flags)
+{
+	(void)driver;
+	(void)parent;
+	(void)flags;
+
+	return 0;
+}
+
+/* Driver descriptor binding the tdlog_* callbacks defined in this file. */
+struct tap_disk tapdisk_log = {
+	/* identity and per-instance storage */
+	.disk_type          = "tapdisk_log",
+	.flags              = 0,
+	.private_data_size  = sizeof(struct tdlog_state),
+
+	/* lifecycle */
+	.td_open            = tdlog_open,
+	.td_close           = tdlog_close,
+
+	/* I/O path */
+	.td_queue_read      = tdlog_queue_read,
+	.td_queue_write     = tdlog_queue_write,
+
+	/* parent handling */
+	.td_get_parent_id   = tdlog_get_parent_id,
+	.td_validate_parent = tdlog_validate_parent,
+};
--- /dev/null
+/* block-qcow.c
+ *
+ * Asynchronous Qemu copy-on-write disk implementation.
+ * Code based on the Qemu implementation
+ * (see copyright notice below)
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ */
+
+/*
+ * Block driver for the QCOW format
+ *
+ * Copyright (c) 2004 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files(the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <zlib.h>
+#include <inttypes.h>
+#include <libaio.h>
+#include <limits.h>
+#include "bswap.h"
+#include "aes.h"
+#include "md5.h"
+
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-disktype.h"
+#include "qcow.h"
+#include "blk.h"
+#include "atomicio.h"
+
+/*
+ * *BSD has no O_LARGEFILE.  When the platform headers do not provide it,
+ * define it as 0 so that OR-ing it into open() flags has no effect.
+ */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+#if 1
+#define ASSERT(_p) \
+ if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
+ __LINE__, __FILE__); *(int*)0=0; }
+#else
+#define ASSERT(_p) ((void)0)
+#endif
+
+/*
+ * Bookkeeping record for one outstanding asynchronous I/O.
+ * NOTE(review): nothing in the visible part of this file references this
+ * struct; the field descriptions below are inferred from names and types
+ * only -- confirm against any remaining users before relying on them.
+ */
+struct pending_aio {
+ td_callback_t cb; /* presumably the completion callback */
+ int id; /* presumably a request identifier */
+ void *private; /* presumably opaque data handed back to cb */
+ int nb_sectors; /* presumably the length of the transfer in sectors */
+ char *buf; /* presumably the data buffer */
+ uint64_t sector; /* presumably the starting sector */
+};
+
+/* Index of iocb '_io' within the iocb_list array of state '_s'. */
+#undef IOCB_IDX
+#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)
+
+/*
+ * OR-ing with zero leaves the value unchanged.  The argument is
+ * parenthesised so that an expression argument containing a lower
+ * precedence operator (e.g. a ?: or comma-free assignment) still groups
+ * as the caller wrote it.
+ */
+#define ZERO_TEST(_b) ((_b) | 0x00)
+
+/*
+ * One slot of the driver's AIO request pool.  Slots are handed out from
+ * aio_free_list by async_read()/async_write() and put back by
+ * tdqcow_complete().
+ */
+struct qcow_request {
+ td_request_t treq; /* copy of the request being serviced; completed in tdqcow_complete() */
+ struct tiocb tiocb; /* I/O control block prepared with td_prep_read()/td_prep_write() */
+ struct tdqcow_state *state; /* owning driver state, used to return the slot to the free list */
+};
+
+static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
+
+/*
+ * Compute a 32-bit checksum over 'len' bytes starting at 'ptr'.
+ *
+ * md5_sum() fills a 16-byte digest buffer; the value returned is the first
+ * 32-bit word of that buffer as read by the host CPU.
+ */
+uint32_t gen_cksum(char *ptr, int len)
+{
+	uint32_t md[4];
+
+	/* Generate checksum */
+	md5_sum((const uint8_t*)ptr, len, (uint8_t*)md);
+
+	return md[0];
+}
+
+/*
+ * Release the AIO request pool and its free list.
+ *
+ * The pointers are cleared after freeing so that calling this function
+ * more than once on the same state (as the open failure path does) is
+ * harmless instead of a double free.
+ */
+static void free_aio_state(struct tdqcow_state* s)
+{
+	free(s->aio_requests);
+	s->aio_requests = NULL;
+
+	free(s->aio_free_list);
+	s->aio_free_list = NULL;
+}
+
+/*
+ * Allocate the pool of AIO request slots and the stack of free slots.
+ *
+ * The pool size is derived from the page size, the cluster size and the
+ * MAX_SEGMENTS_PER_REQ / MAX_REQUESTS limits.  Returns 0 on success and
+ * -1 on allocation failure; on failure both pointers are left NULL, so a
+ * later free_aio_state() by the caller is safe.
+ */
+static int init_aio_state(td_driver_t *driver)
+{
+	int i;
+	struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+
+	/* make the failure path well-defined whatever the state held before */
+	s->aio_requests = NULL;
+	s->aio_free_list = NULL;
+
+	// A segment (i.e. a page) can span multiple clusters
+	s->max_aio_reqs = ((getpagesize() / s->cluster_size) + 1) *
+		MAX_SEGMENTS_PER_REQ * MAX_REQUESTS;
+
+	s->aio_free_count = s->max_aio_reqs;
+
+	s->aio_requests = calloc(s->max_aio_reqs, sizeof(struct qcow_request));
+	/* the free list holds pointers to slots, not slots themselves */
+	s->aio_free_list = calloc(s->max_aio_reqs, sizeof(*s->aio_free_list));
+	if (!s->aio_requests || !s->aio_free_list) {
+		DPRINTF("Failed to allocate AIO structs (max_aio_reqs = %d)\n",
+			s->max_aio_reqs);
+		goto fail;
+	}
+
+	for (i = 0; i < s->max_aio_reqs; i++)
+		s->aio_free_list[i] = &s->aio_requests[i];
+
+	DPRINTF("AIO state initialised\n");
+
+	return 0;
+
+fail:
+	free(s->aio_requests);
+	free(s->aio_free_list);
+	s->aio_requests = NULL;
+	s->aio_free_list = NULL;
+	s->aio_free_count = 0;
+	return -1;
+}
+
+/*
+ * Determine the size, in sectors, of the image stored in 'filename'.
+ *
+ *  - If the file starts with a QCOW header, the virtual size recorded in
+ *    that header is used.
+ *  - Otherwise, for a block device (according to st->st_mode) the size is
+ *    queried with blk_getimagesize().
+ *  - Otherwise st->st_size is used.
+ *
+ * 'st' must already have been filled in by the caller.  Returns 0 on
+ * success with *size set, -1 on failure (cannot open, read error, file
+ * shorter than a QCOW header, or block size query failed).
+ */
+int get_filesize(char *filename, uint64_t *size, struct stat *st)
+{
+	int fd, rc = -1;
+	ssize_t nread;
+	QCowHeader header;
+
+	/*Set to the backing file size*/
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	/*
+	 * read() returns -1 on error; compare as a signed value first so an
+	 * error is never mistaken for a complete header.
+	 */
+	nread = read(fd, &header, sizeof(header));
+	if (nread < 0 || (size_t)nread < sizeof(header))
+		goto out;
+
+	be32_to_cpus(&header.magic);
+	be64_to_cpus(&header.size);
+	if (header.magic == QCOW_MAGIC) {
+		*size = header.size >> SECTOR_SHIFT;
+		rc = 0;
+		goto out;
+	}
+
+	if (S_ISBLK(st->st_mode)) {
+		if (blk_getimagesize(fd, size) != 0) {
+			printf("Unable to get Block device size\n");
+			goto out;
+		}
+	} else {
+		*size = (st->st_size >> SECTOR_SHIFT);
+	}
+	rc = 0;
+
+out:
+	close(fd);
+	return rc;
+}
+
+/*
+ * Install 'key' as the AES-128 key for this image.
+ *
+ * At most the first 16 bytes of the string are used; shorter keys are
+ * zero padded.  The crypt method recorded in the header becomes the
+ * active one, then both the encrypt and the decrypt key schedules are set
+ * up.  Returns 0 on success, -1 if either schedule cannot be set.
+ */
+static int qcow_set_key(struct tdqcow_state *s, const char *key)
+{
+	uint8_t keybuf[16];
+	int used;
+
+	used = strlen(key);
+	if (used > 16)
+		used = 16;
+
+	/* XXX: we could compress the chars to 7 bits to increase
+	   entropy */
+	memset(keybuf, 0, 16);
+	memcpy(keybuf, key, used);
+
+	s->crypt_method = s->crypt_method_header;
+
+	if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0 ||
+	    AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+		return -1;
+#if 0
+	/* test */
+	{
+		int i;
+		uint8_t in[16];
+		uint8_t out[16];
+		uint8_t tmp[16];
+		for (i=0; i<16; i++)
+			in[i] = i;
+		AES_encrypt(in, tmp, &s->aes_encrypt_key);
+		AES_decrypt(tmp, out, &s->aes_decrypt_key);
+		for (i = 0; i < 16; i++)
+			DPRINTF(" %02x", tmp[i]);
+		DPRINTF("\n");
+		for (i = 0; i < 16; i++)
+			DPRINTF(" %02x", out[i]);
+		DPRINTF("\n");
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Completion callback for an I/O queued by async_read()/async_write().
+ * 'arg' is the struct qcow_request slot.  The original request is
+ * completed with 'err' and the slot is pushed back on the free list.
+ */
+void tdqcow_complete(void *arg, struct tiocb *tiocb, int err)
+{
+	struct qcow_request *slot = arg;
+	struct tdqcow_state *state = slot->state;
+
+	td_complete_request(slot->treq, err);
+
+	state->aio_free_list[state->aio_free_count] = slot;
+	state->aio_free_count++;
+}
+
+/*
+ * Queue an asynchronous read of treq.secs sectors starting at physical
+ * sector treq.sec of the image file into treq.buf.
+ *
+ * A request slot is popped from the free list; if none is available the
+ * request is completed immediately with -EBUSY.  Completion is reported
+ * through tdqcow_complete().
+ */
+static void async_read(td_driver_t *driver, td_request_t treq)
+{
+	struct tdqcow_state *state = (struct tdqcow_state *)driver->data;
+	struct qcow_request *slot;
+	uint64_t byte_offset;
+	int byte_count;
+
+	byte_count  = treq.secs * driver->info.sector_size;
+	byte_offset = treq.sec * (uint64_t)driver->info.sector_size;
+
+	if (state->aio_free_count == 0) {
+		td_complete_request(treq, -EBUSY);
+		return;
+	}
+
+	state->aio_free_count--;
+	slot = state->aio_free_list[state->aio_free_count];
+	slot->treq  = treq;
+	slot->state = state;
+
+	td_prep_read(&slot->tiocb, state->fd, treq.buf,
+		     byte_count, byte_offset, tdqcow_complete, slot);
+	td_queue_tiocb(driver, &slot->tiocb);
+}
+
+/*
+ * Queue an asynchronous write of treq.secs sectors from treq.buf to
+ * physical sector treq.sec of the image file.
+ *
+ * A request slot is popped from the free list; if none is available the
+ * request is completed immediately with -EBUSY.  Completion is reported
+ * through tdqcow_complete().
+ */
+static void async_write(td_driver_t *driver, td_request_t treq)
+{
+	struct tdqcow_state *state = (struct tdqcow_state *)driver->data;
+	struct qcow_request *slot;
+	uint64_t byte_offset;
+	int byte_count;
+
+	byte_count  = treq.secs * driver->info.sector_size;
+	byte_offset = treq.sec * (uint64_t)driver->info.sector_size;
+
+	if (state->aio_free_count == 0) {
+		td_complete_request(treq, -EBUSY);
+		return;
+	}
+
+	state->aio_free_count--;
+	slot = state->aio_free_list[state->aio_free_count];
+	slot->treq  = treq;
+	slot->state = state;
+
+	td_prep_write(&slot->tiocb, state->fd, treq.buf,
+		      byte_count, byte_offset, tdqcow_complete, slot);
+	td_queue_tiocb(driver, &slot->tiocb);
+}
+
+/*
+ * Encrypt (enc != 0) or decrypt (enc == 0) 'nb_sectors' consecutive
+ * 512-byte sectors from in_buf to out_buf with AES-CBC.
+ *
+ * Each sector gets its own IV: the little-endian sector number in the
+ * first eight bytes, zero in the rest.  The scheme is compatible with the
+ * linux cryptoloop algorithm for < 4 GB images.  NOTE: out_buf == in_buf
+ * is supported.  's' is not used.
+ */
+static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
+			    uint8_t *out_buf, const uint8_t *in_buf,
+			    int nb_sectors, int enc,
+			    const AES_KEY *key)
+{
+	union {
+		uint64_t ll[2];
+		uint8_t b[16];
+	} ivec;
+	int done;
+
+	for (done = 0; done < nb_sectors; done++) {
+		size_t byte_off = (size_t)done * 512;
+
+		ivec.ll[0] = cpu_to_le64(sector_num + done);
+		ivec.ll[1] = 0;
+		AES_cbc_encrypt(in_buf + byte_off, out_buf + byte_off, 512,
+				key, ivec.b, enc);
+	}
+}
+
+/*
+ * Bring the file behind 'fd' to 'length' bytes, rounded up to a whole
+ * number of DEFAULT_SECTOR_SIZE sectors.
+ *
+ *  - Block devices are left alone (returns 0).
+ *  - A file that is too short is extended by synchronously writing zeroes
+ *    at its end; this tries to ensure that the extents allocated wind up
+ *    being contiguous on disk.
+ *  - A file that is too long is cut down with ftruncate(), but only when
+ *    'sparse' is non-zero.
+ *
+ * Returns 0 on success, -1 on any failure.
+ */
+int qtruncate(int fd, off_t length, int sparse)
+{
+	int ret;
+	/* 64-bit counters: an int sector count overflows at 1 TiB */
+	uint64_t sectors, current, i;
+	int rem;
+	struct stat st;
+	char *buf;
+
+	ret = fstat(fd, &st);
+	if (ret == -1)
+		return -1;
+	if (S_ISBLK(st.st_mode))
+		return 0;
+
+	sectors = (length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
+	current = (st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
+	rem     = st.st_size % DEFAULT_SECTOR_SIZE;
+
+	if ((uint64_t)st.st_size < sectors * DEFAULT_SECTOR_SIZE) {
+		/*We are extending the file*/
+		if ((ret = posix_memalign((void **)&buf,
+					  512, DEFAULT_SECTOR_SIZE))) {
+			DPRINTF("posix_memalign failed: %d\n", ret);
+			return -1;
+		}
+		memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
+		if (lseek(fd, 0, SEEK_END)==-1) {
+			DPRINTF("Lseek EOF failed (%d), internal error\n",
+				errno);
+			free(buf);
+			return -1;
+		}
+		if (rem) {
+			/*
+			 * The file ends part-way through a sector: fill the
+			 * remainder of that sector so the whole-sector loop
+			 * below starts on a sector boundary.  ('current' was
+			 * rounded up and already counts this sector.)
+			 */
+			int pad = DEFAULT_SECTOR_SIZE - rem;
+
+			ret = write(fd, buf, pad);
+			if (ret != pad) {
+				DPRINTF("write failed: ret = %d, err = %s\n",
+					ret, strerror(errno));
+				free(buf);
+				return -1;
+			}
+		}
+		for (i = current; i < sectors; i++ ) {
+			ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
+			if (ret != DEFAULT_SECTOR_SIZE) {
+				DPRINTF("write failed: ret = %d, err = %s\n",
+					ret, strerror(errno));
+				free(buf);
+				return -1;
+			}
+		}
+		free(buf);
+	} else if (sparse &&
+		   ((uint64_t)st.st_size > sectors * DEFAULT_SECTOR_SIZE)) {
+		if (ftruncate(fd, (off_t)sectors * DEFAULT_SECTOR_SIZE)==-1) {
+			DPRINTF("Ftruncate failed (%s)\n", strerror(errno));
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Translate a guest byte 'offset' into the byte offset of its cluster in
+ * the image file, optionally allocating the cluster (and its L2 table).
+ *
+ * 'allocate' is:
+ *
+ * 0 to not allocate.
+ *
+ * 1 to allocate a normal cluster (for sector indexes 'n_start' to
+ * 'n_end')
+ *
+ * 2 to allocate a compressed cluster of size
+ * 'compressed_size'. 'compressed_size' must be > 0 and <
+ * cluster_size
+ *
+ * return 0 if not allocated.
+ *
+ * 0 is also the only error indication: callers test the result for zero,
+ * so every failure path must return 0 (a "-1" would be seen as a huge but
+ * valid offset).  All metadata updates done here are synchronous.
+ *
+ * NOTE(review): the paths that place a cluster at s->fd_end without
+ * calling qtruncate() (rewrite of a compressed cluster, allocate == 2) do
+ * not advance s->fd_end -- confirm whether that is intended.
+ */
+static uint64_t get_cluster_offset(struct tdqcow_state *s,
+				   uint64_t offset, int allocate,
+				   int compressed_size,
+				   int n_start, int n_end)
+{
+	int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
+	char *tmp_ptr2, *l2_ptr, *l1_ptr;
+	uint64_t *tmp_ptr;
+	uint64_t l2_offset, *l2_table, cluster_offset, tmp;
+	uint32_t min_count;
+	int new_l2_table;
+
+	/*Check L1 table for the extent offset*/
+	l1_index = offset >> (s->l2_bits + s->cluster_bits);
+	l2_offset = s->l1_table[l1_index];
+	new_l2_table = 0;
+	if (!l2_offset) {
+		if (!allocate)
+			return 0;
+		/*
+		 * allocating a new l2 entry + extent
+		 * at the end of the file, we must also
+		 * update the L1 entry safely.
+		 */
+		l2_offset = s->fd_end;
+
+		/* round to cluster size */
+		l2_offset = (l2_offset + s->cluster_size - 1)
+			& ~(s->cluster_size - 1);
+
+		/* update the L1 entry */
+		s->l1_table[l1_index] = l2_offset;
+
+		/*Truncate file for L2 table
+		 *(initialised to zero in case we crash)*/
+		if (qtruncate(s->fd,
+			      l2_offset + (s->l2_size * sizeof(uint64_t)),
+			      s->sparse) != 0) {
+			DPRINTF("ERROR truncating file\n");
+			return 0;
+		}
+		s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));
+
+		/*Update the L1 table entry on disk
+		 * (for O_DIRECT we write 4KByte blocks)*/
+		l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
+		l1_ptr = (char *)s->l1_table + (l1_sector << 12);
+
+		if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
+			DPRINTF("ERROR allocating memory for L1 table\n");
+			return 0;
+		}
+		memcpy(tmp_ptr, l1_ptr, 4096);
+
+		/* Convert block to write to big endian */
+		for(i = 0; i < 4096 / sizeof(uint64_t); i++) {
+			cpu_to_be64s(&tmp_ptr[i]);
+		}
+
+		/*
+		 * Issue non-asynchronous L1 write.
+		 * For safety, we must ensure that
+		 * entry is written before blocks.
+		 */
+		lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
+		if (write(s->fd, tmp_ptr, 4096) != 4096) {
+			free(tmp_ptr);
+			return 0;
+		}
+		free(tmp_ptr);
+
+		new_l2_table = 1;
+		goto cache_miss;
+	} else if (s->min_cluster_alloc == s->l2_size) {
+		/*Fast-track the request*/
+		cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
+		l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+		return cluster_offset + (l2_index * s->cluster_size);
+	}
+
+	/*Check to see if L2 entry is already cached*/
+	for (i = 0; i < L2_CACHE_SIZE; i++) {
+		if (l2_offset == s->l2_cache_offsets[i]) {
+			/* increment the hit count */
+			if (++s->l2_cache_counts[i] == 0xffffffff) {
+				for (j = 0; j < L2_CACHE_SIZE; j++) {
+					s->l2_cache_counts[j] >>= 1;
+				}
+			}
+			l2_table = s->l2_cache + (i << s->l2_bits);
+			goto found;
+		}
+	}
+
+cache_miss:
+	/* not found: load a new entry in the least used one */
+	min_index = 0;
+	min_count = 0xffffffff;
+	for (i = 0; i < L2_CACHE_SIZE; i++) {
+		if (s->l2_cache_counts[i] < min_count) {
+			min_count = s->l2_cache_counts[i];
+			min_index = i;
+		}
+	}
+	l2_table = s->l2_cache + (min_index << s->l2_bits);
+
+	/*If extent pre-allocated, read table from disk,
+	 *otherwise write new table to disk*/
+	if (new_l2_table) {
+		/*Should we allocate the whole extent? Adjustable parameter.*/
+		if (s->cluster_alloc == s->l2_size) {
+			cluster_offset = l2_offset +
+				(s->l2_size * sizeof(uint64_t));
+			cluster_offset = (cluster_offset + s->cluster_size - 1)
+				& ~(s->cluster_size - 1);
+			if (qtruncate(s->fd, cluster_offset +
+				      (s->cluster_size * s->l2_size),
+				      s->sparse) != 0) {
+				DPRINTF("ERROR truncating file\n");
+				return 0;
+			}
+			s->fd_end = cluster_offset +
+				(s->cluster_size * s->l2_size);
+			for (i = 0; i < s->l2_size; i++) {
+				l2_table[i] = cpu_to_be64(cluster_offset +
+							  (i*s->cluster_size));
+			}
+		} else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+
+		lseek(s->fd, l2_offset, SEEK_SET);
+		if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
+		    s->l2_size * sizeof(uint64_t))
+			return 0;
+	} else {
+		lseek(s->fd, l2_offset, SEEK_SET);
+		if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
+		    s->l2_size * sizeof(uint64_t))
+			return 0;
+	}
+
+	/*Update the cache entries*/
+	s->l2_cache_offsets[min_index] = l2_offset;
+	s->l2_cache_counts[min_index] = 1;
+
+found:
+	/*The extent is split into 's->l2_size' blocks of
+	 *size 's->cluster_size'*/
+	l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+	cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+	if (!cluster_offset ||
+	    ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) {
+		if (!allocate)
+			return 0;
+
+		if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
+		    (n_end - n_start) < s->cluster_sectors) {
+			/* cluster is already allocated but compressed, we must
+			   decompress it in the case it is not completely
+			   overwritten */
+			if (decompress_cluster(s, cluster_offset) < 0)
+				return 0;
+			cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
+			cluster_offset = (cluster_offset + s->cluster_size - 1)
+				& ~(s->cluster_size - 1);
+			/* write the cluster content - not asynchronous */
+			lseek(s->fd, cluster_offset, SEEK_SET);
+			if (write(s->fd, s->cluster_cache, s->cluster_size) !=
+			    s->cluster_size)
+				return 0; /* failure must read as "no offset" */
+		} else {
+			/* allocate a new cluster */
+			cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
+			if (allocate == 1) {
+				/* round to cluster size */
+				cluster_offset =
+					(cluster_offset + s->cluster_size - 1)
+					& ~(s->cluster_size - 1);
+				if (qtruncate(s->fd, cluster_offset +
+					      s->cluster_size, s->sparse)!=0) {
+					DPRINTF("ERROR truncating file\n");
+					return 0;
+				}
+				s->fd_end = (cluster_offset + s->cluster_size);
+				/* if encrypted, we must initialize the cluster
+				   content which won't be written */
+				if (s->crypt_method &&
+				    (n_end - n_start) < s->cluster_sectors) {
+					uint64_t start_sect;
+					start_sect = (offset &
+						      ~(s->cluster_size - 1))
+						>> 9;
+					memset(s->cluster_data + 512,
+					       0xaa, 512);
+					for (i = 0; i < s->cluster_sectors;i++)
+					{
+						if (i < n_start || i >= n_end)
+						{
+							encrypt_sectors(s, start_sect + i,
+									s->cluster_data,
+									s->cluster_data + 512, 1, 1,
+									&s->aes_encrypt_key);
+							lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
+							if (write(s->fd, s->cluster_data, 512) != 512)
+								return 0; /* failure must read as "no offset" */
+						}
+					}
+				}
+			} else {
+				cluster_offset |= QCOW_OFLAG_COMPRESSED |
+					(uint64_t)compressed_size
+					<< (63 - s->cluster_bits);
+			}
+		}
+		/* update L2 table */
+		tmp = cpu_to_be64(cluster_offset);
+		l2_table[l2_index] = tmp;
+
+		/*For IO_DIRECT we write 4KByte blocks*/
+		l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
+		l2_ptr = (char *)l2_table + (l2_sector << 12);
+
+		if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
+			DPRINTF("ERROR allocating memory for L2 table\n");
+			return 0;
+		}
+		memcpy(tmp_ptr2, l2_ptr, 4096);
+		lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET);
+		if (write(s->fd, tmp_ptr2, 4096) != 4096) {
+			free(tmp_ptr2);
+			return 0; /* failure must read as "no offset" */
+		}
+		free(tmp_ptr2);
+	}
+	return cluster_offset;
+}
+
+/*
+ * Report whether the cluster holding 'sector_num' is allocated in this
+ * image (non-zero) or not (0).  *pnum receives how many of the requested
+ * 'nb_sectors' lie in that same cluster, counted from sector_num.
+ */
+static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num,
+			     int nb_sectors, int *pnum)
+{
+	uint64_t cluster_offset;
+	int first, left_in_cluster;
+
+	cluster_offset = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0);
+
+	first = sector_num & (s->cluster_sectors - 1);
+	left_in_cluster = s->cluster_sectors - first;
+
+	*pnum = (left_in_cluster > nb_sectors) ? nb_sectors : left_in_cluster;
+
+	return (cluster_offset != 0);
+}
+
+/*
+ * Inflate 'buf_size' bytes of raw deflate data (window bits -12) from
+ * 'buf' into 'out_buf'.
+ *
+ * Succeeds (returns 0) only when inflate() ends with Z_STREAM_END or
+ * Z_BUF_ERROR and exactly 'out_buf_size' bytes were produced; returns -1
+ * otherwise, including when the stream cannot be initialised.
+ */
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+			     const uint8_t *buf, int buf_size)
+{
+	z_stream zs;
+	int rc, produced;
+
+	memset(&zs, 0, sizeof(zs));
+	zs.next_in   = (uint8_t *)buf;
+	zs.avail_in  = buf_size;
+	zs.next_out  = out_buf;
+	zs.avail_out = out_buf_size;
+
+	if (inflateInit2(&zs, -12) != Z_OK)
+		return -1;
+
+	rc = inflate(&zs, Z_FINISH);
+	produced = zs.next_out - out_buf;
+	inflateEnd(&zs);
+
+	if (rc != Z_STREAM_END && rc != Z_BUF_ERROR)
+		return -1;
+	if (produced != out_buf_size)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Make s->cluster_cache hold the decompressed contents of the compressed
+ * cluster described by the L2 entry 'cluster_offset'.
+ *
+ * The entry encodes the file offset (masked with cluster_offset_mask) and
+ * the compressed size (the bits above it).  If the cache already holds
+ * that cluster nothing is done.  Returns 0 on success, -1 if the
+ * compressed data cannot be read in full or does not inflate to exactly
+ * one cluster.
+ */
+static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
+{
+	uint64_t file_off = cluster_offset & s->cluster_offset_mask;
+	int nread, packed_len;
+
+	if (s->cluster_cache_offset == file_off)
+		return 0;	/* already cached */
+
+	packed_len = cluster_offset >> (63 - s->cluster_bits);
+	packed_len &= (s->cluster_size - 1);
+
+	lseek(s->fd, file_off, SEEK_SET);
+	nread = read(s->fd, s->cluster_data, packed_len);
+	if (nread != packed_len)
+		return -1;
+
+	if (decompress_buffer(s->cluster_cache, s->cluster_size,
+			      s->cluster_data, packed_len) < 0)
+		return -1;
+
+	s->cluster_cache_offset = file_off;
+	return 0;
+}
+
+/*
+ * Read the QCOW header from the start of 'fd' into '*header' and convert
+ * its multi-byte fields from big endian to host order.
+ *
+ * The read goes through a 512-byte aligned bounce buffer whose size is
+ * the header size rounded up to 512.  A file shorter than that is
+ * accepted as long as the read returns everything the file has; the
+ * missing tail then reads as zero.
+ *
+ * Returns 0 on success or a negative errno value (-EIO for an unexpected
+ * short read).
+ */
+static int
+tdqcow_read_header(int fd, QCowHeader *header)
+{
+	int err;
+	char *buf;
+	struct stat st;
+	size_t size, expected;
+	ssize_t nread;
+
+	memset(header, 0, sizeof(*header));
+
+	if (fstat(fd, &st))
+		return -errno;
+
+	if (lseek(fd, 0, SEEK_SET) == (off_t)-1)
+		return -errno;
+
+	size = (sizeof(*header) + 511) & ~511;
+	err = posix_memalign((void **)&buf, 512, size);
+	if (err)
+		return -err;	/* posix_memalign returns a positive errno */
+
+	/* a short file must not leave uninitialised bytes in the header */
+	memset(buf, 0, size);
+
+	expected = size;
+	if ((size_t)st.st_size < size)
+		expected = st.st_size;
+
+	errno = 0;
+	nread = read(fd, buf, size);
+	if (nread < 0 || (size_t)nread != expected) {
+		err = (errno ? -errno : -EIO);
+		goto out;
+	}
+
+	memcpy(header, buf, sizeof(*header));
+	be32_to_cpus(&header->magic);
+	be32_to_cpus(&header->version);
+	be64_to_cpus(&header->backing_file_offset);
+	be32_to_cpus(&header->backing_file_size);
+	be32_to_cpus(&header->mtime);
+	be64_to_cpus(&header->size);
+	be32_to_cpus(&header->crypt_method);
+	be64_to_cpus(&header->l1_table_offset);
+
+	err = 0;
+
+out:
+	free(buf);
+	return err;
+}
+
+/*
+ * Allocate s->l1_table and load the L1 table from the image.
+ *
+ * The file is read from offset 0 up to the end of the L1 table (rounded
+ * to 4K) into a temporary buffer, from which the table is copied out.
+ * If a Xen extended header follows the QCOW header (and the table is 4K
+ * aligned), its flags are examined: images whose flags lack
+ * EXTHDR_L1_BIG_ENDIAN have their table byte-swapped and written back
+ * together with the updated flag.  When the stored checksum matches, the
+ * 'extended', 'sparse' and 'min_cluster_alloc' settings are taken from
+ * the extended header.  On return the in-memory table is in host order.
+ *
+ * Returns 0 on success, non-zero (negative errno) on failure, in which
+ * case s->l1_table is NULL.  The temporary buffer is always released.
+ */
+static int
+tdqcow_load_l1_table(struct tdqcow_state *s, QCowHeader *header)
+{
+	char *buf;
+	struct stat st;
+	size_t expected;
+	int i, err, shift;
+	QCowHeader_ext *exthdr;
+	uint32_t l1_table_bytes, l1_table_block, l1_table_size;
+
+	buf = NULL;
+	s->l1_table = NULL;
+
+	shift = s->cluster_bits + s->l2_bits;
+
+	s->l1_size = (header->size + (1LL << shift) - 1) >> shift;
+	s->l1_table_offset = header->l1_table_offset;
+
+	s->min_cluster_alloc = 1; /* default */
+
+	l1_table_bytes = s->l1_size * sizeof(uint64_t);
+	l1_table_size = (l1_table_bytes + 4095) & ~4095;
+	l1_table_block = (l1_table_bytes + s->l1_table_offset + 4095) & ~4095;
+
+	DPRINTF("L1 Table offset detected: %"PRIu64", size %d (%d)\n",
+		(uint64_t)s->l1_table_offset,
+		(int) (s->l1_size * sizeof(uint64_t)),
+		l1_table_size);
+
+	err = fstat(s->fd, &st);
+	if (err) {
+		err = -errno;
+		goto out;
+	}
+
+	err = lseek(s->fd, 0, SEEK_SET);
+	if (err == (off_t)-1) {
+		err = -errno;
+		goto out;
+	}
+
+	err = posix_memalign((void **)&buf, 512, l1_table_block);
+	if (err) {
+		buf = NULL;
+		err = -err;	/* posix_memalign returns a positive errno */
+		goto out;
+	}
+
+	err = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
+	if (err) {
+		s->l1_table = NULL;
+		err = -err;
+		goto out;
+	}
+
+	memset(buf, 0, l1_table_block);
+	memset(s->l1_table, 0, l1_table_size);
+
+	expected = l1_table_block;
+	if (st.st_size < l1_table_block)
+		expected = st.st_size;
+
+	errno = 0;
+	err = read(s->fd, buf, l1_table_block);
+	if (err != expected) {
+		err = (errno ? -errno : -EIO);
+		goto out;
+	}
+
+	/*
+	 * Copy only the table itself.  buf ends at
+	 * round4k(offset + bytes), so copying round4k(bytes) from 'offset'
+	 * would run past its end whenever the offset is not 4K aligned.
+	 * The padding of s->l1_table is already zero.
+	 */
+	memcpy(s->l1_table, buf + s->l1_table_offset, l1_table_bytes);
+	exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
+
+	/* check for xen extended header */
+	if (s->l1_table_offset % 4096 == 0 &&
+	    be32_to_cpu(exthdr->xmagic) == XEN_MAGIC) {
+		uint32_t flags = be32_to_cpu(exthdr->flags);
+		uint32_t cksum = be32_to_cpu(exthdr->cksum);
+
+		/*
+		 * Try to detect old tapdisk images. They have to be fixed
+		 * because they use big endian rather than native endian for
+		 * the L1 table.  After this block, the l1 table will
+		 * definitely be in BIG endian.
+		 */
+		if (!(flags & EXTHDR_L1_BIG_ENDIAN)) {
+			DPRINTF("qcow: converting to big endian L1 table\n");
+
+			/* convert to big endian */
+			for (i = 0; i < s->l1_size; i++)
+				cpu_to_be64s(&s->l1_table[i]);
+
+			flags |= EXTHDR_L1_BIG_ENDIAN;
+			exthdr->flags = cpu_to_be32(flags);
+
+			/* bytes past the table in buf stay as read from disk */
+			memcpy(buf + s->l1_table_offset,
+			       s->l1_table, l1_table_bytes);
+
+			err = lseek(s->fd, 0, SEEK_SET);
+			if (err == (off_t)-1) {
+				err = -errno;
+				goto out;
+			}
+
+			err = atomicio(vwrite, s->fd, buf, l1_table_block);
+			if (err != l1_table_block) {
+				err = -errno;
+				goto out;
+			}
+		}
+
+		/* check the L1 table checksum */
+		if (cksum != gen_cksum((char *)s->l1_table,
+				       s->l1_size * sizeof(uint64_t)))
+			DPRINTF("qcow: bad L1 checksum\n");
+		else {
+			s->extended = 1;
+			s->sparse = (be32_to_cpu(exthdr->flags) & SPARSE_FILE);
+			s->min_cluster_alloc =
+				be32_to_cpu(exthdr->min_cluster_alloc);
+		}
+	}
+
+	/* convert L1 table to native endian for operation */
+	for (i = 0; i < s->l1_size; i++)
+		be64_to_cpus(&s->l1_table[i]);
+
+	err = 0;
+
+out:
+	/* the bounce buffer is only needed during loading */
+	free(buf);
+	if (err) {
+		free(s->l1_table);
+		s->l1_table = NULL;
+	}
+	return err;
+}
+
+/*
+ * Open the disk file and initialize qcow state.
+ *
+ * Opens 'name' with O_DIRECT (read-only when flags == TD_OPEN_RDONLY,
+ * read-write otherwise), validates the QCOW header (magic, version,
+ * size, cluster_bits, crypt method), derives the geometry fields, loads
+ * the L1 table, allocates the L2 cache and the cluster buffers, sets up
+ * the AIO request pool and finally works out where the file currently
+ * ends (s->fd_end).
+ *
+ * Returns 0 on success, -1 on failure.  On failure everything acquired
+ * here is released exactly once and the corresponding pointers are reset
+ * to NULL.
+ */
+int tdqcow_open (td_driver_t *driver, const char *name, td_flag_t flags)
+{
+	int fd, i, ret, size, o_flags;
+	td_disk_info_t *bs = &(driver->info);
+	struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+	QCowHeader header;
+	uint64_t final_cluster = 0;
+
+	DPRINTF("QCOW: Opening %s\n", name);
+
+	/* everything the failure path frees must start out well-defined */
+	s->name = NULL;
+	s->l1_table = NULL;
+	s->l2_cache = NULL;
+	s->cluster_cache = NULL;
+	s->cluster_data = NULL;
+	s->aio_requests = NULL;
+	s->aio_free_list = NULL;
+
+	o_flags = O_DIRECT | O_LARGEFILE |
+		((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+	fd = open(name, o_flags);
+	if (fd < 0) {
+		DPRINTF("Unable to open %s (%d)\n", name, -errno);
+		return -1;
+	}
+
+	s->fd = fd;
+	s->name = strdup(name);
+	if (!s->name)
+		goto fail;
+
+	if (tdqcow_read_header(fd, &header))
+		goto fail;
+
+	if (header.magic != QCOW_MAGIC)
+		goto fail;
+
+	switch (header.version) {
+	case QCOW_VERSION:
+		break;
+	case 2:
+		//TODO: Port qcow2 to new blktap framework.
+		//	close(fd);
+		//	dd->drv = &tapdisk_qcow2;
+		//	return dd->drv->td_open(dd, name, flags);
+		goto fail;
+	default:
+		goto fail;
+	}
+
+	if (header.size <= 1 || header.cluster_bits < 9)
+		goto fail;
+	if (header.crypt_method > QCOW_CRYPT_AES)
+		goto fail;
+	s->crypt_method_header = header.crypt_method;
+	if (s->crypt_method_header)
+		s->encrypted = 1;
+	s->cluster_bits = header.cluster_bits;
+	s->cluster_size = 1 << s->cluster_bits;
+	s->cluster_sectors = 1 << (s->cluster_bits - 9);
+	s->l2_bits = header.l2_bits;
+	s->l2_size = 1 << s->l2_bits;
+	s->cluster_alloc = s->l2_size;
+	bs->size = header.size / 512;
+	s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
+	s->backing_file_offset = header.backing_file_offset;
+	s->backing_file_size   = header.backing_file_size;
+
+	/* allocate and load l1 table */
+	if (tdqcow_load_l1_table(s, &header))
+		goto fail;
+
+	/* alloc L2 cache */
+	size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
+	ret = posix_memalign((void **)&s->l2_cache, 4096, size);
+	if (ret != 0) {
+		s->l2_cache = NULL;
+		goto fail;
+	}
+
+	size = s->cluster_size;
+	ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
+	if (ret != 0) {
+		s->cluster_cache = NULL;
+		goto fail;
+	}
+
+	ret = posix_memalign((void **)&s->cluster_data, 4096, size);
+	if (ret != 0) {
+		s->cluster_data = NULL;
+		goto fail;
+	}
+	s->cluster_cache_offset = -1;
+
+	if (s->backing_file_offset != 0)
+		s->cluster_alloc = 1; /*Cannot use pre-alloc*/
+
+	bs->sector_size = 512;
+	bs->info = 0;
+
+	for(i = 0; i < s->l1_size; i++)
+		if (s->l1_table[i] > final_cluster)
+			final_cluster = s->l1_table[i];
+
+	if (init_aio_state(driver)!=0) {
+		DPRINTF("Unable to initialise AIO state\n");
+		/* released once, on the common failure path below */
+		goto fail;
+	}
+
+	if (!final_cluster)
+		s->fd_end = s->l1_table_offset +
+			((s->l1_size * sizeof(uint64_t) + 4095) & ~4095);
+	else {
+		s->fd_end = lseek(fd, 0, SEEK_END);
+		if (s->fd_end == (off_t)-1)
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	DPRINTF("QCOW Open failed\n");
+
+	free_aio_state(s);
+	s->aio_requests = NULL;
+	s->aio_free_list = NULL;
+
+	free(s->name);
+	free(s->l1_table);
+	free(s->l2_cache);
+	free(s->cluster_cache);
+	free(s->cluster_data);
+	s->name = NULL;
+	s->l1_table = NULL;
+	s->l2_cache = NULL;
+	s->cluster_cache = NULL;
+	s->cluster_data = NULL;
+
+	close(fd);
+	return -1;
+}
+
+/*
+ * Queue a read request, splitting it at cluster boundaries.
+ *
+ * For each piece the cluster is looked up without allocating:
+ *  - unallocated: the piece is forwarded with td_forward_request(); if
+ *    every remaining sector of the request is unallocated too, the whole
+ *    remainder is forwarded in one go and processing stops;
+ *  - compressed: the cluster is decompressed synchronously, the data is
+ *    copied into the caller's buffer and the piece is completed at once
+ *    (-EIO, and processing stops, if decompression fails);
+ *  - otherwise: an asynchronous read of the image file is queued.
+ *
+ * If no AIO slot is free when a piece is examined, the request is
+ * completed with -EBUSY and processing stops.
+ */
+void tdqcow_queue_read(td_driver_t *driver, td_request_t treq)
+{
+	struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+	int index_in_cluster, n, rest_unallocated;
+	uint64_t cluster_offset, sector, nb_sectors, i;
+	td_request_t clone = treq;
+	char* buf = treq.buf;
+
+	sector     = treq.sec;
+	nb_sectors = treq.secs;
+
+	/*We store a local record of the request*/
+	while (nb_sectors > 0) {
+		cluster_offset =
+			get_cluster_offset(s, sector << 9, 0, 0, 0, 0);
+		index_in_cluster = sector & (s->cluster_sectors - 1);
+		n = s->cluster_sectors - index_in_cluster;
+		if (n > nb_sectors)
+			n = nb_sectors;
+
+		if (s->aio_free_count == 0) {
+			td_complete_request(treq, -EBUSY);
+			return;
+		}
+
+		if(!cluster_offset) {
+			/* Forward entire request if possible. */
+			rest_unallocated = 1;
+			for (i = 0; i < nb_sectors; i++) {
+				if (get_cluster_offset(s, (sector+i) << 9,
+						       0, 0, 0, 0)) {
+					rest_unallocated = 0;
+					break;
+				}
+			}
+
+			treq.buf  = buf;
+			treq.sec  = sector;
+			treq.secs = rest_unallocated ? nb_sectors : n;
+			td_forward_request(treq);
+			if (rest_unallocated)
+				return;
+
+		} else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+			if (decompress_cluster(s, cluster_offset) < 0) {
+				td_complete_request(treq, -EIO);
+				return;
+			}
+			memcpy(buf, s->cluster_cache + index_in_cluster * 512,
+			       512 * n);
+
+			treq.buf  = buf;
+			treq.sec  = sector;
+			treq.secs = n;
+			td_complete_request(treq, 0);
+		} else {
+			clone.buf  = buf;
+			clone.sec  = (cluster_offset>>9)+index_in_cluster;
+			clone.secs = n;
+			async_read(driver, clone);
+		}
+		nb_sectors -= n;
+		sector += n;
+		buf += n * 512;
+	}
+}
+
+/*
+ * Queue a write request, splitting it at cluster boundaries.
+ *
+ * For each piece the target cluster is looked up with allocation enabled
+ * (synchronously creating it if needed) and an asynchronous write of the
+ * image file is queued.  If no AIO slot is free the request is completed
+ * with -EBUSY; if no cluster can be obtained it is completed with -EIO.
+ * In both cases processing stops.  The compressed-cluster cache is
+ * invalidated once all pieces have been queued.
+ */
+void tdqcow_queue_write(td_driver_t *driver, td_request_t treq)
+{
+	struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+	int index_in_cluster, n;
+	uint64_t cluster_offset, sector, nb_sectors;
+	char* buf = treq.buf;
+	td_request_t clone=treq;
+
+	sector     = treq.sec;
+	nb_sectors = treq.secs;
+
+	/*We store a local record of the request*/
+	while (nb_sectors > 0) {
+		index_in_cluster = sector & (s->cluster_sectors - 1);
+		n = s->cluster_sectors - index_in_cluster;
+		if (n > nb_sectors)
+			n = nb_sectors;
+
+		if (s->aio_free_count == 0) {
+			td_complete_request(treq, -EBUSY);
+			return;
+		}
+
+		cluster_offset = get_cluster_offset(s, sector << 9, 1, 0,
+						    index_in_cluster,
+						    index_in_cluster+n);
+		if (!cluster_offset) {
+			DPRINTF("Ooops, no write cluster offset!\n");
+			td_complete_request(treq, -EIO);
+			return;
+		}
+
+		if (s->crypt_method) {
+			/*
+			 * NOTE(review): the ciphertext is produced in
+			 * s->cluster_data, but the write queued below still
+			 * uses 'buf' (the plaintext).  Left as found; confirm
+			 * the intended behaviour before enabling encryption.
+			 */
+			encrypt_sectors(s, sector, s->cluster_data,
+					(unsigned char *)buf, n, 1,
+					&s->aes_encrypt_key);
+		}
+
+		clone.buf  = buf;
+		clone.sec  = (cluster_offset>>9) + index_in_cluster;
+		clone.secs = n;
+		async_write(driver, clone);
+
+		nb_sectors -= n;
+		sector += n;
+		buf += n * 512;
+	}
+	s->cluster_cache_offset = -1; /* disable compressed cache */
+}
+
+/*
+ * Recompute the L1 table checksum and store it in the extended header.
+ *
+ * Does nothing (returns 0) for images without the extended header.  The
+ * image is reopened by name without O_DIRECT so that the four checksum
+ * bytes can be written in place.  The checksum is taken over the table in
+ * big-endian form, so the in-memory table is swapped for the computation
+ * and swapped back afterwards.
+ *
+ * Returns 0 on success, otherwise the errno value of the failing call.
+ */
+static int
+tdqcow_update_checksum(struct tdqcow_state *s)
+{
+	int idx, fd, err = 0;
+	uint32_t cksum_off, cksum, be_cksum;
+
+	if (!s->extended)
+		return 0;
+
+	cksum_off = sizeof(QCowHeader) + offsetof(QCowHeader_ext, cksum);
+
+	fd = open(s->name, O_WRONLY | O_LARGEFILE); /* open without O_DIRECT */
+	if (fd == -1) {
+		err = errno;
+	} else if (lseek(fd, cksum_off, SEEK_SET) == (off_t)-1) {
+		err = errno;
+	} else {
+		/* convert to big endian for checksum */
+		for (idx = 0; idx < s->l1_size; idx++)
+			cpu_to_be64s(&s->l1_table[idx]);
+
+		cksum = gen_cksum((char *)s->l1_table,
+				  s->l1_size * sizeof(uint64_t));
+
+		/* and back again... */
+		for (idx = 0; idx < s->l1_size; idx++)
+			be64_to_cpus(&s->l1_table[idx]);
+
+		DPRINTF("Writing cksum: %d", cksum);
+
+		be_cksum = cpu_to_be32(cksum);
+		if (write(fd, &be_cksum, sizeof(be_cksum)) != sizeof(be_cksum))
+			err = errno;
+	}
+
+	if (err)
+		DPRINTF("failed to update checksum: %d\n", err);
+	if (fd != -1)
+		close(fd);
+	return err;
+}
+
+/*
+ * Close the image: refresh the header checksum first (it needs the name
+ * and the L1 table), then release the AIO pool, the name, the tables and
+ * the cluster buffers, and close the file descriptor.  Always returns 0.
+ */
+int tdqcow_close(td_driver_t *driver)
+{
+	struct tdqcow_state *state = driver->data;
+
+	/*Update the hdr cksum*/
+	tdqcow_update_checksum(state);
+
+	free_aio_state(state);
+
+	free(state->name);
+	free(state->l1_table);
+	free(state->l2_cache);
+	free(state->cluster_cache);
+	free(state->cluster_data);
+
+	close(state->fd);
+
+	return 0;
+}
+
+/*
+ * Create a new qcow image file.
+ *
+ * @filename:     path of the image to create (truncated if it exists).
+ * @total_size:   requested virtual disk size; shifted right by
+ *                SECTOR_SHIFT to obtain the sector count.  Ignored when a
+ *                real backing file is given, in which case the size
+ *                reported by get_filesize() for the backing file is used.
+ * @backing_file: optional parent image, or NULL.  The literal "fat:" is
+ *                treated as "no backing file" but still selects the small
+ *                cluster geometry.
+ * @sparse:       0 to pre-extend the file to its full length, non-zero to
+ *                leave it sparse (SPARSE_FILE flag in the extended header).
+ *
+ * Returns 0 on success and -1 on any failure.  The descriptor is closed
+ * on every path.
+ */
+int qcow_create(const char *filename, uint64_t total_size,
+		const char *backing_file, int sparse)
+{
+	int fd, header_size, backing_filename_len, l1_size, i;
+	int shift, length, flags = 0, rc = -1;
+	QCowHeader header;
+	QCowHeader_ext exthdr;
+	char backing_filename[PATH_MAX], *ptr;
+	uint64_t tmp, size, total_length;
+	struct stat st;
+
+	DPRINTF("Qcow_create: size %"PRIu64"\n",total_size);
+
+	fd = open(filename,
+		  O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+		  0644);
+	if (fd < 0)
+		return -1;
+
+	memset(&header, 0, sizeof(header));
+	header.magic = cpu_to_be32(QCOW_MAGIC);
+	header.version = cpu_to_be32(QCOW_VERSION);
+
+	/* Create extended header fields.  Zero it first so that no
+	 * uninitialised stack bytes end up in the image file. */
+	memset(&exthdr, 0, sizeof(exthdr));
+	exthdr.xmagic = cpu_to_be32(XEN_MAGIC);
+
+	/* st is only filled on some paths below; make st_mtime well defined
+	 * on the others. */
+	memset(&st, 0, sizeof(st));
+
+	header_size = sizeof(header) + sizeof(QCowHeader_ext);
+	backing_filename_len = 0;
+	size = (total_size >> SECTOR_SHIFT);
+	if (backing_file) {
+		if (strcmp(backing_file, "fat:")) {
+			const char *p;
+			/* XXX: this is a hack: we do not attempt to
+			 * check for URL like syntax */
+			p = strchr(backing_file, ':');
+			if (p && (p - backing_file) >= 2) {
+				/* URL like but exclude "c:" like filenames.
+				 * snprintf always NUL-terminates; refuse
+				 * names that do not fit instead of
+				 * silently truncating them. */
+				int n = snprintf(backing_filename,
+						 sizeof(backing_filename),
+						 "%s", backing_file);
+				if (n < 0 ||
+				    (size_t)n >= sizeof(backing_filename))
+					goto out;
+			} else {
+				if (realpath(backing_file, backing_filename) == NULL ||
+				    stat(backing_filename, &st) != 0)
+					goto out;
+			}
+			header.backing_file_offset = cpu_to_be64(header_size);
+			backing_filename_len = strlen(backing_filename);
+			header.backing_file_size = cpu_to_be32(
+				backing_filename_len);
+			header_size += backing_filename_len;
+
+			/*Set to the backing file size*/
+			if (get_filesize(backing_filename, &size, &st))
+				goto out;
+			DPRINTF("Backing file size detected: %"PRId64" sectors"
+				"(total %"PRId64" [%"PRId64" MB])\n",
+				size,
+				(uint64_t)(size << SECTOR_SHIFT),
+				(uint64_t)(size >> 11));
+		} else {
+			backing_file = NULL;
+			DPRINTF("Setting file size: %"PRId64" (total %"PRId64")\n",
+				total_size,
+				(uint64_t) (total_size << SECTOR_SHIFT));
+		}
+		header.mtime = cpu_to_be32(st.st_mtime);
+		header.cluster_bits = 9; /* 512 byte cluster to avoid copying
+					    unmodified sectors */
+		header.l2_bits = 12; /* 32 KB L2 tables */
+		exthdr.min_cluster_alloc = cpu_to_be32(1);
+	} else {
+		DPRINTF("Setting file size: %"PRId64" sectors"
+			"(total %"PRId64" [%"PRId64" MB])\n",
+			size,
+			(uint64_t) (size << SECTOR_SHIFT),
+			(uint64_t) (size >> 11));
+		header.cluster_bits = 12; /* 4 KB clusters */
+		header.l2_bits = 9; /* 4 KB L2 tables */
+		exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
+	}
+	/*Set the header size value*/
+	header.size = cpu_to_be64(size * 512);
+
+	/* Round the header up to 8 bytes, then to a 4 KB boundary. */
+	header_size = (header_size + 7) & ~7;
+	if (header_size % 4096 > 0) {
+		header_size = ((header_size >> 12) + 1) << 12;
+	}
+
+	shift = header.cluster_bits + header.l2_bits;
+	l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;
+
+	header.l1_table_offset = cpu_to_be64(header_size);
+	DPRINTF("L1 Table offset: %d, size %d\n",
+		header_size,
+		(int)(l1_size * sizeof(uint64_t)));
+	header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+
+	/* Checksum of an all-zero L1 table of the final size. */
+	ptr = calloc(1, l1_size * sizeof(uint64_t));
+	if (!ptr && l1_size)
+		goto out;
+	exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
+	printf("Created cksum: %d\n",exthdr.cksum);
+	free(ptr);
+
+	/*adjust file length to system page size boundary*/
+	length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)),
+			 getpagesize());
+	if (qtruncate(fd, length, 0)!=0) {
+		DPRINTF("ERROR truncating file\n");
+		goto out;
+	}
+
+	if (sparse == 0) {
+		/*Filesize is length+l1_size*(1 << s->l2_bits)+(size*512)*/
+		total_length = length + (l1_size * (1 << 9)) + (size * 512);
+		if (qtruncate(fd, total_length, 0)!=0) {
+			DPRINTF("ERROR truncating file\n");
+			goto out;
+		}
+		printf("File truncated to length %"PRIu64"\n",total_length);
+	} else
+		flags = SPARSE_FILE;
+
+	flags |= EXTHDR_L1_BIG_ENDIAN;
+	exthdr.flags = cpu_to_be32(flags);
+
+	/* write all the data; any short or failed write is an error */
+	if (lseek(fd, 0, SEEK_SET) == (off_t)-1)
+		goto out;
+	if (write(fd, &header, sizeof(header)) != (ssize_t)sizeof(header))
+		goto out;
+	if (write(fd, &exthdr, sizeof(exthdr)) != (ssize_t)sizeof(exthdr))
+		goto out;
+	if (backing_file &&
+	    write(fd, backing_filename, backing_filename_len) !=
+	    backing_filename_len)
+		goto out;
+
+	/* empty L1 table, one zero entry at a time */
+	if (lseek(fd, header_size, SEEK_SET) == (off_t)-1)
+		goto out;
+	tmp = 0;
+	for (i = 0; i < l1_size; i++) {
+		if (write(fd, &tmp, sizeof(tmp)) != (ssize_t)sizeof(tmp))
+			goto out;
+	}
+
+	rc = 0;
+
+out:
+	close(fd);
+	return rc;
+}
+
+/*
+ * Reset an image to the empty state: zero the in-memory L1 table, write
+ * it back at l1_table_offset, truncate the file right after the table and
+ * clear the L2 cache bookkeeping.
+ *
+ * Returns 0 on success, -1 if seeking, writing the full table or
+ * truncating fails.
+ */
+static int qcow_make_empty(struct tdqcow_state *s)
+{
+	uint32_t l1_length = s->l1_size * sizeof(uint64_t);
+
+	memset(s->l1_table, 0, l1_length);
+	if (lseek(s->fd, s->l1_table_offset, SEEK_SET) == (off_t)-1)
+		return -1;
+	/* A short write would leave stale L1 entries on disk, so it is
+	 * treated as a failure just like an outright error. */
+	if (write(s->fd, s->l1_table, l1_length) != (ssize_t)l1_length)
+		return -1;
+	if (qtruncate(s->fd, s->l1_table_offset + l1_length, s->sparse)!=0) {
+		DPRINTF("ERROR truncating file\n");
+		return -1;
+	}
+
+	memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+	memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
+	memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
+
+	return 0;
+}
+
+/* Accessor: hand back the cluster_size recorded in the image state. */
+static int qcow_get_cluster_size(struct tdqcow_state *s)
+{
+	const int cluster_size = s->cluster_size;
+
+	return cluster_size;
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+   tables to avoid losing bytes in alignment */
+/*
+ * Deflate one cluster taken from @buf and, if the result is smaller than
+ * a cluster, write it at the offset returned by get_cluster_offset() for
+ * @sector_num.
+ *
+ * Returns 0 when the compressed data was written, and also when the data
+ * could not be compressed (in that case nothing is written: the fallback
+ * to a plain cluster write is not implemented here).  Returns -1 on
+ * allocation, zlib, seek or write failure.
+ */
+static int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num,
+				 const uint8_t *buf)
+{
+	z_stream strm;
+	int ret, out_len, rc = -1;
+	uint8_t *out_buf;
+	uint64_t cluster_offset;
+
+	out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+	if (!out_buf)
+		return -1;
+
+	/* default compression level, raw deflate stream (negative
+	 * windowBits: no zlib header) with a 2^12 byte window, memLevel 9 */
+	memset(&strm, 0, sizeof(strm));
+	ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+			   Z_DEFLATED, -12,
+			   9, Z_DEFAULT_STRATEGY);
+	if (ret != 0)
+		goto out;
+
+	strm.avail_in = s->cluster_size;
+	strm.next_in = (uint8_t *)buf;
+	strm.avail_out = s->cluster_size;
+	strm.next_out = out_buf;
+
+	ret = deflate(&strm, Z_FINISH);
+	out_len = strm.next_out - out_buf;
+	deflateEnd(&strm);
+
+	if (ret != Z_STREAM_END && ret != Z_OK)
+		goto out;
+
+	/* Only a finished stream that is strictly smaller than a cluster is
+	 * worth storing; otherwise fall through and report success without
+	 * writing anything. */
+	if (ret == Z_STREAM_END && out_len < s->cluster_size) {
+		cluster_offset = get_cluster_offset(s, sector_num << 9, 2,
+						    out_len, 0, 0);
+		cluster_offset &= s->cluster_offset_mask;
+		/* Do not write at whatever position the descriptor happens
+		 * to be at if the seek failed. */
+		if (lseek(s->fd, cluster_offset, SEEK_SET) == (off_t)-1)
+			goto out;
+		if (write(s->fd, out_buf, out_len) != out_len)
+			goto out;
+	}
+
+	rc = 0;
+
+out:
+	free(out_buf);
+	return rc;
+}
+
+/*
+ * Probe @file and report through @type whether it starts with a qcow
+ * header (DISK_TYPE_QCOW) or not (DISK_TYPE_AIO).
+ *
+ * Returns 0 on success, -errno if the file cannot be opened or read, and
+ * -EIO if it is shorter than a qcow header.
+ */
+static int
+tdqcow_get_image_type(const char *file, int *type)
+{
+	int fd, err;
+	ssize_t size;
+	QCowHeader header;
+
+	fd = open(file, O_RDONLY);
+	if (fd == -1)
+		return -errno;
+
+	size = read(fd, &header, sizeof(header));
+	/* Decide on the error code before close() gets a chance to change
+	 * errno; a short read sets no errno at all, so map it to -EIO. */
+	err = (size < 0) ? -errno : -EIO;
+	close(fd);
+	if (size != (ssize_t)sizeof(header))
+		return err;
+
+	be32_to_cpus(&header.magic);
+	if (header.magic == QCOW_MAGIC)
+		*type = DISK_TYPE_QCOW;
+	else
+		*type = DISK_TYPE_AIO;
+
+	return 0;
+}
+
+/*
+ * Read the backing file name stored in the child image and fill in @id
+ * with a heap-allocated copy of the name (id->name, owned by the caller)
+ * and the driver type detected by tdqcow_get_image_type().
+ *
+ * Returns TD_NO_PARENT if the image has no backing file, 0 on success,
+ * -1 if the sector buffer cannot be allocated, -ENOMEM if the name cannot
+ * be duplicated and -EINVAL on any seek, read or probe failure.
+ */
+int tdqcow_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+	off_t off;
+	char *buf, *filename;
+	int len, secs, type = 0, err = -EINVAL;
+	struct tdqcow_state *child = (struct tdqcow_state *)driver->data;
+
+	if (!child->backing_file_offset)
+		return TD_NO_PARENT;
+
+	/* read the backing file name: whole 512-byte sectors starting at
+	 * the sector boundary at or below backing_file_offset */
+	len = child->backing_file_size;
+	off = child->backing_file_offset - (child->backing_file_offset % 512);
+	secs = (len + (child->backing_file_offset - off) + 511) >> 9;
+
+	/* One byte more than is read: when the name ends exactly on a
+	 * sector boundary the terminating NUL lands at index secs << 9. */
+	if (posix_memalign((void **)&buf, 512, (secs << 9) + 1))
+		return -1;
+
+	if (lseek(child->fd, off, SEEK_SET) == (off_t)-1)
+		goto out;
+
+	if (read(child->fd, buf, secs << 9) != secs << 9)
+		goto out;
+	filename = buf + (child->backing_file_offset - off);
+	filename[len] = '\0';
+
+	if (tdqcow_get_image_type(filename, &type))
+		goto out;
+
+	id->name = strdup(filename);
+	if (!id->name) {
+		err = -ENOMEM;
+		goto out;
+	}
+	id->drivertype = type;
+	err = 0;
+ out:
+	free(buf);
+	return err;
+}
+
+/*
+ * Accept @pdriver as the parent of @driver only if both image files can
+ * be stat()ed and get_filesize() reports the same size for both.
+ * Returns 0 when they match and -EINVAL otherwise.  @flags is not used.
+ */
+int tdqcow_validate_parent(td_driver_t *driver,
+			   td_driver_t *pdriver, td_flag_t flags)
+{
+	struct tdqcow_state *child = (struct tdqcow_state *)driver->data;
+	struct tdqcow_state *parent = (struct tdqcow_state *)pdriver->data;
+	uint64_t parent_size, child_size;
+	struct stat st;
+
+	/* parent first, then child; each needs stat() before get_filesize() */
+	if (stat(parent->name, &st) ||
+	    get_filesize(parent->name, &parent_size, &st))
+		return -EINVAL;
+
+	if (stat(child->name, &st) ||
+	    get_filesize(child->name, &child_size, &st))
+		return -EINVAL;
+
+	return (child_size == parent_size) ? 0 : -EINVAL;
+}
+
+struct tap_disk tapdisk_qcow = { /* driver descriptor exposing the tdqcow_* entry points */
+ .disk_type = "tapdisk_qcow",
+ .flags = 0,
+ .private_data_size = sizeof(struct tdqcow_state), /* size of the state the entry points read from driver->data */
+ .td_open = tdqcow_open,
+ .td_close = tdqcow_close,
+ .td_queue_read = tdqcow_queue_read,
+ .td_queue_write = tdqcow_queue_write,
+ .td_get_parent_id = tdqcow_get_parent_id,
+ .td_validate_parent = tdqcow_validate_parent,
+ .td_debug = NULL, /* no debug hook */
+};
--- /dev/null
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+
+#include "blk.h"
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+char *img; /* in-memory copy of the disk image; allocated and filled by tdram_open() */
+long int disksector_size; /* info->sector_size cached by get_image_info() */
+long int disksize; /* info->size cached by get_image_info() */
+long int diskinfo; /* info->info cached by get_image_info() */
+static int connections = 0; /* incremented by tdram_open(), decremented by tdram_close() */
+
+struct tdram_state { /* per-driver private data (driver->data) */
+ int fd; /* descriptor of the image file; tdram_open() stores -1 when the image was already loaded */
+};
+
+/*
+ * Fill @info with the size and sector size of the image behind @fd.
+ *
+ * Block devices are queried through blk_getimagesize() and
+ * blk_getsectorsize(); anything else is sized from fstat().  A size of
+ * zero is replaced by MAX_RAMDISK_SIZE.  The resulting values are also
+ * cached in the file-scope disksector_size/disksize/diskinfo variables.
+ *
+ * Returns 0 on success and -EINVAL if fstat() or blk_getimagesize() fails.
+ */
+static int get_image_info(int fd, td_disk_info_t *info)
+{
+	struct stat sb;
+
+	if (fstat(fd, &sb) != 0) {
+		DPRINTF("ERROR: fstat failed, Couldn't stat image");
+		return -EINVAL;
+	}
+
+	if (!S_ISBLK(sb.st_mode)) {
+		/* Not a block device: derive the size from the file length */
+		info->size = (sb.st_size >> SECTOR_SHIFT);
+		info->sector_size = DEFAULT_SECTOR_SIZE;
+		DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost "
+			"sector_shift [%llu]\n",
+			(long long unsigned)(info->size << SECTOR_SHIFT),
+			(long long unsigned)info->size);
+	} else {
+		/* Block device: ask the device itself */
+		info->size = 0;
+		if (blk_getimagesize(fd, &info->size) != 0)
+			return -EINVAL;
+
+		DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost "
+			"sector_shift [%llu]\n",
+			(long long unsigned)(info->size << SECTOR_SHIFT),
+			(long long unsigned)info->size);
+
+		/* fall back to the default when the sector size is unknown */
+		if (blk_getsectorsize(fd, &info->sector_size) != 0)
+			info->sector_size = DEFAULT_SECTOR_SIZE;
+	}
+
+	/* An empty image gets the maximum ramdisk size */
+	if (info->size == 0) {
+		info->size =((uint64_t) MAX_RAMDISK_SIZE);
+		info->sector_size = DEFAULT_SECTOR_SIZE;
+	}
+	info->info = 0;
+
+	/* Remember the geometry for connections that open the disk later */
+	disksector_size = info->sector_size;
+	disksize = info->size;
+	diskinfo = info->info;
+	DPRINTF("Image sector_size: \n\t[%"PRIu64"]\n",
+		info->sector_size);
+
+	return 0;
+}
+
+/*
+ * Open the disk file and initialize ram state.
+ *
+ * The first connection opens @name (with O_DIRECT when the filesystem
+ * accepts it), determines the geometry via get_image_info() and reads the
+ * whole image into the global img buffer.  Later connections, while the
+ * first is still open, only copy the cached geometry into driver->info and
+ * get prv->fd = -1.
+ *
+ * Returns 0 on success or a negative errno-style value.  On failure the
+ * descriptor is closed, the image buffer is released and the connection
+ * count is rolled back, so that a later open starts from scratch instead
+ * of being mistaken for an additional connection to a loaded image.
+ */
+int tdram_open (td_driver_t *driver, const char *name, td_flag_t flags)
+{
+	uint64_t total, done = 0;
+	int fd = -1, ret, o_flags;
+	struct tdram_state *prv = (struct tdram_state *)driver->data;
+
+	connections++;
+
+	if (connections > 1) {
+		driver->info.sector_size = disksector_size;
+		driver->info.size = disksize;
+		driver->info.info = diskinfo;
+		DPRINTF("Image already open, returning parameters:\n");
+		DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost "
+			"sector_shift [%llu]\n",
+			(long long unsigned)(driver->info.size << SECTOR_SHIFT),
+			(long long unsigned)driver->info.size);
+		DPRINTF("Image sector_size: \n\t[%"PRIu64"]\n",
+			driver->info.sector_size);
+
+		prv->fd = -1;
+		return 0;
+	}
+
+	prv->fd = -1;
+
+	/* No connection is using a buffer left over from an earlier
+	 * session; release it before loading the image again. */
+	free(img);
+	img = NULL;
+
+	/* Open the file */
+	o_flags = O_DIRECT | O_LARGEFILE |
+		((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+	fd = open(name, o_flags);
+
+	if ((fd == -1) && (errno == EINVAL)) {
+
+		/* Maybe O_DIRECT isn't supported. */
+		o_flags &= ~O_DIRECT;
+		fd = open(name, o_flags);
+		if (fd != -1) DPRINTF("WARNING: Accessing image without"
+				      "O_DIRECT! (%s)\n", name);
+
+	} else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name);
+
+	if (fd == -1) {
+		/* save errno before logging can overwrite it */
+		ret = -errno;
+		DPRINTF("Unable to open [%s]!\n",name);
+		goto fail;
+	}
+
+	prv->fd = fd;
+
+	ret = get_image_info(fd, &driver->info);
+	if (ret)
+		goto fail;
+
+	if (driver->info.size > (uint64_t)MAX_RAMDISK_SIZE) {
+		DPRINTF("Disk exceeds limit, must be less than [%d]MB",
+			(MAX_RAMDISK_SIZE<<SECTOR_SHIFT)>>20);
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	if (driver->info.sector_size == 0) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	/*Read the image into memory*/
+	total = (uint64_t)driver->info.size << SECTOR_SHIFT;
+	/* posix_memalign() reports failure through its return value, not
+	 * through errno */
+	ret = posix_memalign((void **)&img, DEFAULT_SECTOR_SIZE, total);
+	if (ret) {
+		DPRINTF("Mem malloc failed\n");
+		img = NULL;
+		ret = -ret;
+		goto fail;
+	}
+
+	DPRINTF("Reading %llu bytes.......",
+		(long long unsigned)total);
+
+	/* Read one device sector at a time, but never past the end of the
+	 * buffer, which is sized in SECTOR_SHIFT units. */
+	while (done < total) {
+		uint64_t want = total - done;
+		ssize_t got;
+
+		if (want > (uint64_t)driver->info.sector_size)
+			want = driver->info.sector_size;
+
+		got = read(fd, img + done, want);
+		if (got != (ssize_t)want) {
+			int saved_errno = errno;
+
+			DPRINTF("ret = %d, errno = %d\n", (int)got, saved_errno);
+			ret = (got < 0) ? -saved_errno : -EIO;
+			goto fail;
+		}
+		done += got;
+	}
+	DPRINTF("[%llu]\n", (long long unsigned)done);
+
+	return 0;
+
+fail:
+	if (fd != -1)
+		close(fd);
+	prv->fd = -1;
+	free(img);
+	img = NULL;
+	connections--;
+	return ret;
+}
+
+/*
+ * Serve a read straight from the in-memory image: copy treq.secs sectors
+ * starting at sector treq.sec into treq.buf and complete the request with
+ * status 0.
+ */
+void tdram_queue_read(td_driver_t *driver, td_request_t treq)
+{
+	uint64_t byte_offset = treq.sec * (uint64_t)driver->info.sector_size;
+	int nbytes = treq.secs * driver->info.sector_size;
+
+	memcpy(treq.buf, img + byte_offset, nbytes);
+
+	td_complete_request(treq, 0);
+}
+
+/*
+ * Apply a write to the in-memory image: copy treq.secs sectors from
+ * treq.buf to sector treq.sec and complete the request with status 0.
+ *
+ * No locking is done here; write access for multiple disks is assumed to
+ * be controlled at a higher level.
+ */
+void tdram_queue_write(td_driver_t *driver, td_request_t treq)
+{
+	uint64_t byte_offset = treq.sec * (uint64_t)driver->info.sector_size;
+	int nbytes = treq.secs * driver->info.sector_size;
+
+	memcpy(img + byte_offset, treq.buf, nbytes);
+
+	td_complete_request(treq, 0);
+}
+
+/*
+ * Drop one connection to the RAM disk.
+ *
+ * The descriptor held by this connection (if any) is closed, and when the
+ * last connection goes away the in-memory image is released; the next
+ * tdram_open() reloads it from the file.  Always returns 0.
+ */
+int tdram_close(td_driver_t *driver)
+{
+	struct tdram_state *prv = (struct tdram_state *)driver->data;
+
+	/* Only the connection that loaded the image holds a descriptor;
+	 * the others carry -1. */
+	if (prv->fd != -1) {
+		close(prv->fd);
+		prv->fd = -1;
+	}
+
+	connections--;
+
+	if (connections == 0) {
+		free(img);
+		img = NULL;
+	}
+
+	return 0;
+}
+
+int tdram_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+ return TD_NO_PARENT; /* unconditional: this driver never reports a parent image; both arguments are ignored */
+}
+
+int tdram_validate_parent(td_driver_t *driver,
+ td_driver_t *pdriver, td_flag_t flags)
+{
+ return -EINVAL; /* unconditional: every proposed parent is rejected; all arguments are ignored */
+}
+
+struct tap_disk tapdisk_ram = { /* driver descriptor exposing the tdram_* entry points */
+ .disk_type = "tapdisk_ram",
+ .flags = 0,
+ .private_data_size = sizeof(struct tdram_state), /* size of the state the entry points read from driver->data */
+ .td_open = tdram_open,
+ .td_close = tdram_close,
+ .td_queue_read = tdram_queue_read,
+ .td_queue_write = tdram_queue_write,
+ .td_get_parent_id = tdram_get_parent_id,
+ .td_validate_parent = tdram_validate_parent,
+ .td_debug = NULL, /* no debug hook */
+};
--- /dev/null
+/* block-remus.c
+ *
+ * This disk sends all writes to a backup via a network interface before
+ * passing them to an underlying device.
+ * The backup is a bit more complicated:
+ * 1. It applies all incoming writes to a ramdisk.
+ * 2. When a checkpoint request arrives, it moves the ramdisk to
+ * a committing state and uses a new ramdisk for subsequent writes.
+ * It also acknowledges the request, to let the sender know it can
+ * release output.
+ * 3. The ramdisk flushes its contents to the underlying driver.
+ * 4. At failover, the backup waits for the in-flight ramdisk (if any) to
+ * drain before letting the domain be activated.
+ *
+ * The driver determines whether it is the client or server by attempting
+ * to bind to the replication address. If the address is not local,
+ * the driver acts as client.
+ *
+ * The following messages are defined for the replication stream:
+ * 1. write request
+ * "wreq" 4
+ * num_sectors 4
+ * sector 8
+ * buffer (num_sectors * sector_size)
+ * 2. submit request (may be used as a barrier)
+ * "sreq" 4
+ * 3. commit request
+ * "creq" 4
+ * After a commit request, the client must wait for a completion message:
+ * 4. completion
+ * "done" 4
+ */
+
+/* due to architectural choices in tapdisk, block-buffer is forced to
+ * reimplement some code which is meant to be private */
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "hashtable.h"
+#include "hashtable_itr.h"
+#include "hashtable_utility.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+/* timeout for reads and writes in ms */
+#define HEARTBEAT_MS 1000
+#define RAMDISK_HASHSIZE 128 /* first argument given to create_hashtable() for every ramdisk table */
+
+/* connect retry timeout (seconds) */
+#define REMUS_CONNRETRY_TIMEOUT 10
+
+#define RPRINTF(_f, _a...) syslog (LOG_DEBUG, "remus: " _f, ## _a) /* LOG_DEBUG syslog message prefixed with "remus: " */
+
+enum tdremus_mode { /* operating mode of the driver; stored in tdremus_state.mode */
+ mode_invalid = 0, /* the zero value, so a zero-filled state has no valid mode */
+ mode_unprotected, /* presumably running without replication - TODO confirm against switch_mode() */
+ mode_primary, /* presumably the sending (client) side - TODO confirm */
+ mode_backup /* presumably the receiving (server) side - TODO confirm */
+};
+
+struct tdremus_req { /* one element of req_ring.requests */
+ uint64_t sector; /* presumably the first sector of the request - TODO confirm against the ring users */
+ int nb_sectors; /* presumably the number of sectors held in buf - TODO confirm */
+ char buf[4096]; /* fixed 4096-byte payload area */
+};
+
+struct req_ring { /* fixed-size ring of requests; positions advance with ring_next() */
+ /* waste one slot to distinguish between empty and full */
+ struct tdremus_req requests[MAX_REQUESTS * 2 + 1];
+ unsigned int head; /* ring is empty when head == tail (ring_isempty) */
+ unsigned int tail; /* ring is full when ring_next(tail) == head (ring_isfull) */
+};
+
+/* TODO: This isn't very pretty, but to properly generate our own treqs (needed
+ * by the backup) we need to know our td_vbd_t and td_image_t (blktap2
+ * internals). As a proper fix, we should consider extending the tapdisk
+ * interface with a td_create_request() function, or something similar.
+ *
+ * For now, we just grab the vbd in the td_open() command, and the td_image_t
+ * from the first read request.
+ */
+td_vbd_t *device_vbd = NULL; /* create_write_request() queues its requests on device_vbd->pending_requests */
+td_image_t *remus_image = NULL; /* copied into treq.image by create_write_request() */
+struct tap_disk tapdisk_remus; /* declared here without an initializer; the initializer is not in this part of the file */
+
+struct ramdisk {
+ size_t sector_size; /* bytes per sector; set from driver->info.sector_size in ramdisk_start() */
+ struct hashtable* h; /* table ramdisk_write() stores into: sector number -> malloc'ed sector copy */
+ /* when a ramdisk is flushed, h is given a new empty hash for writes
+ * while the old ramdisk (prev) is drained asynchronously.
+ */
+ struct hashtable* prev;
+ /* count of outstanding requests to the base driver */
+ size_t inflight;
+ /* prev holds the requests to be flushed, while inprogress holds
+ * requests being flushed. When requests complete, they are removed
+ * from inprogress.
+ * Whenever a new flush is merged with ongoing flush (i.e, prev),
+ * we have to make sure that none of the new requests overlap with
+ * ones in "inprogress". If it does, keep it back in prev and dont issue
+ * IO until the current one finishes. If we allow this IO to proceed,
+ * we might end up with two "overlapping" requests in the disk's queue and
+ * the disk may not offer any guarantee on which one is written first.
+ * IOW, make sure we dont create a write-after-write time ordering constraint.
+ *
+ */
+ struct hashtable* inprogress;
+};
+
+/* the ramdisk intercepts the original callback for reads and writes.
+ * This holds the original data. */
+/* Might be worth making this a static array in struct ramdisk to avoid
+ * a malloc per request */
+
+struct tdremus_state;
+
+struct ramdisk_cbdata { /* NOTE(review): no user of this struct is visible in this part of the file */
+ td_callback_t cb; /* presumably the intercepted original callback - TODO confirm */
+ void* private; /* presumably the original private pointer - TODO confirm */
+ char* buf; /* presumably the original data buffer - TODO confirm */
+ struct tdremus_state* state;
+};
+
+struct ramdisk_write_cbdata { /* NOTE(review): no user of this struct is visible in this part of the file */
+ struct tdremus_state* state;
+ char* buf; /* presumably the buffer of the write being tracked - TODO confirm */
+};
+
+typedef void (*queue_rw_t) (td_driver_t *driver, td_request_t treq);
+
+/* poll_fd type for blktap2 fd system. taken from block_log.c */
+typedef struct poll_fd { /* a descriptor together with its registered event id */
+ int fd; /* close_stream_fd() stores -2 here after closing the stream */
+ event_id_t id; /* handed to tapdisk_server_unregister_event() on teardown */
+} poll_fd_t;
+
+struct tdremus_state { /* per-driver state; the functions below obtain it from driver->data */
+// struct tap_disk* driver;
+ void* driver_data;
+
+ /* XXX: this is needed so that the server can perform operations on
+ * the driver from the stream_fd event handler. fix this. */
+ td_driver_t *tdremus_driver;
+
+ /* TODO: we may wish to replace these two FIFOs with a unix socket */
+ char* ctl_path; /* receive flush instruction here */
+ poll_fd_t ctl_fd; /* io_fd slot for control FIFO */
+ char* msg_path; /* output completion message here */
+ poll_fd_t msg_fd;
+
+ /* replication host */
+ struct sockaddr_in sa;
+ poll_fd_t server_fd; /* server listen port */
+ poll_fd_t stream_fd; /* replication channel */
+
+ /* queue write requests, batch-replicate at submit */
+ struct req_ring write_ring;
+
+ /* ramdisk data*/
+ struct ramdisk ramdisk;
+
+ /* mode methods */
+ enum tdremus_mode mode;
+ int (*queue_flush)(td_driver_t *driver); /* flush routine used in the current mode */
+};
+
+typedef struct tdremus_wire { /* presumably a request header for the replication stream - TODO confirm against the send/receive code */
+ uint32_t op;
+ uint64_t id;
+ uint64_t sec; /* presumably the starting sector - TODO confirm */
+ uint32_t secs; /* presumably the sector count - TODO confirm */
+} tdremus_wire_t;
+
+#define TDREMUS_READ "rreq" /* four-character message tags; the header comment of this file describes wreq, sreq, creq and done */
+#define TDREMUS_WRITE "wreq"
+#define TDREMUS_SUBMIT "sreq"
+#define TDREMUS_COMMIT "creq"
+#define TDREMUS_DONE "done"
+#define TDREMUS_FAIL "fail"
+
+/* primary read/write functions */
+static void primary_queue_read(td_driver_t *driver, td_request_t treq);
+static void primary_queue_write(td_driver_t *driver, td_request_t treq);
+
+/* backup read/write functions */
+static void backup_queue_read(td_driver_t *driver, td_request_t treq);
+static void backup_queue_write(td_driver_t *driver, td_request_t treq);
+
+/* unprotected read/write functions */
+static void unprotected_queue_read(td_driver_t *driver, td_request_t treq);
+static void unprotected_queue_write(td_driver_t *driver, td_request_t treq);
+
+static int tdremus_close(td_driver_t *driver);
+
+static int switch_mode(td_driver_t *driver, enum tdremus_mode mode);
+static int ctl_respond(struct tdremus_state *s, const char *response);
+
+/* ring functions */
+/*
+ * Position following @pos in a ring of MAX_REQUESTS * 2 + 1 slots,
+ * wrapping to 0 past the last slot.  @ring itself is not consulted.
+ */
+static inline unsigned int ring_next(struct req_ring* ring, unsigned int pos)
+{
+	const unsigned int successor = pos + 1;
+
+	return (successor >= MAX_REQUESTS * 2 + 1) ? 0 : successor;
+}
+
+/* Non-zero when the ring holds no requests, i.e. tail has caught up with head. */
+static inline int ring_isempty(struct req_ring* ring)
+{
+	return ring->tail == ring->head;
+}
+
+/* Non-zero when advancing tail by one slot would make it collide with head. */
+static inline int ring_isfull(struct req_ring* ring)
+{
+	const unsigned int after_tail = ring_next(ring, ring->tail);
+
+	return after_tail == ring->head;
+}
+/* Prototype declarations */
+static int ramdisk_flush(td_driver_t *driver, struct tdremus_state* s);
+
+/* functions to create and submit treq's */
+
+/*
+ * Completion callback for the writes issued by create_write_request().
+ *
+ * A failed write terminates the process.  On success the request
+ * wrapper is unlinked and freed, the in-flight counter is decremented,
+ * every sector of the request is removed from the inprogress table and
+ * the data buffer is released.
+ */
+static void
+replicated_write_callback(td_request_t treq, int err)
+{
+	struct tdremus_state *state = (struct tdremus_state *) treq.cb_data;
+	td_vbd_request_t *vreq = (td_vbd_request_t *) treq.private;
+	uint64_t sector;
+	int done;
+
+	/* the write failed for now, lets panic. this is very bad */
+	if (err) {
+		RPRINTF("ramdisk write failed, disk image is not consistent\n");
+		exit(-1);
+	}
+
+	/* Success: take the vreq off whatever request list it is on and
+	 * release it. */
+	list_del(&vreq->next);
+	free(vreq);
+
+	state->ramdisk.inflight--;
+
+	/* These sectors are no longer being flushed. */
+	sector = treq.sec;
+	done = 0;
+	while (done < treq.secs) {
+		hashtable_remove(state->ramdisk.inprogress, &sector);
+		sector++;
+		done++;
+	}
+	free(treq.buf);
+
+	if (!state->ramdisk.inflight && !state->ramdisk.prev) {
+		/* TODO: the ramdisk has been flushed */
+	}
+}
+
+/*
+ * Build a write request for @secs sectors starting at @sec with payload
+ * @buf, queue its wrapper on device_vbd->pending_requests and forward it
+ * down the driver chain.  Completion is reported to
+ * replicated_write_callback(), which frees both the wrapper and @buf.
+ *
+ * Returns 0 once the request has been forwarded and -1 if the wrapper
+ * cannot be allocated; in that case nothing was forwarded and @buf still
+ * belongs to the caller.
+ */
+static inline int
+create_write_request(struct tdremus_state *state, td_sector_t sec, int secs, char *buf)
+{
+	td_request_t treq;
+	td_vbd_request_t *vreq;
+
+	vreq = calloc(1, sizeof(td_vbd_request_t));
+	if (!vreq)
+		return -1;
+
+	/* treq is handed on by value: make sure any member that is not set
+	 * explicitly below is zero rather than indeterminate. */
+	memset(&treq, 0, sizeof(treq));
+	treq.op = TD_OP_WRITE;
+	treq.buf = buf;
+	treq.sec = sec;
+	treq.secs = secs;
+	treq.image = remus_image;
+	treq.cb = replicated_write_callback;
+	treq.cb_data = state;
+	treq.id = 0;
+	treq.sidx = 0;
+	treq.private = vreq;
+
+	/* submitting is raised while the request is being forwarded and
+	 * dropped again afterwards */
+	vreq->submitting = 1;
+	INIT_LIST_HEAD(&vreq->next);
+	tapdisk_vbd_move_request(treq.private, &device_vbd->pending_requests);
+
+	/* TODO:
+	 * we should probably leave it up to the caller to forward the request */
+	td_forward_request(treq);
+
+	vreq->submitting--;
+
+	return 0;
+}
+
+
+/* http://www.concentric.net/~Ttwang/tech/inthash.htm */
+/*
+ * Hash callback for the ramdisk tables: mixes the 64-bit sector number
+ * that @k points to and returns the low bits of the result.
+ */
+static unsigned int uint64_hash(void* k)
+{
+	uint64_t x = *(uint64_t*)k;
+
+	x = ~x + (x << 18);
+	x ^= x >> 31;
+	x *= 21;
+	x ^= x >> 11;
+	x += x << 6;
+	x ^= x >> 22;
+
+	return (unsigned int)x;
+}
+
+/* Key-equality callback for the ramdisk tables: compares the two 64-bit
+ * sector numbers that @k1 and @k2 point to. */
+static int rd_hash_equal(void* k1, void* k2)
+{
+	const uint64_t lhs = *(uint64_t*)k1;
+	const uint64_t rhs = *(uint64_t*)k2;
+
+	return lhs == rhs;
+}
+
+/*
+ * Copy @nb_sectors sectors starting at @sector out of the ramdisk into
+ * @buf.  Each sector is looked up in prev first and then in inprogress;
+ * the table currently receiving writes (h) is not consulted.
+ *
+ * Returns 0 if every sector was found, -1 as soon as one lookup yields
+ * nothing (sectors copied before that point stay in @buf).
+ */
+static int ramdisk_read(struct ramdisk* ramdisk, uint64_t sector,
+			int nb_sectors, char* buf)
+{
+	uint64_t key;
+	char* data;
+	int n;
+
+	for (n = 0; n < nb_sectors; n++) {
+		key = sector + n;
+		data = NULL;
+
+		/* queued in a previous flush request? */
+		if (ramdisk->prev)
+			data = hashtable_search(ramdisk->prev, &key);
+
+		/* otherwise, part of an ongoing flush? */
+		if (!data && ramdisk->inprogress)
+			data = hashtable_search(ramdisk->inprogress, &key);
+
+		if (!data)
+			return -1;
+
+		memcpy(buf + n * ramdisk->sector_size, data, ramdisk->sector_size);
+	}
+
+	return 0;
+}
+
+/*
+ * Store a copy of the @len bytes at @buf under key @sector in table @h.
+ * An existing entry is overwritten in place; otherwise a new key and a
+ * new value buffer are allocated and inserted.
+ *
+ * Returns 0 on success and -1 if an allocation or the insertion fails
+ * (nothing is left allocated in that case).
+ */
+static int ramdisk_write_hash(struct hashtable* h, uint64_t sector, char* buf,
+			      size_t len)
+{
+	char* v;
+	uint64_t* key;
+
+	/* sector already present: just refresh its contents */
+	if ((v = hashtable_search(h, &sector))) {
+		memcpy(v, buf, len);
+		return 0;
+	}
+
+	if (!(v = malloc(len))) {
+		DPRINTF("ramdisk_write_hash: malloc failed\n");
+		return -1;
+	}
+	memcpy(v, buf, len);
+	if (!(key = malloc(sizeof(*key)))) {
+		DPRINTF("ramdisk_write_hash: error allocating key\n");
+		free(v);
+		return -1;
+	}
+	*key = sector;
+	if (!hashtable_insert(h, key, v)) {
+		DPRINTF("ramdisk_write_hash failed on sector %" PRIu64 "\n", sector);
+		free(key);
+		free(v);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Store @nb_sectors consecutive sectors from @buf into the current
+ * table (ramdisk->h), one sector per entry, starting at @sector.
+ * Stops at, and returns, the first non-zero result of
+ * ramdisk_write_hash(); returns 0 if every sector was stored.
+ */
+static inline int ramdisk_write(struct ramdisk* ramdisk, uint64_t sector,
+				int nb_sectors, char* buf)
+{
+	char* src = buf;
+	uint64_t cur = sector;
+	int left, rc;
+
+	for (left = nb_sectors; left > 0; left--) {
+		rc = ramdisk_write_hash(ramdisk->h, cur, src,
+					ramdisk->sector_size);
+		if (rc)
+			return rc;
+
+		cur++;
+		src += ramdisk->sector_size;
+	}
+
+	return 0;
+}
+
+/*
+ * qsort() comparator for arrays of uint64_t, ascending.  The values are
+ * compared rather than subtracted because the difference of two
+ * unsigned numbers carries no sign.
+ */
+static int uint64_compare(const void* k1, const void* k2)
+{
+	const uint64_t a = *(uint64_t*)k1;
+	const uint64_t b = *(uint64_t*)k2;
+
+	if (a < b)
+		return -1;
+	if (a > b)
+		return 1;
+	return 0;
+}
+
+/* set psectors to an array of the sector numbers in the hash, returning
+ * the number of entries (or -1 on error).
+ *
+ * The array is malloc'ed and must be freed by the caller.  When the hash
+ * is empty or an error occurs, *psectors is set to NULL, so it is always
+ * safe to pass to free(). */
+static int ramdisk_get_sectors(struct hashtable* h, uint64_t** psectors)
+{
+	struct hashtable_itr* itr;
+	uint64_t* sectors;
+	int count;
+
+	*psectors = NULL;
+
+	if (!(count = hashtable_count(h)))
+		return 0;
+
+	if (!(sectors = malloc(count * sizeof(uint64_t)))) {
+		DPRINTF("ramdisk_get_sectors: error allocating sector map\n");
+		return -1;
+	}
+
+	if (!(itr = hashtable_iterator(h))) {
+		DPRINTF("ramdisk_get_sectors: error allocating iterator\n");
+		free(sectors);
+		return -1;
+	}
+
+	/* the table is known to be non-empty, so the iterator starts on a
+	 * valid entry */
+	count = 0;
+	do {
+		sectors[count++] = *(uint64_t*)hashtable_iterator_key(itr);
+	} while (hashtable_iterator_advance(itr));
+	free(itr);
+
+	*psectors = sectors;
+	return count;
+}
+
+/*
+ * Gather @count consecutive sectors starting at @start from
+ * ramdisk->prev into one freshly allocated buffer and mark each of them
+ * in ramdisk->inprogress.
+ *
+ * return -1 for OOM
+ * return -2 for merge lookup failure
+ * return -3 for WAW race
+ * return 0 on success.
+ *
+ * On success *mergedbuf receives the buffer.  On failure the buffer is
+ * freed and the inprogress entries added by this call are removed again.
+ */
+static int merge_requests(struct ramdisk* ramdisk, uint64_t start,
+			  size_t count, char **mergedbuf)
+{
+	char* merged;
+	char* data;
+	uint64_t *key;
+	int i;
+	int rc;
+
+	merged = valloc(count * ramdisk->sector_size);
+	if (!merged) {
+		DPRINTF("merge_request: allocation failed\n");
+		return -1;
+	}
+
+	for (i = 0; i < count; i++, start++) {
+		data = hashtable_search(ramdisk->prev, &start);
+		if (!data) {
+			DPRINTF("merge_request: lookup failed on %"PRIu64"\n", start);
+			rc = -2;
+			goto fail;
+		}
+
+		/* Check inprogress requests to avoid waw non-determinism */
+		if (hashtable_search(ramdisk->inprogress, &start)) {
+			DPRINTF("merge_request: WAR RACE on %"PRIu64"\n", start);
+			rc = -3;
+			goto fail;
+		}
+
+		/* Insert req into inprogress (brief period of duplication of
+		 * hash entries until they are removed from prev. Read tracking
+		 * would not be reading wrong entries) */
+		key = malloc(sizeof(*key));
+		if (!key) {
+			DPRINTF("%s: error allocating key\n", __FUNCTION__);
+			rc = -1;
+			goto fail;
+		}
+		*key = start;
+		if (!hashtable_insert(ramdisk->inprogress, key, NULL)) {
+			DPRINTF("%s failed to insert sector %" PRIu64 " into inprogress hash\n",
+				__FUNCTION__, start);
+			free(key);
+			rc = -1;
+			goto fail;
+		}
+
+		memcpy(merged + i * ramdisk->sector_size, data, ramdisk->sector_size);
+	}
+
+	*mergedbuf = merged;
+	return 0;
+
+fail:
+	free(merged);
+	/* i sectors before the failing one were marked: walk back over them */
+	start--;
+	while (i > 0) {
+		hashtable_remove(ramdisk->inprogress, &start);
+		i--;
+		start--;
+	}
+	return rc;
+}
+
+/* The underlying driver may not handle having the whole ramdisk queued at
+ * once. We queue what we can and let the callbacks attempt to queue more. */
+/* NOTE: may be called from callback, while dd->private still belongs to
+ * the underlying driver */
+/*
+ * Issue write requests for the sectors held in s->ramdisk.prev.
+ *
+ * The sector numbers are sorted, runs of consecutive sectors are merged
+ * into one buffer each and forwarded with create_write_request().  Runs
+ * that overlap a write still in progress (merge_requests() == -3) are
+ * left in prev for a later call.  Once prev is empty it is destroyed.
+ *
+ * Returns 0 on success (including "nothing to flush") and -1 on error;
+ * sectors that were not forwarded stay in prev.
+ */
+static int ramdisk_flush(td_driver_t *driver, struct tdremus_state* s)
+{
+	uint64_t* sectors;
+	char* buf = NULL;
+	uint64_t base, batchlen, sec;
+	int i, j, count = 0;
+
+	// RPRINTF("ramdisk flush\n");
+
+	if (!s->ramdisk.prev)
+		return 0;
+
+	if ((count = ramdisk_get_sectors(s->ramdisk.prev, &sectors)) <= 0)
+		return count;
+
+	/* Create the inprogress table if empty */
+	if (!s->ramdisk.inprogress) {
+		s->ramdisk.inprogress = create_hashtable(RAMDISK_HASHSIZE,
+							 uint64_hash,
+							 rd_hash_equal);
+		if (!s->ramdisk.inprogress) {
+			RPRINTF("ramdisk_flush: error allocating inprogress table\n");
+			free(sectors);
+			return -1;
+		}
+	}
+
+	/* sort and merge sectors to improve disk performance */
+	qsort(sectors, count, sizeof(*sectors), uint64_compare);
+
+	for (i = 0; i < count;) {
+		/* [base, base + batchlen) is a run of consecutive sectors */
+		base = sectors[i++];
+		while (i < count && sectors[i] == sectors[i-1] + 1)
+			i++;
+		batchlen = sectors[i-1] - base + 1;
+
+		j = merge_requests(&s->ramdisk, base, batchlen, &buf);
+
+		if (j) {
+			RPRINTF("ramdisk_flush: merge_requests failed:%s\n",
+				j == -1? "OOM": (j==-2? "missing sector" : "WAW race"));
+			if (j == -3) continue;
+			free(sectors);
+			return -1;
+		}
+
+		/* NOTE: create_write_request() creates a treq AND forwards it down
+		 * the driver chain */
+		if (create_write_request(s, base, batchlen, buf)) {
+			/* Nothing was forwarded, so no callback will ever
+			 * clean up: undo the inprogress marks, release the
+			 * merged buffer and keep the sectors in prev. */
+			RPRINTF("ramdisk_flush: create_write_request failed\n");
+			for (sec = base; sec < base + batchlen; sec++)
+				hashtable_remove(s->ramdisk.inprogress, &sec);
+			free(buf);
+			free(sectors);
+			return -1;
+		}
+
+		s->ramdisk.inflight++;
+
+		/* the data now lives in the merged buffer: drop the per-sector
+		 * copies from prev */
+		for (j = 0; j < batchlen; j++) {
+			buf = hashtable_search(s->ramdisk.prev, &base);
+			free(buf);
+			hashtable_remove(s->ramdisk.prev, &base);
+			base++;
+		}
+	}
+
+	if (!hashtable_count(s->ramdisk.prev)) {
+		/* everything is in flight */
+		hashtable_destroy(s->ramdisk.prev, 0);
+		s->ramdisk.prev = NULL;
+	}
+
+	free(sectors);
+
+	// RPRINTF("ramdisk flush done\n");
+	return 0;
+}
+
+/* flush ramdisk contents to disk */
+/*
+ * Move the sectors collected in s->ramdisk.h to the flush queue
+ * (s->ramdisk.prev), give the ramdisk a fresh table for new writes and
+ * start flushing with ramdisk_flush().
+ *
+ * Returns 0 when there is nothing to flush, a negative value on
+ * allocation failure (the ramdisk is left as it was, apart from sectors
+ * already copied into prev), and the result of ramdisk_flush() otherwise.
+ */
+static int ramdisk_start_flush(td_driver_t *driver)
+{
+	struct tdremus_state *s = (struct tdremus_state *)driver->data;
+	struct hashtable *fresh;
+	char* buf;
+	int i, count;
+	uint64_t* sectors;
+
+	if (!hashtable_count(s->ramdisk.h)) {
+		/*
+		RPRINTF("Nothing to flush\n");
+		*/
+		return 0;
+	}
+
+	/* We create a new hashtable so that new writes can be performed before
+	 * the old hashtable is completely drained.  It is allocated up front
+	 * so that a failure leaves the current tables untouched. */
+	fresh = create_hashtable(RAMDISK_HASHSIZE, uint64_hash,
+				 rd_hash_equal);
+	if (!fresh) {
+		RPRINTF("ramdisk_start_flush: error allocating hashtable\n");
+		return -1;
+	}
+
+	if (s->ramdisk.prev) {
+		/* a flush request issued while a previous flush is still in progress
+		 * will merge with the previous request. If you want the previous
+		 * request to be consistent, wait for it to complete. */
+		if ((count = ramdisk_get_sectors(s->ramdisk.h, &sectors)) < 0) {
+			hashtable_destroy(fresh, 0);
+			return count;
+		}
+
+		for (i = 0; i < count; i++) {
+			buf = hashtable_search(s->ramdisk.h, sectors + i);
+			if (ramdisk_write_hash(s->ramdisk.prev, sectors[i], buf,
+					       s->ramdisk.sector_size)) {
+				free(sectors);
+				hashtable_destroy(fresh, 0);
+				return -1;
+			}
+		}
+		free(sectors);
+
+		/* ramdisk_write_hash() copied every sector into prev, so the
+		 * buffers still owned by h are released together with it. */
+		hashtable_destroy (s->ramdisk.h, 1);
+	} else
+		s->ramdisk.prev = s->ramdisk.h;
+
+	s->ramdisk.h = fresh;
+
+	return ramdisk_flush(driver, s);
+}
+
+
+/*
+ * Set up the ramdisk of a remus driver: record the sector size from
+ * driver->info and create the table that receives writes.  Does nothing
+ * if the table already exists.
+ *
+ * Returns 0 on success (or if already started) and -1 if the table
+ * cannot be allocated.
+ */
+static int ramdisk_start(td_driver_t *driver)
+{
+	struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+	if (s->ramdisk.h) {
+		RPRINTF("ramdisk already allocated\n");
+		return 0;
+	}
+
+	s->ramdisk.sector_size = driver->info.sector_size;
+	s->ramdisk.h = create_hashtable(RAMDISK_HASHSIZE, uint64_hash,
+					rd_hash_equal);
+	if (!s->ramdisk.h) {
+		/* every ramdisk operation dereferences h; do not pretend
+		 * the ramdisk is usable */
+		RPRINTF("ramdisk_start: error allocating hashtable\n");
+		return -1;
+	}
+
+	DPRINTF("Ramdisk started, %zu bytes/sector\n", s->ramdisk.sector_size);
+
+	return 0;
+}
+
+/* common client/server functions */
+/*
+ * mayberead: read exactly @len bytes from @fd into @buf, giving up after
+ * a timeout.
+ *
+ * A read is attempted first; select() is only used to wait when the read
+ * came up short or reported EAGAIN.  The timeout structure is set up once
+ * from HEARTBEAT_MS and the same structure is handed to every select()
+ * call.
+ *
+ * Returns 0 when all bytes were read (or @len is 0) and -1 on
+ * end-of-file, timeout, or a read/select error.
+ */
+static int mread(int fd, void* buf, size_t len)
+{
+	struct timeval tv = {
+		.tv_sec = HEARTBEAT_MS / 1000,
+		.tv_usec = (HEARTBEAT_MS % 1000) * 1000
+	};
+	char* dst = buf;
+	size_t got = 0;
+	fd_set rfds;
+	int rc;
+
+	if (len == 0)
+		return 0;
+
+	for (;;) {
+		rc = read(fd, dst + got, len - got);
+
+		/* finished as soon as the running total reaches len */
+		if (rc >= 0 && got + rc >= len)
+			return 0;
+
+		if (rc == 0) {
+			RPRINTF("end-of-file");
+			return -1;
+		}
+
+		if (rc < 0) {
+			if (errno != EAGAIN) {
+				RPRINTF("error during read: %s\n", strerror(errno));
+				return -1;
+			}
+		} else {
+			got += rc;
+		}
+
+		/* wait until the descriptor is readable again */
+		FD_ZERO(&rfds);
+		FD_SET(fd, &rfds);
+		rc = select(fd + 1, &rfds, NULL, NULL, &tv);
+		if (rc == 0) {
+			RPRINTF("time out during read\n");
+			return -1;
+		}
+		if (rc < 0) {
+			RPRINTF("error during select: %d\n", errno);
+			return -1;
+		}
+	}
+}
+
+/*
+ * mwrite: write exactly len bytes from buf to fd, giving up after the
+ * heartbeat interval.
+ *
+ * The first write() is attempted immediately; select() is only used to wait
+ * for the descriptor to become writable when a write comes back short or
+ * fails with EAGAIN. The timeval is initialised once from HEARTBEAT_MS and
+ * is not re-armed between select() calls.
+ *
+ * Returns 0 once len bytes have been written (or when len is 0) and -1 on a
+ * zero-byte write, write/select error or timeout.
+ */
+static int mwrite(int fd, void* buf, size_t len)
+{
+    const char *src = buf; /* byte pointer: arithmetic on void* is not ISO C */
+    fd_set wfds;
+    size_t cur = 0;
+    int rc;
+    struct timeval tv = {
+        .tv_sec = HEARTBEAT_MS / 1000,
+        .tv_usec = (HEARTBEAT_MS % 1000) * 1000
+    };
+
+    if (!len)
+        return 0;
+
+    /* write first. Only select if write is incomplete. */
+    rc = write(fd, src, len);
+    while (rc < 0 || cur + rc < len) {
+        if (!rc) {
+            RPRINTF("end-of-file");
+            return -1;
+        }
+        if (rc < 0 && errno != EAGAIN) {
+            RPRINTF("error during write: %s\n", strerror(errno));
+            return -1;
+        }
+        if (rc > 0)
+            cur += rc;
+
+        /* wait until the descriptor is writable again or the timeout hits */
+        FD_ZERO(&wfds);
+        FD_SET(fd, &wfds);
+        if (!(rc = select(fd + 1, NULL, &wfds, NULL, &tv))) {
+            RPRINTF("time out during write\n");
+            return -1;
+        } else if (rc < 0) {
+            RPRINTF("error during select: %d\n", errno);
+            return -1;
+        }
+        rc = write(fd, src + cur, len - cur);
+    }
+
+    return 0;
+}
+
+
+/*
+ * Tear down the replication stream: unregister its tapdisk event, close the
+ * socket and mark the descriptor as lost.
+ */
+static inline void close_stream_fd(struct tdremus_state *s)
+{
+    tapdisk_server_unregister_event(s->stream_fd.id);
+    close(s->stream_fd.fd);
+    /* XXX: -2 is magic (connection lost, as opposed to -1 = not yet
+     * connected). replace with macro perhaps? */
+    s->stream_fd.fd = -2;
+}
+
+/* primary functions */
+static void remus_client_event(event_id_t id, char mode, void *private);
+static void remus_connect_event(event_id_t id, char mode, void *private);
+static void remus_retry_connect_event(event_id_t id, char mode, void *private);
+
+/*
+ * Begin an asynchronous connection to the backup at state->sa.
+ *
+ * Creates a non-blocking TCP socket and registers a zero-delay timeout event
+ * so that remus_retry_connect_event() performs the actual connect().
+ *
+ * Returns 0 with state->stream_fd populated, or -1 on failure (the socket is
+ * closed again in that case).
+ */
+static int primary_do_connect(struct tdremus_state *state)
+{
+    event_id_t id;
+    int fd;
+    int flags;
+
+    RPRINTF("client connecting to %s:%d...\n", inet_ntoa(state->sa.sin_addr), ntohs(state->sa.sin_port));
+
+    if ((fd = socket(PF_INET, SOCK_STREAM, 0)) < 0) {
+        RPRINTF("could not create client socket: %d\n", errno);
+        return -1;
+    }
+
+    /* make socket nonblocking */
+    if ((flags = fcntl(fd, F_GETFL, 0)) == -1)
+        flags = 0;
+    if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) {
+        close(fd);
+        return -1;
+    }
+
+    /* once we have created the socket and populated the address, we can now start
+     * our non-blocking connect. rather than duplicating code we trigger a timeout
+     * on the socket fd, which calls our nonblocking connect code
+     */
+    if((id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, fd, 0, remus_retry_connect_event, state)) < 0) {
+        RPRINTF("error registering timeout client connection event handler: %s\n", strerror(id));
+        close(fd);
+        return -1;
+    }
+    state->stream_fd.fd = fd;
+    state->stream_fd.id = id;
+    return 0;
+}
+
+/*
+ * Connect to the backup at state->sa synchronously.
+ *
+ * A refused connection is retried once per second for as long as it keeps
+ * being refused; any other connect() error aborts. After connecting, the
+ * socket is switched to non-blocking mode and remus_client_event() is
+ * registered as its read handler.
+ *
+ * Returns 0 with state->stream_fd populated, or -1 on failure.
+ */
+static int primary_blocking_connect(struct tdremus_state *state)
+{
+    struct sockaddr *peer = (struct sockaddr *)&state->sa;
+    int sock;
+    int ev;
+    int fl;
+
+    RPRINTF("client connecting to %s:%d...\n", inet_ntoa(state->sa.sin_addr), ntohs(state->sa.sin_port));
+
+    sock = socket(PF_INET, SOCK_STREAM, 0);
+    if (sock < 0) {
+        RPRINTF("could not create client socket: %d\n", errno);
+        return -1;
+    }
+
+    /* keep trying for as long as the backup refuses the connection */
+    while (connect(sock, peer, sizeof(state->sa)) < 0) {
+        if (errno != ECONNREFUSED) {
+            RPRINTF("connection failed: %d\n", errno);
+            close(sock);
+            return -1;
+        }
+        RPRINTF("connection refused -- retrying in 1 second\n");
+        sleep(1);
+    }
+
+    RPRINTF("client connected\n");
+
+    /* make socket nonblocking */
+    fl = fcntl(sock, F_GETFL, 0);
+    if (fl == -1)
+        fl = 0;
+    if (fcntl(sock, F_SETFL, fl | O_NONBLOCK) == -1) {
+        RPRINTF("error making socket nonblocking\n");
+        close(sock);
+        return -1;
+    }
+
+    ev = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, sock, 0, remus_client_event, state);
+    if (ev < 0) {
+        RPRINTF("error registering client event handler: %s\n", strerror(ev));
+        close(sock);
+        return -1;
+    }
+
+    state->stream_fd.fd = sock;
+    state->stream_fd.id = ev;
+    return 0;
+}
+
+/* Primary read handler: reads are not replicated, the request is simply
+ * passed through. */
+static void primary_queue_read(td_driver_t *driver, td_request_t treq)
+{
+    (void)driver;
+    td_forward_request(treq);
+}
+
+/* TODO:
+ * The primary uses mwrite() to write the contents of a write request to the
+ * backup. This effectively blocks until all data has been copied into a system
+ * buffer or a timeout has occurred. We may wish to instead use tapdisk's
+ * nonblocking i/o interface, tapdisk_server_register_event(), to set timeouts
+ * and write data in an asynchronous fashion.
+ */
+/*
+ * Primary write handler: replicate the write to the backup, then forward it.
+ *
+ * The wire message is TDREMUS_WRITE, followed by a header made of a 32-bit
+ * sector count and a 64-bit start sector (host byte order, no padding),
+ * followed by the sector data. If the connection cannot be established or
+ * any part of the message cannot be sent, the driver switches to unprotected
+ * mode and the request is completed with -EBUSY.
+ */
+static void primary_queue_write(td_driver_t *driver, td_request_t treq)
+{
+    struct tdremus_state *s = (struct tdremus_state *)driver->data;
+    char header[sizeof(uint32_t) + sizeof(uint64_t)];
+    uint32_t sectors = treq.secs;
+    uint64_t sector = treq.sec;
+
+    /* -1 means we haven't connected yet, -2 means the connection was lost */
+    if(s->stream_fd.fd == -1) {
+        RPRINTF("connecting to backup...\n");
+        if (primary_blocking_connect(s) < 0)
+            goto fail;
+    }
+
+    /* memcpy avoids the misaligned 64-bit store a pointer cast into the
+     * char array would perform */
+    memcpy(header, &sectors, sizeof(sectors));
+    memcpy(header + sizeof(sectors), &sector, sizeof(sector));
+
+    if (mwrite(s->stream_fd.fd, TDREMUS_WRITE, strlen(TDREMUS_WRITE)) < 0)
+        goto fail;
+    if (mwrite(s->stream_fd.fd, header, sizeof(header)) < 0)
+        goto fail;
+
+    if (mwrite(s->stream_fd.fd, treq.buf, treq.secs * driver->info.sector_size) < 0)
+        goto fail;
+
+    td_forward_request(treq);
+
+    return;
+
+ fail:
+    /* switch to unprotected mode and tell tapdisk to retry */
+    RPRINTF("write request replication failed, switching to unprotected mode");
+    switch_mode(s->tdremus_driver, mode_unprotected);
+    td_complete_request(treq, -EBUSY);
+}
+
+
+/*
+ * Primary flush handler: send TDREMUS_COMMIT to the backup.
+ *
+ * Returns 0 if the commit was sent or no connection has been made yet, and
+ * -1 (after closing the stream) if the message could not be written.
+ */
+static int client_flush(td_driver_t *driver)
+{
+    struct tdremus_state *state = (struct tdremus_state *)driver->data;
+    int rc = 0;
+
+    /* fd == -1: connection not yet established, nothing to flush */
+    if (state->stream_fd.fd != -1 &&
+        mwrite(state->stream_fd.fd, TDREMUS_COMMIT, strlen(TDREMUS_COMMIT)) < 0) {
+        RPRINTF("error flushing output");
+        close_stream_fd(state);
+        rc = -1;
+    }
+
+    return rc;
+}
+
+/*
+ * Backup flush handler: try to flush any remaining requests.
+ *
+ * Returns 0 when there is no previous ramdisk (nothing to flush yet),
+ * otherwise the result of ramdisk_flush().
+ */
+static int server_flush(td_driver_t *driver)
+{
+    struct tdremus_state *state = (struct tdremus_state *)driver->data;
+
+    return state->ramdisk.prev ? ramdisk_flush(driver, state) : 0;
+}
+
+/*
+ * Enter primary (client) mode: mark the replication stream as not yet
+ * connected and install the primary read/write/flush handlers.
+ * Always returns 0.
+ */
+static int primary_start(td_driver_t *driver)
+{
+    struct tdremus_state *state = (struct tdremus_state *)driver->data;
+
+    RPRINTF("activating client mode\n");
+
+    /* no replication connection exists yet */
+    state->stream_fd.id = -1;
+    state->stream_fd.fd = -1;
+
+    state->queue_flush = client_flush;
+    tapdisk_remus.td_queue_write = primary_queue_write;
+    tapdisk_remus.td_queue_read = primary_queue_read;
+
+    return 0;
+}
+
+/*
+ * Timeout callback: issue the non-blocking connect() to the backup.
+ *
+ * - connect() succeeded or is in progress: wait for the socket to become
+ *   writable and let remus_connect_event() check the outcome.
+ * - transient failure: re-arm this callback after REMUS_CONNRETRY_TIMEOUT.
+ * - anything else: log and give up.
+ */
+static void remus_retry_connect_event(event_id_t id, char mode, void *private)
+{
+    struct tdremus_state *state = (struct tdremus_state *)private;
+    int rc;
+
+    /* do a non-blocking connect */
+    rc = connect(state->stream_fd.fd, (struct sockaddr *)&state->sa, sizeof(state->sa));
+
+    if (!rc || errno == EINPROGRESS) {
+        /* the connect returned EINPROGRESS (nonblocking connect) we must
+         * wait for the fd to be writeable to determine if the connect worked */
+        tapdisk_server_unregister_event(state->stream_fd.id);
+        id = tapdisk_server_register_event(SCHEDULER_POLL_WRITE_FD, state->stream_fd.fd, 0, remus_connect_event, state);
+        if (id < 0) {
+            RPRINTF("error registering client connection event handler: %s\n", strerror(id));
+            return;
+        }
+        state->stream_fd.id = id;
+        return;
+    }
+
+    switch (errno) {
+    case ECONNREFUSED:
+    case ENETUNREACH:
+    case EAGAIN:
+    case ECONNABORTED:
+        /* try again in a second */
+        tapdisk_server_unregister_event(state->stream_fd.id);
+        id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, state->stream_fd.fd, REMUS_CONNRETRY_TIMEOUT, remus_retry_connect_event, state);
+        if (id < 0) {
+            RPRINTF("error registering timeout client connection event handler: %s\n", strerror(id));
+            return;
+        }
+        state->stream_fd.id = id;
+        break;
+    default:
+        /* not recoverable */
+        RPRINTF("error connection to server %s\n", strerror(errno));
+        break;
+    }
+}
+
+/*
+ * Callback for when the non-blocking connect() has finished; called only by
+ * the primary in unprotected state.
+ *
+ * Reads SO_ERROR from the socket. On success the read handler
+ * remus_client_event() is installed and the driver switches to primary
+ * mode. On a transient error the connect is retried after
+ * REMUS_CONNRETRY_TIMEOUT; on any other error it gives up.
+ */
+static void remus_connect_event(event_id_t id, char mode, void *private)
+{
+    struct tdremus_state *state = (struct tdremus_state *)private;
+    int so_err;
+    socklen_t optlen = sizeof(so_err);
+
+    /* check to see if the connect succeeded */
+    if (getsockopt(state->stream_fd.fd, SOL_SOCKET, SO_ERROR, &so_err, &optlen)) {
+        RPRINTF("error getting socket errno\n");
+        return;
+    }
+
+    RPRINTF("socket connect returned %d\n", so_err);
+
+    if (so_err == 0) {
+        /* the connect succeeded: unregister this function and register a
+         * new event handler */
+        tapdisk_server_unregister_event(state->stream_fd.id);
+        id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, state->stream_fd.fd, 0, remus_client_event, state);
+        if (id < 0) {
+            RPRINTF("error registering client event handler: %s\n", strerror(id));
+            return;
+        }
+        state->stream_fd.id = id;
+
+        /* switch from unprotected to protected client */
+        switch_mode(state->tdremus_driver, mode_primary);
+        return;
+    }
+
+    /* the connect did not succeed */
+    switch (so_err) {
+    case ECONNREFUSED:
+    case ENETUNREACH:
+    case ETIMEDOUT:
+    case ECONNABORTED:
+    case EAGAIN:
+        /* we can probably assume that the backup is down. just try again later */
+        tapdisk_server_unregister_event(state->stream_fd.id);
+        id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, state->stream_fd.fd, REMUS_CONNRETRY_TIMEOUT, remus_retry_connect_event, state);
+        if (id < 0) {
+            RPRINTF("error registering timeout client connection event handler: %s\n", strerror(id));
+            return;
+        }
+        state->stream_fd.id = id;
+        break;
+    default:
+        RPRINTF("socket connect returned %d, giving up\n", so_err);
+        break;
+    }
+}
+
+
+/*
+ * Read handler installed on the primary once it is connected to the backup.
+ *
+ * Reads one 4-byte message. TDREMUS_DONE means the checkpoint was committed
+ * and is relayed to the control channel via ctl_respond(); a read failure or
+ * any other message closes the replication stream.
+ */
+static void remus_client_event(event_id_t id, char mode, void *private)
+{
+    struct tdremus_state *state = (struct tdremus_state *)private;
+    char msg[5];
+
+    if (mread(state->stream_fd.fd, msg, sizeof(msg) - 1) < 0) {
+        /* replication stream closed or otherwise broken (timeout, reset, &c) */
+        RPRINTF("error reading from backup\n");
+        close_stream_fd(state);
+        return;
+    }
+    msg[sizeof(msg) - 1] = '\0';
+
+    if (strcmp(msg, TDREMUS_DONE) != 0) {
+        RPRINTF("received unknown message: %s\n", msg);
+        close_stream_fd(state);
+        return;
+    }
+
+    /* checkpoint committed, inform msg_fd */
+    ctl_respond(state, TDREMUS_DONE);
+}
+
+/* backup functions */
+static void remus_server_event(event_id_t id, char mode, void *private);
+
+/*
+ * Event callback for the listening socket: accept one connection and make it
+ * the replication stream (s->stream_fd), handled by remus_server_event().
+ */
+static void remus_server_accept(event_id_t id, char mode, void* private)
+{
+    struct tdremus_state *state = (struct tdremus_state *)private;
+    event_id_t ev;
+    int conn;
+
+    /* XXX: add address-based black/white list */
+    conn = accept(state->server_fd.fd, NULL, NULL);
+    if (conn < 0) {
+        RPRINTF("error accepting connection: %d\n", errno);
+        return;
+    }
+
+    /* TODO: check to see if we are already replicating. if so just close the
+     * connection (or do something smarter) */
+    RPRINTF("server accepted connection\n");
+
+    /* add tapdisk event for replication stream */
+    ev = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, conn, 0,
+                                       remus_server_event, state);
+    if (ev < 0) {
+        RPRINTF("error registering connection event handler: %s\n", strerror(errno));
+        close(conn);
+        return;
+    }
+
+    /* store replication file descriptor */
+    state->stream_fd.fd = conn;
+    state->stream_fd.id = ev;
+}
+
+/*
+ * Create the server socket, bind it to s->sa, listen on it and register
+ * remus_server_accept() for incoming connections.
+ *
+ * Returns 0 on success. Returns -2 when bind() fails with any error other
+ * than EADDRINUSE (tdremus_open() reacts to -2 by starting in primary mode)
+ * and -1 on every other failure. Once the socket exists, every failure path
+ * closes it and resets s->server_fd.fd to -1.
+ */
+static int remus_bind(struct tdremus_state* s)
+{
+    int opt;
+    int rc = -1;
+
+    if ((s->server_fd.fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+        RPRINTF("could not create server socket: %d\n", errno);
+        return rc;
+    }
+    opt = 1;
+    if (setsockopt(s->server_fd.fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0)
+        RPRINTF("Error setting REUSEADDR on %d: %d\n", s->server_fd.fd, errno);
+
+    if (bind(s->server_fd.fd, (struct sockaddr *)&s->sa, sizeof(s->sa)) < 0) {
+        /* capture errno first: the logging calls below may overwrite it */
+        int bind_errno = errno;
+
+        RPRINTF("could not bind server socket %d to %s:%d: %d %s\n", s->server_fd.fd,
+                inet_ntoa(s->sa.sin_addr), ntohs(s->sa.sin_port), bind_errno, strerror(bind_errno));
+        if (bind_errno != EADDRINUSE)
+            rc = -2;
+        goto err_sfd;
+    }
+    if (listen(s->server_fd.fd, 10)) {
+        RPRINTF("could not listen on socket: %d\n", errno);
+        goto err_sfd;
+    }
+
+    /* The socket is now bound to the address and listening so we may now
+     * register the fd with tapdisk */
+
+    if((s->server_fd.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                                        s->server_fd.fd, 0,
+                                                        remus_server_accept, s)) < 0) {
+        RPRINTF("error registering server connection event handler: %s",
+                strerror(s->server_fd.id));
+        goto err_sfd;
+    }
+
+    return 0;
+
+ err_sfd:
+    close(s->server_fd.fd);
+    s->server_fd.fd = -1;
+
+    return rc;
+}
+
+/*
+ * Report whether the latest checkpoint is still being applied: returns 1 if
+ * ramdisk requests are in flight or a previous ramdisk is still pending,
+ * 0 otherwise.
+ */
+static inline int server_writes_inflight(td_driver_t *driver)
+{
+    struct tdremus_state *state = (struct tdremus_state *)driver->data;
+
+    return (state->ramdisk.inflight || state->ramdisk.prev) ? 1 : 0;
+}
+
+/*
+ * Backup read handler.
+ *
+ * Due to block device prefetching this code may be called on the server side
+ * during normal replication. The first request seen also records its image
+ * in remus_image. The read is completed with status 0 when ramdisk_read()
+ * returns 0, and forwarded otherwise.
+ *
+ * NOTE(review): the original comment said EBUSY must be returned here so the
+ * domain is not started with stale data; the code does not do that.
+ */
+void backup_queue_read(td_driver_t *driver, td_request_t treq)
+{
+    struct tdremus_state *state = (struct tdremus_state *)driver->data;
+
+    if (!remus_image)
+        remus_image = treq.image;
+
+    /* check if this read is queued in any currently ongoing flush */
+    if (!ramdisk_read(&state->ramdisk, treq.sec, treq.secs, treq.buf)) {
+        /* complete the request */
+        td_complete_request(treq, 0);
+        return;
+    }
+
+    /* TODO: Add to pending read hash */
+    td_forward_request(treq);
+}
+
+/*
+ * Backup write handler.
+ *
+ * On a server write, we know the domain has failed over: the driver changes
+ * to unprotected mode and this request is completed with -EBUSY.
+ */
+void backup_queue_write(td_driver_t *driver, td_request_t treq)
+{
+    switch_mode(driver, mode_unprotected);
+
+    /* TODO: call the appropriate write function rather than return EBUSY */
+    td_complete_request(treq, -EBUSY);
+}
+
+/*
+ * Enter backup (server) mode: allocate the ramdisk and install the backup
+ * read/write/flush handlers.
+ *
+ * Returns 0 on success, -1 if ramdisk_start() fails.
+ */
+static int backup_start(td_driver_t *driver)
+{
+    struct tdremus_state *state = (struct tdremus_state *)driver->data;
+
+    if (ramdisk_start(driver) < 0)
+        return -1;
+
+    state->queue_flush = server_flush;
+    tapdisk_remus.td_queue_write = backup_queue_write;
+    tapdisk_remus.td_queue_read = backup_queue_read;
+
+    return 0;
+}
+
+/*
+ * Handle a write request from the primary.
+ *
+ * Reads the header (32-bit sector count followed by a 64-bit start sector),
+ * then the sector data, and stores the data in the ramdisk.
+ *
+ * Returns 0 on success. On any failure, including a payload larger than the
+ * local buffer, the replication stream is closed and -1 is returned: the
+ * unread payload would otherwise be parsed as the next command.
+ */
+static int server_do_wreq(td_driver_t *driver)
+{
+    struct tdremus_state *s = (struct tdremus_state *)driver->data;
+    char buf[4096];
+    char header[sizeof(uint32_t) + sizeof(uint64_t)];
+    uint32_t sectors;
+    uint64_t sector;
+    uint64_t len;
+
+    if (mread(s->stream_fd.fd, header, sizeof(header)) < 0)
+        goto err;
+
+    /* memcpy avoids a misaligned 64-bit load from the char array */
+    memcpy(&sectors, header, sizeof(sectors));
+    memcpy(&sector, header + sizeof(sectors), sizeof(sector));
+
+    /* 64-bit product: the count comes off the wire and must not wrap */
+    len = (uint64_t)sectors * driver->info.sector_size;
+
+    if (len > sizeof(buf)) {
+        /* freak out! */
+        RPRINTF("write request too large: %llu/%u\n",
+                (unsigned long long)len, (unsigned)sizeof(buf));
+        goto err;
+    }
+
+    if (mread(s->stream_fd.fd, buf, (size_t)len) < 0)
+        goto err;
+
+    if (ramdisk_write(&s->ramdisk, sector, sectors, buf) < 0)
+        goto err;
+
+    return 0;
+
+ err:
+    /* should start failover */
+    RPRINTF("backup write request error\n");
+    close_stream_fd(s);
+
+    return -1;
+}
+
+/* Handle a submit request from the primary: nothing to do, always returns 0. */
+static int server_do_sreq(td_driver_t *driver)
+{
+    (void)driver;
+
+    return 0;
+}
+
+/*
+ * Handle a commit request from the primary: at this point, the server can
+ * start applying the most recent ramdisk.
+ *
+ * Starts the flush and acknowledges with TDREMUS_DONE. The result of
+ * ramdisk_start_flush() is not examined.
+ *
+ * Returns 0 if the whole acknowledgement was written, -1 otherwise.
+ */
+static int server_do_creq(td_driver_t *driver)
+{
+    struct tdremus_state *s = (struct tdremus_state *)driver->data;
+    /* length of the acknowledgement, instead of a hard-coded 4 */
+    ssize_t len = (ssize_t)strlen(TDREMUS_DONE);
+
+    ramdisk_start_flush(driver);
+
+    /* XXX this message should not be sent until flush completes! */
+    if (write(s->stream_fd.fd, TDREMUS_DONE, len) != len)
+        return -1;
+
+    return 0;
+}
+
+
+/*
+ * Read handler for the backup's replication stream.
+ *
+ * Reads one 4-byte request and dispatches it: TDREMUS_WRITE, TDREMUS_SUBMIT
+ * and TDREMUS_COMMIT go to their handlers, anything else is logged. If the
+ * request cannot be read the driver switches to unprotected mode.
+ */
+static void remus_server_event(event_id_t id, char mode, void *private)
+{
+    struct tdremus_state *state = (struct tdremus_state *)private;
+    td_driver_t *drv = state->tdremus_driver;
+    char cmd[5];
+
+    /* TODO: add a get_connection_by_event_id() function.
+     * for now we can assume that the fd is s->stream_fd */
+
+    if (mread(state->stream_fd.fd, cmd, sizeof(cmd) - 1) < 0) {
+        RPRINTF("error reading server event, activating backup\n");
+        switch_mode(drv, mode_unprotected);
+        return;
+    }
+    cmd[sizeof(cmd) - 1] = '\0';
+
+    if (strcmp(cmd, TDREMUS_WRITE) == 0) {
+        server_do_wreq(drv);
+        return;
+    }
+    if (strcmp(cmd, TDREMUS_SUBMIT) == 0) {
+        server_do_sreq(drv);
+        return;
+    }
+    if (strcmp(cmd, TDREMUS_COMMIT) == 0) {
+        server_do_creq(drv);
+        return;
+    }
+
+    RPRINTF("unknown request received: %s\n", cmd);
+}
+
+/* unprotected */
+
+/*
+ * Unprotected read handler: reads are passed through unless ramdisk writes
+ * from the last checkpoint are still outstanding, in which case the request
+ * is completed with -EBUSY (after kicking the flush if nothing is in
+ * flight).
+ */
+void unprotected_queue_read(td_driver_t *driver, td_request_t treq)
+{
+    struct tdremus_state *state = (struct tdremus_state *)driver->data;
+
+    if (!server_writes_inflight(driver)) {
+        /* here we just pass reads through */
+        td_forward_request(treq);
+        return;
+    }
+
+    /* wait for previous ramdisk to flush before servicing reads.
+     * for now lets just return EBUSY. if there are any left-over requests
+     * in prev, kick em again. */
+    if (!state->ramdisk.inflight)
+        ramdisk_flush(driver, state);
+
+    td_complete_request(treq, -EBUSY);
+}
+
+/*
+ * Unprotected write handler: writes are passed through unless ramdisk writes
+ * from the last checkpoint are still outstanding, in which case the request
+ * is completed with -EBUSY (after kicking the flush if nothing is in
+ * flight).
+ *
+ * For a recoverable remus solution we need to log unprotected writes here.
+ */
+void unprotected_queue_write(td_driver_t *driver, td_request_t treq)
+{
+    struct tdremus_state *state = (struct tdremus_state *)driver->data;
+
+    if (!server_writes_inflight(driver)) {
+        /* NOTE: DRBD style bitmap tracking could go here */
+        td_forward_request(treq);
+        return;
+    }
+
+    /* wait for previous ramdisk to flush */
+    RPRINTF("queue_write: waiting for queue to drain");
+    if (!state->ramdisk.inflight)
+        ramdisk_flush(driver, state); /* nothing in progress: kick prev */
+
+    td_complete_request(treq, -EBUSY);
+}
+
+/*
+ * Enter unprotected (pass-through) mode: drop the replication stream and
+ * the listening socket, then install the unprotected read/write handlers.
+ * Always returns 0.
+ */
+static int unprotected_start(td_driver_t *driver)
+{
+    struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+    RPRINTF("failure detected, activating passthrough\n");
+
+    /* unregister and close the replication stream */
+    close_stream_fd(s);
+
+    /* unregister and close the server socket. It only exists when
+     * remus_bind() succeeded, so guard it the way tdremus_close() does. */
+    if (s->server_fd.fd >= 0) {
+        tapdisk_server_unregister_event(s->server_fd.id);
+        close(s->server_fd.fd);
+        s->server_fd.fd = -1;
+    }
+
+    /* install the unprotected read/write handlers */
+    tapdisk_remus.td_queue_read = unprotected_queue_read;
+    tapdisk_remus.td_queue_write = unprotected_queue_write;
+
+    return 0;
+}
+
+
+/* control */
+
+/*
+ * Resolve addr with gethostbyname() and store the first IPv4 address, in
+ * network byte order, in ia.
+ *
+ * Returns 0 on success, -1 if the name cannot be resolved or has no usable
+ * IPv4 address.
+ */
+static inline int resolve_address(const char* addr, struct in_addr* ia)
+{
+    struct hostent *he;
+
+    if (!(he = gethostbyname(addr))) {
+        RPRINTF("error resolving %s: %d\n", addr, h_errno);
+        return -1;
+    }
+
+    if (!he->h_addr_list[0]) {
+        RPRINTF("no address found for %s\n", addr);
+        return -1;
+    }
+
+    /* only a 4-byte AF_INET address fits in struct in_addr */
+    if (he->h_addrtype != AF_INET || he->h_length != (int)sizeof(ia->s_addr)) {
+        RPRINTF("no address found for %s\n", addr);
+        return -1;
+    }
+
+    /* network byte order; memcpy because the entry is a char buffer with no
+     * alignment guarantee */
+    memcpy(&ia->s_addr, he->h_addr_list[0], sizeof(ia->s_addr));
+
+    return 0;
+}
+
+/*
+ * Parse the "host:port" prefix of name and store the first IPv4 address it
+ * resolves to in state->sa.
+ *
+ * Returns 0 on success, -ENOENT if name contains no ':', the lookup fails or
+ * no IPv4 address is returned, and -ENOMEM if the host string cannot be
+ * allocated.
+ */
+static int get_args(td_driver_t *driver, const char* name)
+{
+    struct tdremus_state *state = (struct tdremus_state *)driver->data;
+    struct addrinfo gai_hints;
+    struct addrinfo *servinfo, *servinfo_itr;
+    const char *port;
+    char *host;
+    int gai_status;
+    int valid_addr = 0;
+
+    memset(&gai_hints, 0, sizeof gai_hints);
+    gai_hints.ai_family = AF_UNSPEC;
+    gai_hints.ai_socktype = SOCK_STREAM;
+
+    port = strchr(name, ':');
+    if (!port) {
+        RPRINTF("missing host in %s\n", name);
+        return -ENOENT;
+    }
+    if (!(host = strndup(name, port - name))) {
+        RPRINTF("unable to allocate host\n");
+        return -ENOMEM;
+    }
+    port++;
+
+    gai_status = getaddrinfo(host, port, &gai_hints, &servinfo);
+    /* the copy is only needed for the lookup; release it on every path */
+    free(host);
+    if (gai_status != 0) {
+        RPRINTF("getaddrinfo error: %s\n", gai_strerror(gai_status));
+        return -ENOENT;
+    }
+
+    /* TODO: do something smarter here */
+    for (servinfo_itr = servinfo; servinfo_itr != NULL; servinfo_itr = servinfo_itr->ai_next) {
+        if (servinfo_itr->ai_family == AF_INET) {
+            valid_addr = 1;
+            memset(&state->sa, 0, sizeof(state->sa));
+            state->sa = *(struct sockaddr_in *)servinfo_itr->ai_addr;
+            break;
+        }
+    }
+    freeaddrinfo(servinfo);
+
+    if (!valid_addr)
+        return -ENOENT;
+
+    RPRINTF("host: %s, port: %d\n", inet_ntoa(state->sa.sin_addr), ntohs(state->sa.sin_port));
+
+    return 0;
+}
+
+/*
+ * Move the driver into the requested replication mode.
+ *
+ * Nothing happens if the driver is already in that mode. Otherwise the
+ * current flush handler (if any) runs first; if it fails, the target mode is
+ * replaced by mode_unprotected. The matching *_start() function is then
+ * called and s->mode is updated only when it returns 0.
+ *
+ * Returns the result of the *_start() call, or -1 for an unknown mode.
+ */
+static int switch_mode(td_driver_t *driver, enum tdremus_mode mode)
+{
+    struct tdremus_state *state = (struct tdremus_state *)driver->data;
+    int rc;
+
+    if (state->mode == mode)
+        return 0;
+
+    if (state->queue_flush && state->queue_flush(driver) < 0) {
+        /* fall back to unprotected mode on error */
+        RPRINTF("switch_mode: error flushing queue (old: %d, new: %d)", state->mode, mode);
+        mode = mode_unprotected;
+    }
+
+    switch (mode) {
+    case mode_unprotected:
+        rc = unprotected_start(driver);
+        break;
+    case mode_primary:
+        rc = primary_start(driver);
+        break;
+    case mode_backup:
+        rc = backup_start(driver);
+        break;
+    default:
+        RPRINTF("unknown mode requested: %d\n", mode);
+        rc = -1;
+        break;
+    }
+
+    if (rc == 0)
+        state->mode = mode;
+
+    return rc;
+}
+
+/*
+ * Read handler for the control FIFO.
+ *
+ * A zero-byte read closes and reopens the FIFO. A message starting with
+ * "flush" runs the current flush handler and answers TDREMUS_FAIL on the
+ * message FIFO if it reports an error; any other message is logged as
+ * unknown.
+ */
+static void ctl_request(event_id_t id, char mode, void *private)
+{
+    struct tdremus_state *state = (struct tdremus_state *)private;
+    td_driver_t *drv = state->tdremus_driver;
+    char msg[80];
+    int nread;
+
+    nread = read(state->ctl_fd.fd, msg, sizeof(msg) - 1 /* append nul */);
+
+    if (nread == 0) {
+        RPRINTF("0-byte read received, reopening FIFO\n");
+        /*TODO: we may have to unregister/re-register with tapdisk_server */
+        close(state->ctl_fd.fd);
+        RPRINTF("FIFO closed\n");
+        state->ctl_fd.fd = open(state->ctl_path, O_RDWR);
+        if (state->ctl_fd.fd < 0)
+            RPRINTF("error reopening FIFO: %d\n", errno);
+        return;
+    }
+
+    if (nread < 0) {
+        RPRINTF("error reading from FIFO: %d\n", errno);
+        return;
+    }
+
+    /* TODO: need to get driver somehow */
+    msg[nread] = '\0';
+    if (strncmp(msg, "flush", 5) != 0) {
+        RPRINTF("unknown command: %s\n", msg);
+        return;
+    }
+
+    if (state->queue_flush && state->queue_flush(drv)) {
+        RPRINTF("error passing flush request to backup");
+        ctl_respond(state, TDREMUS_FAIL);
+    }
+}
+
+/*
+ * Write response to the message FIFO.
+ *
+ * Returns the result of write(). If the write fails, the FIFO is closed and
+ * reopened before the (negative) result is returned.
+ */
+static int ctl_respond(struct tdremus_state *s, const char *response)
+{
+    int written = write(s->msg_fd.fd, response, strlen(response));
+
+    if (written >= 0)
+        return written;
+
+    RPRINTF("error writing notification: %d\n", errno);
+    close(s->msg_fd.fd);
+    s->msg_fd.fd = open(s->msg_path, O_RDWR);
+    if (s->msg_fd.fd < 0)
+        RPRINTF("error reopening FIFO: %d\n", errno);
+
+    return written;
+}
+
+/*
+ * Create and open the control and message FIFOs for this device.
+ * Must be called after the underlying driver has been initialized.
+ *
+ * The control FIFO is BLKTAP_CTRL_DIR/remus_<name>, with every ':' and '/'
+ * in the name part replaced by '_'; the message FIFO has the same path with
+ * ".msg" appended. Both are opened O_RDWR.
+ *
+ * Returns 0 on success. On failure returns -1 with both path pointers freed
+ * and set to NULL and the control descriptor closed.
+ *
+ * NOTE(review): the FIFOs are created with mode S_IRWXU|S_IRWXG|S_IRWXO,
+ * i.e. accessible to every user subject to umask; confirm that is intended.
+ */
+static int ctl_open(td_driver_t *driver, const char* name)
+{
+    struct tdremus_state *s = (struct tdremus_state *)driver->data;
+    int i, l;
+
+    /* first we must ensure that BLKTAP_CTRL_DIR exists */
+    if (mkdir(BLKTAP_CTRL_DIR, 0755) && errno != EEXIST)
+    {
+        DPRINTF("error creating directory %s: %d\n", BLKTAP_CTRL_DIR, errno);
+        return -1;
+    }
+
+    /* use the device name to create the control fifo path */
+    if (asprintf(&s->ctl_path, BLKTAP_CTRL_DIR "/remus_%s", name) < 0) {
+        /* asprintf leaves the pointer undefined on failure */
+        s->ctl_path = NULL;
+        return -1;
+    }
+    /* scrub fifo pathname */
+    for (i = strlen(BLKTAP_CTRL_DIR) + 1, l = strlen(s->ctl_path); i < l; i++) {
+        if (strchr(":/", s->ctl_path[i]))
+            s->ctl_path[i] = '_';
+    }
+    if (asprintf(&s->msg_path, "%s.msg", s->ctl_path) < 0) {
+        s->msg_path = NULL;
+        goto err_ctlfifo;
+    }
+
+    if (mkfifo(s->ctl_path, S_IRWXU|S_IRWXG|S_IRWXO) && errno != EEXIST) {
+        RPRINTF("error creating control FIFO %s: %d\n", s->ctl_path, errno);
+        goto err_msgfifo;
+    }
+
+    if (mkfifo(s->msg_path, S_IRWXU|S_IRWXG|S_IRWXO) && errno != EEXIST) {
+        RPRINTF("error creating message FIFO %s: %d\n", s->msg_path, errno);
+        goto err_msgfifo;
+    }
+
+    /* RDWR so that fd doesn't block select when no writer is present */
+    if ((s->ctl_fd.fd = open(s->ctl_path, O_RDWR)) < 0) {
+        RPRINTF("error opening control FIFO %s: %d\n", s->ctl_path, errno);
+        goto err_msgfifo;
+    }
+
+    if ((s->msg_fd.fd = open(s->msg_path, O_RDWR)) < 0) {
+        RPRINTF("error opening message FIFO %s: %d\n", s->msg_path, errno);
+        goto err_openctlfifo;
+    }
+
+    RPRINTF("control FIFO %s\n", s->ctl_path);
+    RPRINTF("message FIFO %s\n", s->msg_path);
+
+    return 0;
+
+ err_openctlfifo:
+    close(s->ctl_fd.fd);
+    /* do not leave a stale descriptor number behind */
+    s->ctl_fd.fd = -1;
+ err_msgfifo:
+    free(s->msg_path);
+    s->msg_path = NULL;
+ err_ctlfifo:
+    free(s->ctl_path);
+    s->ctl_path = NULL;
+    return -1;
+}
+
+/*
+ * Close the control and message FIFOs, remove them from the filesystem and
+ * free their path strings. Safe to call when they were never opened:
+ * descriptors below 0 and NULL paths are skipped.
+ */
+static void ctl_close(td_driver_t *driver)
+{
+    struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+    /* TODO: close *all* connections */
+
+    /* unopened descriptors are -1, so test for >= 0 rather than non-zero */
+    if (s->ctl_fd.fd >= 0) {
+        close(s->ctl_fd.fd);
+        s->ctl_fd.fd = -1;
+    }
+    if (s->msg_fd.fd >= 0) {
+        close(s->msg_fd.fd);
+        s->msg_fd.fd = -1;
+    }
+
+    if (s->ctl_path) {
+        unlink(s->ctl_path);
+        free(s->ctl_path);
+        s->ctl_path = NULL;
+    }
+    if (s->msg_path) {
+        unlink(s->msg_path);
+        free(s->msg_path);
+        s->msg_path = NULL;
+    }
+}
+
+/*
+ * Register ctl_request() as the read handler of the control FIFO.
+ * Returns 0 on success, -1 if the event could not be registered.
+ */
+static int ctl_register(struct tdremus_state *s)
+{
+    RPRINTF("registering ctl fifo\n");
+
+    /* register ctl fd */
+    s->ctl_fd.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, s->ctl_fd.fd, 0, ctl_request, s);
+    if (s->ctl_fd.id >= 0)
+        return 0;
+
+    RPRINTF("error registering ctrl FIFO %s: %d\n", s->ctl_path, s->ctl_fd.id);
+    return -1;
+}
+
+/* interface */
+
+/*
+ * td_open handler for the remus driver.
+ *
+ * Resets the driver state, parses the peer address out of name, sets up and
+ * registers the control channel, then tries to bind the server socket: a
+ * successful bind starts backup mode, a remus_bind() result of -2 starts
+ * primary mode.
+ *
+ * Returns 0 on success, the error from get_args()/ctl_open()/ctl_register()
+ * if one of those fails, and -EIO (after tdremus_close()) if no mode could
+ * be entered.
+ */
+static int tdremus_open(td_driver_t *driver, const char *name,
+                        td_flag_t flags)
+{
+    struct tdremus_state *s = (struct tdremus_state *)driver->data;
+    int rc;
+
+    RPRINTF("opening %s\n", name);
+
+    /* first we need to get the underlying vbd for this driver stack. To do so we
+     * need to know the vbd's id. Fortunately, for tapdisk2 this is hard-coded as
+     * 0 (see tapdisk2.c)
+     */
+    device_vbd = tapdisk_server_get_vbd(0);
+
+    memset(s, 0, sizeof(*s));
+    s->server_fd.fd = -1;
+    s->stream_fd.fd = -1;
+    s->ctl_fd.fd = -1;
+    s->msg_fd.fd = -1;
+
+    /* TODO: this is only needed so that the server can send writes down
+     * the driver stack from the stream_fd event handler */
+    s->tdremus_driver = driver;
+
+    /* parse name to get info etc */
+    if ((rc = get_args(driver, name)))
+        return rc;
+
+    if ((rc = ctl_open(driver, name))) {
+        RPRINTF("error setting up control channel\n");
+        return rc;
+    }
+
+    if ((rc = ctl_register(s))) {
+        RPRINTF("error registering control channel\n");
+        /* release the FIFOs that ctl_open() created */
+        ctl_close(driver);
+        return rc;
+    }
+
+    if (!(rc = remus_bind(s)))
+        rc = switch_mode(driver, mode_backup);
+    else if (rc == -2)
+        rc = switch_mode(driver, mode_primary);
+
+    if (!rc)
+        return 0;
+
+    tdremus_close(driver);
+    return -EIO;
+}
+
+/*
+ * td_close handler for the remus driver: destroy the in-progress ramdisk
+ * table, free driver_data, tear down the server socket and the replication
+ * stream if they are open, and close the control channel.
+ * Always returns 0.
+ */
+static int tdremus_close(td_driver_t *driver)
+{
+    struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+    RPRINTF("closing\n");
+    if (s->ramdisk.inprogress) {
+        hashtable_destroy(s->ramdisk.inprogress, 0);
+        s->ramdisk.inprogress = NULL;
+    }
+
+    if (s->driver_data) {
+        free(s->driver_data);
+        s->driver_data = NULL;
+    }
+    if (s->server_fd.fd >= 0) {
+        /* fd >= 0 only after remus_bind() registered the accept event, so
+         * drop that event before the descriptor goes away, as
+         * unprotected_start() does */
+        tapdisk_server_unregister_event(s->server_fd.id);
+        close(s->server_fd.fd);
+        s->server_fd.fd = -1;
+    }
+    if (s->stream_fd.fd >= 0)
+        close_stream_fd(s);
+
+    ctl_close(driver);
+
+    return 0;
+}
+
+/* td_get_parent_id handler: we shouldn't have a parent... for now, so this
+ * always returns -EINVAL. */
+static int tdremus_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+    (void)driver;
+    (void)id;
+
+    return -EINVAL;
+}
+
+/* td_validate_parent handler: performs no checks and always returns 0. */
+static int tdremus_validate_parent(td_driver_t *driver,
+                                   td_driver_t *pdriver, td_flag_t flags)
+{
+    (void)driver;
+    (void)pdriver;
+    (void)flags;
+
+    return 0;
+}
+
+/*
+ * Driver descriptor for the remus disk type. The read/write handlers start
+ * out as the unprotected ones; primary_start(), backup_start() and
+ * unprotected_start() replace them when the mode changes.
+ */
+struct tap_disk tapdisk_remus = {
+    .disk_type          = "tapdisk_remus",
+    .private_data_size  = sizeof(struct tdremus_state),
+
+    /* lifecycle */
+    .td_open            = tdremus_open,
+    .td_close           = tdremus_close,
+
+    /* request handlers */
+    .td_queue_read      = unprotected_queue_read,
+    .td_queue_write     = unprotected_queue_write,
+
+    /* parent handling */
+    .td_get_parent_id   = tdremus_get_parent_id,
+    .td_validate_parent = tdremus_validate_parent,
+
+    .td_debug           = NULL,
+};
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * A note on write transactions:
+ * Writes that require updating the BAT or bitmaps cannot be signaled
+ * as complete until all updates have reached disk. Transactions are
+ * used to ensure proper ordering in these cases. The two types of
+ * transactions are as follows:
+ * - Bitmap updates only: data writes that require updates to the same
+ * bitmap are grouped in a transaction. Only after all data writes
+ * in a transaction complete does the bitmap write commence. Only
+ * after the bitmap write finishes are the data writes signalled as
+ * complete.
+ * - BAT and bitmap updates: data writes are grouped in transactions
+ * as above, but a special extra write is included in the transaction,
+ * which zeros out the newly allocated bitmap on disk. When the data
+ * writes and the zero-bitmap write complete, the BAT and bitmap writes
+ * are started in parallel. The transaction is completed only after both
+ * the BAT and bitmap writes successfully return.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h> /* for memset. */
+#include <libaio.h>
+#include <sys/mman.h>
+
+#include "libvhd.h"
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-disktype.h"
+
+unsigned int SPB; /* NOTE(review): presumably "sectors per block"; assigned elsewhere in this file - confirm */
+
+#define DEBUGGING 2 /* selects the DBG/ERR/TRACE implementation below: 1 = DPRINTF, 2 = tlog, anything else = disabled */
+#define ASSERTING 1 /* 1 enables ASSERT(); any other value compiles it out */
+#define MICROSOFT_COMPAT /* NOTE(review): effect not visible in this part of the file */
+
+#define VHD_BATMAP_MAX_RETRIES 10 /* NOTE(review): presumably a retry limit for batmap operations - confirm at the point of use */
+
+#define __TRACE(s) /* log at TLOG_DBG: file name, queued/completed/returned counters, data requests in use and s->bat.pbw_blk */ \
+ do { \
+ DBG(TLOG_DBG, "%s: QUEUED: %" PRIu64 ", COMPLETED: %" \
+ PRIu64", RETURNED: %" PRIu64 ", DATA_ALLOCATED: " \
+ "%lu, BBLK: 0x%04x\n", \
+ s->vhd.file, s->queued, s->completed, s->returned, \
+ VHD_REQS_DATA - s->vreq_free_count, \
+ s->bat.pbw_blk); \
+ } while(0)
+
+#define __ASSERT(_p) \
+ if (!(_p)) { \
+ DPRINTF("%s:%d: FAILED ASSERTION: '%s'\n", \
+ __FILE__, __LINE__, #_p); \
+ DBG(TLOG_WARN, "%s:%d: FAILED ASSERTION: '%s'\n", \
+ __FILE__, __LINE__, #_p); \
+ tlog_flush(); \
+ abort(); \
+ }
+
+#if (DEBUGGING == 1) /* messages go through DPRINTF; TRACE is a no-op */
+ #define DBG(level, _f, _a...) DPRINTF(_f, ##_a)
+ #define ERR(err, _f, _a...) DPRINTF("ERROR: %d: " _f, err, ##_a)
+ #define TRACE(s) ((void)0)
+#elif (DEBUGGING == 2) /* messages go through tlog_write/tlog_error; TRACE expands to __TRACE */
+ #define DBG(level, _f, _a...) tlog_write(level, _f, ##_a)
+ #define ERR(err, _f, _a...) tlog_error(err, _f, ##_a)
+ #define TRACE(s) __TRACE(s)
+#else /* debugging disabled: all three macros expand to ((void)0) */
+ #define DBG(level, _f, _a...) ((void)0)
+ #define ERR(err, _f, _a...) ((void)0)
+ #define TRACE(s) ((void)0)
+#endif
+
+#if (ASSERTING == 1) /* assertions enabled: ASSERT expands to __ASSERT */
+ #define ASSERT(_p) __ASSERT(_p)
+#else /* assertions disabled: ASSERT expands to ((void)0) */
+ #define ASSERT(_p) ((void)0)
+#endif
+
+/******VHD DEFINES******/
+/* number of per-block bitmaps kept in the in-memory cache */
+#define VHD_CACHE_SIZE 32
+
+/* request pool sizing: one slot per tapdisk data request, plus
+ * VHD_CACHE_SIZE + 2 metadata requests
+ * NOTE(review): presumably one per cached bitmap plus the BAT and
+ * zero-bitmap requests -- confirm */
+#define VHD_REQS_DATA TAPDISK_DATA_REQUESTS
+#define VHD_REQS_META (VHD_CACHE_SIZE + 2)
+#define VHD_REQS_TOTAL (VHD_REQS_DATA + VHD_REQS_META)
+
+/* values stored in vhd_request.op */
+#define VHD_OP_BAT_WRITE 0
+#define VHD_OP_DATA_READ 1
+#define VHD_OP_DATA_WRITE 2
+#define VHD_OP_BITMAP_READ 3
+#define VHD_OP_BITMAP_WRITE 4
+#define VHD_OP_ZERO_BM_WRITE 5
+
+/* non-negative results of read_bitmap_cache() */
+#define VHD_BM_BAT_LOCKED 0
+#define VHD_BM_BAT_CLEAR 1
+#define VHD_BM_BIT_CLEAR 2
+#define VHD_BM_BIT_SET 3
+#define VHD_BM_NOT_CACHED 4
+#define VHD_BM_READ_PENDING 5
+
+/* bits of vhd_state.flags, fixed at open time */
+#define VHD_FLAG_OPEN_RDONLY 1
+#define VHD_FLAG_OPEN_NO_CACHE 2
+#define VHD_FLAG_OPEN_QUIET 4
+#define VHD_FLAG_OPEN_STRICT 8
+#define VHD_FLAG_OPEN_QUERY 16
+#define VHD_FLAG_OPEN_PREALLOCATE 32
+
+/* bits of vhd_bat_state.status */
+#define VHD_FLAG_BAT_LOCKED 1
+#define VHD_FLAG_BAT_WRITE_STARTED 2
+
+/* bits of vhd_bitmap.status */
+#define VHD_FLAG_BM_UPDATE_BAT 1
+#define VHD_FLAG_BM_WRITE_PENDING 2
+#define VHD_FLAG_BM_READ_PENDING 4
+#define VHD_FLAG_BM_LOCKED 8
+
+/* bits of vhd_request.flags */
+#define VHD_FLAG_REQ_UPDATE_BAT 1
+#define VHD_FLAG_REQ_UPDATE_BITMAP 2
+#define VHD_FLAG_REQ_QUEUED 4
+#define VHD_FLAG_REQ_FINISHED 8
+
+/* bits of vhd_transaction.status */
+#define VHD_FLAG_TX_LIVE 1
+#define VHD_FLAG_TX_UPDATE_BAT 2
+
+/* all flag words above are eight bits wide */
+typedef uint8_t vhd_flag_t;
+
+struct vhd_state;
+struct vhd_request;
+
+/*
+ * Singly linked list of requests, chained through vhd_request.next.
+ * An empty list has head == tail == NULL.
+ */
+struct vhd_req_list {
+ struct vhd_request *head;
+ struct vhd_request *tail;
+};
+
+/*
+ * Bookkeeping for one group of writes that must complete together.
+ * add_to_transaction() appends to @requests and bumps @started; the
+ * transaction counts as completed once @finished equals @started.
+ * @status holds VHD_FLAG_TX_* bits. No request may be added while
+ * @closed is set.
+ */
+struct vhd_transaction {
+ int error;
+ int closed;
+ int started;
+ int finished;
+ vhd_flag_t status;
+ struct vhd_req_list requests;
+};
+
+/*
+ * One I/O issued by this driver: either a data request handed in by
+ * tapdisk (@treq copied from the caller) or an internal metadata
+ * read/write whose @treq fields are filled in locally.
+ */
+struct vhd_request {
+ int error;
+ /* one of VHD_OP_* */
+ uint8_t op;
+ /* VHD_FLAG_REQ_* bits */
+ vhd_flag_t flags;
+ td_request_t treq;
+ /* async I/O control block queued via td_queue_tiocb() */
+ struct tiocb tiocb;
+ /* owning driver state */
+ struct vhd_state *state;
+ /* link for vhd_req_list */
+ struct vhd_request *next;
+ /* transaction this request belongs to, if any */
+ struct vhd_transaction *tx;
+};
+
+/*
+ * In-memory copy of the block allocation table (BAT) and batmap, plus
+ * the state of the single BAT update that may be in flight.
+ * @status holds VHD_FLAG_BAT_* bits; @bat_buf is a sector-sized,
+ * sector-aligned scratch buffer used to write one BAT sector.
+ */
+struct vhd_bat_state {
+ vhd_bat_t bat;
+ vhd_batmap_t batmap;
+ vhd_flag_t status;
+ uint32_t pbw_blk; /* blk num of pending write */
+ uint64_t pbw_offset; /* file offset of same */
+ struct vhd_request req; /* for writing bat table */
+ struct vhd_request zero_req; /* for initializing bitmaps */
+ char *bat_buf;
+};
+
+/*
+ * One cached per-block sector bitmap. @blk is the block it describes
+ * and @status holds VHD_FLAG_BM_* bits. @req is the request used to
+ * read or write the bitmap itself.
+ */
+struct vhd_bitmap {
+ u32 blk;
+ u64 seqno; /* lru sequence number */
+ vhd_flag_t status;
+
+ char *map; /* map should only be modified
+ * in finish_bitmap_write */
+ char *shadow; /* in-memory bitmap changes are
+ * made to shadow and copied to
+ * map only after having been
+ * flushed to disk */
+ struct vhd_transaction tx; /* transaction data structure
+ * encapsulating data, bitmap,
+ * and bat writes */
+ struct vhd_req_list queue; /* data writes waiting for next
+ * transaction */
+ struct vhd_req_list waiting; /* pending requests that cannot
+ * be serviced until this bitmap
+ * is read from disk */
+ struct vhd_request req;
+};
+
+/*
+ * Per-image driver state, stored in td_driver_t.data. The whole
+ * structure is zeroed at the start of __vhd_open() and again at the
+ * end of _vhd_close().
+ */
+struct vhd_state {
+ /* VHD_FLAG_OPEN_* bits */
+ vhd_flag_t flags;
+
+ /* VHD stuff */
+ vhd_context_t vhd;
+ u32 spp; /* sectors per page */
+ u32 spb; /* sectors per block */
+ u64 next_db; /* pointer to the next
+ * (unallocated) datablock */
+
+ struct vhd_bat_state bat;
+
+ u64 bm_lru; /* lru sequence number */
+ u32 bm_secs; /* size of bitmap, in sectors */
+ /* bitmaps currently installed in the cache; unused slots are NULL */
+ struct vhd_bitmap *bitmap[VHD_CACHE_SIZE];
+
+ /* stack of unused bitmap objects and its depth */
+ int bm_free_count;
+ struct vhd_bitmap *bitmap_free[VHD_CACHE_SIZE];
+ /* backing storage for every bitmap object */
+ struct vhd_bitmap bitmap_list[VHD_CACHE_SIZE];
+
+ /* stack of unused data requests, its depth, and backing storage */
+ int vreq_free_count;
+ struct vhd_request *vreq_free[VHD_REQS_DATA];
+ struct vhd_request vreq_list[VHD_REQS_DATA];
+
+ td_driver_t *driver;
+
+ /* statistics; read_size and write_size are counted in sectors */
+ uint64_t queued;
+ uint64_t completed;
+ uint64_t returned;
+ uint64_t reads;
+ uint64_t read_size;
+ uint64_t writes;
+ uint64_t write_size;
+};
+
+/* flag-word helpers: test yields the masked bits (non-zero when set);
+ * set and clear modify @word in place */
+#define test_vhd_flag(word, flag) ((word) & (flag))
+#define set_vhd_flag(word, flag) ((word) |= (flag))
+#define clear_vhd_flag(word, flag) ((word) &= ~(flag))
+
+/* cached BAT entry for block @blk; no bounds check is performed */
+#define bat_entry(s, blk) ((s)->bat.bat.bat[(blk)])
+
+/* defined later in the file */
+static void vhd_complete(void *, struct tiocb *, int);
+static void finish_data_transaction(struct vhd_state *, struct vhd_bitmap *);
+
+/*
+ * Shared zero buffer: the state that created the mapping, the mapping
+ * size in bytes, and the mapping itself (NULL when absent). Managed by
+ * vhd_initialize() and vhd_free().
+ */
+static struct vhd_state *_vhd_master;
+static unsigned long _vhd_zsize;
+static char *_vhd_zeros;
+
+/*
+ * Map the shared, read-only buffer of zeros returned by vhd_zeros().
+ * The mapping is created once, by the first state to get here, which
+ * is recorded as its owner in _vhd_master; later callers return
+ * immediately and reuse it.
+ *
+ * The buffer is two pages, plus VHD_BLOCK_SIZE when the creating state
+ * has VHD_FLAG_OPEN_PREALLOCATE set.
+ * NOTE(review): the size is fixed by the first opener, so a later
+ * state that needs the larger size would trip the check in
+ * _get_vhd_zeros() -- confirm whether that combination can occur.
+ *
+ * Returns 0 on success or a negative errno value if mmap() fails.
+ */
+static int
+vhd_initialize(struct vhd_state *s)
+{
+	int err;
+
+	if (_vhd_zeros)
+		return 0;
+
+	_vhd_zsize = 2 * getpagesize();
+	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
+		_vhd_zsize += VHD_BLOCK_SIZE;
+
+	_vhd_zeros = mmap(0, _vhd_zsize, PROT_READ,
+			  MAP_SHARED | MAP_ANON, -1, 0);
+	if (_vhd_zeros == MAP_FAILED) {
+		/* capture errno before the logging call can overwrite it */
+		err = -errno;
+		EPRINTF("vhd_initialize failed: %d\n", err);
+		_vhd_zeros = NULL;
+		_vhd_zsize = 0;
+		return err;
+	}
+
+	_vhd_master = s;
+	return 0;
+}
+
+/*
+ * Tear down the shared zero buffer. Only the state that created the
+ * mapping (_vhd_master) may release it; for any other state, or when
+ * no mapping exists, this is a no-op.
+ */
+static void
+vhd_free(struct vhd_state *s)
+{
+	int owns_mapping = (_vhd_master == s);
+
+	if (owns_mapping && _vhd_zeros) {
+		munmap(_vhd_zeros, _vhd_zsize);
+		_vhd_master = NULL;
+		_vhd_zeros = NULL;
+		_vhd_zsize = 0;
+	}
+}
+
+/*
+ * Return the shared zero buffer after checking that it exists and is
+ * at least @size bytes long. A request that cannot be satisfied is
+ * logged with the name of the calling function (@func) and trips an
+ * assertion.
+ */
+static char *
+_get_vhd_zeros(const char *func, unsigned long size)
+{
+	int usable = (_vhd_zeros != NULL && size <= _vhd_zsize);
+
+	if (usable)
+		return _vhd_zeros;
+
+	EPRINTF("invalid zero request from %s: %lu, %lu, %p\n",
+		func, size, _vhd_zsize, _vhd_zeros);
+	ASSERT(0);
+
+	return _vhd_zeros;
+}
+
+/* zero buffer of at least @size bytes, tagged with the caller's name */
+#define vhd_zeros(size) _get_vhd_zeros(__func__, size)
+
+/*
+ * Mark block @blk in the batmap. Does nothing when no batmap is
+ * loaded.
+ */
+static inline void
+set_batmap(struct vhd_state *s, uint32_t blk)
+{
+	if (!s->bat.batmap.map)
+		return;
+
+	vhd_batmap_set(&s->vhd, &s->bat.batmap, blk);
+	DBG(TLOG_DBG, "block 0x%x completely full\n", blk);
+}
+
+/*
+ * Query the batmap bit for block @blk. Returns 0 when no batmap is
+ * loaded, otherwise the result of vhd_batmap_test().
+ */
+static inline int
+test_batmap(struct vhd_state *s, uint32_t blk)
+{
+	return (s->bat.batmap.map ?
+		vhd_batmap_test(&s->vhd, &s->bat.batmap, blk) : 0);
+}
+
+/*
+ * Overwrite the last 512 bytes of the image file with the filler byte
+ * 0xc7. Fixed-type images are left untouched. _vhd_close() rewrites
+ * the footer for images opened with VHD_FLAG_OPEN_STRICT.
+ *
+ * Returns 0 on success or a negative errno value. A short write is
+ * reported as -EIO. The error code is captured at the point of
+ * failure, so neither free() nor a stale errno can change it.
+ */
+static int
+vhd_kill_footer(struct vhd_state *s)
+{
+	int err;
+	off_t end;
+	ssize_t ret;
+	char *filler;
+
+	if (s->vhd.footer.type == HD_TYPE_FIXED)
+		return 0;
+
+	/* posix_memalign() returns a positive error number */
+	err = posix_memalign((void **)&filler, 512, 512);
+	if (err)
+		return -err;
+
+	memset(filler, 0xc7, 512);
+
+	end = lseek(s->vhd.fd, 0, SEEK_END);
+	if (end == (off_t)-1) {
+		err = (errno ? -errno : -EIO);
+		goto out;
+	}
+
+	if (lseek(s->vhd.fd, (end - 512), SEEK_SET) == (off_t)-1) {
+		err = (errno ? -errno : -EIO);
+		goto out;
+	}
+
+	ret = write(s->vhd.fd, filler, 512);
+	if (ret != 512) {
+		/* errno is only meaningful when write() returned -1 */
+		err = ((ret == -1 && errno) ? -errno : -EIO);
+		goto out;
+	}
+
+	err = 0;
+
+ out:
+	free(filler);
+	return err;
+}
+
+/*
+ * Compute s->next_db, the first sector past all metadata and all
+ * allocated blocks: start from the rounded-up end of the headers, then
+ * move past every allocated block that lies at or beyond the current
+ * candidate.
+ *
+ * Returns 0 on success or the error from vhd_end_of_headers().
+ */
+static inline int
+find_next_free_block(struct vhd_state *s)
+{
+	off_t hdr_end;
+	uint32_t blk;
+	int err;
+
+	err = vhd_end_of_headers(&s->vhd, &hdr_end);
+	if (err)
+		return err;
+
+	s->next_db = secs_round_up(hdr_end);
+
+	for (blk = 0; blk < s->bat.bat.entries; blk++) {
+		uint32_t start = bat_entry(s, blk);
+
+		if (start == DD_BLK_UNUSED || start < s->next_db)
+			continue;
+
+		s->next_db = start + s->spb + s->bm_secs;
+	}
+
+	return 0;
+}
+
+/*
+ * Release the cached BAT, the batmap and the BAT sector buffer, then
+ * zero the whole vhd_bat_state.
+ *
+ * The memset must cover all of s->bat. Clearing only
+ * sizeof(struct vhd_bat) would reset just the leading 'bat' member and
+ * leave batmap.map and bat_buf pointing at freed memory. The open
+ * failure paths call this function more than once, which would then
+ * free those pointers twice.
+ */
+static void
+vhd_free_bat(struct vhd_state *s)
+{
+	free(s->bat.bat.bat);
+	free(s->bat.batmap.map);
+	free(s->bat.bat_buf);
+	memset(&s->bat, 0, sizeof(s->bat));
+}
+
+/*
+ * Load the BAT and, when the image has one, the batmap, and allocate
+ * the sector-sized buffer used for BAT writes. For writable images
+ * this also locates the next free data block.
+ *
+ * The batmap is mandatory for writable images: the first read error
+ * fails the open. For read-only images the read is retried up to
+ * VHD_BATMAP_MAX_RETRIES times and a persistent error is ignored.
+ *
+ * Returns 0 on success or a negative error code. Everything acquired
+ * here is released again on failure, except when vhd_read_bat()
+ * itself fails, which returns its error directly.
+ */
+static int
+vhd_initialize_bat(struct vhd_state *s)
+{
+	int err, batmap_required, i;
+
+	/* reset the complete bat state, not just its first member */
+	memset(&s->bat, 0, sizeof(s->bat));
+
+	err = vhd_read_bat(&s->vhd, &s->bat.bat);
+	if (err) {
+		EPRINTF("%s: reading bat: %d\n", s->vhd.file, err);
+		return err;
+	}
+
+	batmap_required = 1;
+	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) {
+		batmap_required = 0;
+	} else {
+		err = find_next_free_block(s);
+		if (err)
+			goto fail;
+	}
+
+	if (vhd_has_batmap(&s->vhd)) {
+		for (i = 0; i < VHD_BATMAP_MAX_RETRIES; i++) {
+			err = vhd_read_batmap(&s->vhd, &s->bat.batmap);
+			if (err) {
+				EPRINTF("%s: reading batmap: %d\n",
+					s->vhd.file, err);
+				if (batmap_required)
+					goto fail;
+			} else {
+				break;
+			}
+		}
+		if (err)
+			EPRINTF("%s: ignoring non-critical batmap error\n",
+				s->vhd.file);
+	}
+
+	err = posix_memalign((void **)&s->bat.bat_buf,
+			     VHD_SECTOR_SIZE, VHD_SECTOR_SIZE);
+	if (err) {
+		s->bat.bat_buf = NULL;
+		/* posix_memalign() returns a positive error number;
+		 * this driver reports errors as negative values */
+		err = -err;
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	vhd_free_bat(s);
+	return err;
+}
+
+/*
+ * Free the map and shadow buffers of every bitmap object, empty the
+ * free stack and zero the backing array. Safe on a cache that was
+ * never populated or only partly populated.
+ */
+static void
+vhd_free_bitmap_cache(struct vhd_state *s)
+{
+	int slot;
+
+	for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+		free(s->bitmap_list[slot].map);
+		free(s->bitmap_list[slot].shadow);
+		s->bitmap_free[slot] = NULL;
+	}
+
+	memset(s->bitmap_list, 0, sizeof(s->bitmap_list));
+}
+
+/*
+ * Allocate the bitmap cache: for each of the VHD_CACHE_SIZE bitmap
+ * objects, a zeroed 512-byte-aligned map and shadow buffer of
+ * s->bm_secs sectors. Every object is pushed on the free stack.
+ *
+ * Returns 0 on success or a negative error code. On failure all
+ * buffers allocated so far are released.
+ */
+static int
+vhd_initialize_bitmap_cache(struct vhd_state *s)
+{
+	int i, err, map_size;
+	struct vhd_bitmap *bm;
+
+	memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);
+
+	s->bm_lru = 0;
+	map_size = vhd_sectors_to_bytes(s->bm_secs);
+	s->bm_free_count = VHD_CACHE_SIZE;
+
+	for (i = 0; i < VHD_CACHE_SIZE; i++) {
+		bm = s->bitmap_list + i;
+
+		err = posix_memalign((void **)&bm->map, 512, map_size);
+		if (err) {
+			bm->map = NULL;
+			goto fail;
+		}
+
+		err = posix_memalign((void **)&bm->shadow, 512, map_size);
+		if (err) {
+			bm->shadow = NULL;
+			goto fail;
+		}
+
+		memset(bm->map, 0, map_size);
+		memset(bm->shadow, 0, map_size);
+		s->bitmap_free[i] = bm;
+	}
+
+	return 0;
+
+fail:
+	vhd_free_bitmap_cache(s);
+	/* posix_memalign() returns a positive error number; this driver
+	 * reports errors as negative values */
+	return -err;
+}
+
+/*
+ * Set up the state needed for a dynamic image: read and validate the
+ * dynamic-disk header, derive the sector geometry (sectors per page,
+ * sectors per block, bitmap size in sectors) and, unless the image was
+ * opened with VHD_FLAG_OPEN_NO_CACHE, load the BAT and allocate the
+ * bitmap cache.
+ *
+ * Returns 0 on success or a non-zero error code.
+ */
+static int
+vhd_initialize_dynamic_disk(struct vhd_state *s)
+{
+	int rc;
+
+	rc = vhd_get_header(&s->vhd);
+	if (rc) {
+		if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+			EPRINTF("Error reading VHD DD header.\n");
+		return rc;
+	}
+
+	if (s->vhd.header.hdr_ver != 0x00010000) {
+		EPRINTF("unsupported header version! (0x%x)\n",
+			s->vhd.header.hdr_ver);
+		return -EINVAL;
+	}
+
+	s->spp = getpagesize() >> VHD_SECTOR_SHIFT;
+	s->spb = s->vhd.header.block_size >> VHD_SECTOR_SHIFT;
+	s->bm_secs = secs_round_up_no_zero(s->spb >> 3);
+
+	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_NO_CACHE))
+		return 0;
+
+	rc = vhd_initialize_bat(s);
+	if (!rc) {
+		rc = vhd_initialize_bitmap_cache(s);
+		if (rc)
+			vhd_free_bat(s);
+	}
+
+	return rc;
+}
+
+/*
+ * Reject images whose creator application starts with "tap" and whose
+ * creator version is newer than VHD_CURRENT_VERSION. Images from any
+ * other creator are accepted without a version check.
+ *
+ * Returns 0 if the image may be used, -EINVAL otherwise.
+ */
+static int
+vhd_check_version(struct vhd_state *s)
+{
+	if (strncmp(s->vhd.footer.crtr_app, "tap", 3) != 0)
+		return 0;
+
+	if (s->vhd.footer.crtr_ver <= VHD_CURRENT_VERSION)
+		return 0;
+
+	if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+		EPRINTF("WARNING: %s vhd creator version 0x%08x, "
+			"but only versions up to 0x%08x are "
+			"supported for IO\n", s->vhd.file,
+			s->vhd.footer.crtr_ver, VHD_CURRENT_VERSION);
+
+	return -EINVAL;
+}
+
+/*
+ * Log a one-line summary of a freshly opened image: creator and
+ * version and, for dynamic images, the number of BAT entries (b),
+ * allocated blocks (a), blocks flagged in the batmap (f) and the next
+ * free data block (n). Silent when VHD_FLAG_OPEN_QUIET is set.
+ */
+static void
+vhd_log_open(struct vhd_state *s)
+{
+	char buf[5];
+	uint32_t i, allocated, full;
+
+	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+		return;
+
+	/*
+	 * The creator field is a fixed-width on-disk field, four bytes
+	 * in the VHD footer format, and need not be NUL-terminated.
+	 * Bound the read with a precision; a plain "%s" makes snprintf
+	 * scan for a terminator past the end of the field.
+	 */
+	snprintf(buf, sizeof(buf), "%.4s", s->vhd.footer.crtr_app);
+	if (!vhd_type_dynamic(&s->vhd)) {
+		DPRINTF("%s version: %s 0x%08x\n",
+			s->vhd.file, buf, s->vhd.footer.crtr_ver);
+		return;
+	}
+
+	allocated = 0;
+	full = 0;
+
+	for (i = 0; i < s->bat.bat.entries; i++) {
+		if (bat_entry(s, i) != DD_BLK_UNUSED)
+			allocated++;
+		if (test_batmap(s, i))
+			full++;
+	}
+
+	DPRINTF("%s version: %s 0x%08x, b: %u, a: %u, f: %u, n: %"PRIu64"\n",
+		s->vhd.file, buf, s->vhd.footer.crtr_ver, s->bat.bat.entries,
+		allocated, full, s->next_db);
+}
+
+/*
+ * Open image @name for @driver with the given VHD_FLAG_OPEN_* @flags.
+ *
+ * The driver's vhd_state is zeroed and rebuilt: the shared zero buffer
+ * is set up, the image is opened through libvhd (retried once with the
+ * libvhd log level raised if the first attempt fails), the creator
+ * version is checked, dynamic-disk metadata is loaded, the data
+ * request free stack is filled and driver->info is populated. For a
+ * writable image opened with VHD_FLAG_OPEN_STRICT the footer is
+ * overwritten; _vhd_close() writes it back.
+ *
+ * Returns 0 on success or a non-zero error code; on failure everything
+ * acquired here is released.
+ */
+static int
+__vhd_open(td_driver_t *driver, const char *name, vhd_flag_t flags)
+{
+	int i, o_flags, err;
+	struct vhd_state *s;
+
+	DBG(TLOG_INFO, "vhd_open: %s\n", name);
+	if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT))
+		libvhd_set_log_level(1);
+
+	s = (struct vhd_state *)driver->data;
+	memset(s, 0, sizeof(struct vhd_state));
+
+	s->flags = flags;
+	s->driver = driver;
+
+	err = vhd_initialize(s);
+	if (err)
+		return err;
+
+	o_flags = ((test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) ?
+		   VHD_OPEN_RDONLY : VHD_OPEN_RDWR);
+
+	err = vhd_open(&s->vhd, name, o_flags);
+	if (err) {
+		/* try again with libvhd logging raised */
+		libvhd_set_log_level(1);
+		err = vhd_open(&s->vhd, name, o_flags);
+		if (err) {
+			EPRINTF("Unable to open [%s] (%d)!\n", name, err);
+			/*
+			 * Nothing else has been set up yet, but
+			 * vhd_initialize() may have mapped the zero
+			 * buffer on behalf of this state; release it,
+			 * as the other failure paths do.
+			 */
+			vhd_free(s);
+			return err;
+		}
+	}
+
+	err = vhd_check_version(s);
+	if (err)
+		goto fail;
+
+	/* defaults for fixed images; dynamic images override them */
+	s->spb = s->spp = 1;
+
+	if (vhd_type_dynamic(&s->vhd)) {
+		err = vhd_initialize_dynamic_disk(s);
+		if (err)
+			goto fail;
+	}
+
+	vhd_log_open(s);
+
+	SPB = s->spb;
+
+	s->vreq_free_count = VHD_REQS_DATA;
+	for (i = 0; i < VHD_REQS_DATA; i++)
+		s->vreq_free[i] = s->vreq_list + i;
+
+	driver->info.size = s->vhd.footer.curr_size >> VHD_SECTOR_SHIFT;
+	driver->info.sector_size = VHD_SECTOR_SIZE;
+	driver->info.info = 0;
+
+	DBG(TLOG_INFO, "vhd_open: done (sz:%"PRIu64", sct:%"PRIu64
+	    ", inf:%u)\n",
+	    driver->info.size, driver->info.sector_size, driver->info.info);
+
+	if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT) &&
+	    !test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) {
+		err = vhd_kill_footer(s);
+		if (err) {
+			DPRINTF("ERROR killing footer: %d\n", err);
+			goto fail;
+		}
+		/* count it as a write so that _vhd_close() restores
+		 * the footer */
+		s->writes++;
+	}
+
+	return 0;
+
+ fail:
+	vhd_free_bat(s);
+	vhd_free_bitmap_cache(s);
+	vhd_close(&s->vhd);
+	vhd_free(s);
+	return err;
+}
+
+/*
+ * tapdisk open entry point: translate TD_OPEN_* @flags into
+ * VHD_FLAG_OPEN_* bits and open the image. A query open implies
+ * quiet, read-only and no-cache. Preallocation is requested for every
+ * storage type except NFS and LVM.
+ */
+static int
+_vhd_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+	vhd_flag_t vhd_flags = 0;
+	int preallocate;
+
+	if (flags & TD_OPEN_QUERY)
+		vhd_flags |= (VHD_FLAG_OPEN_QUERY |
+			      VHD_FLAG_OPEN_QUIET |
+			      VHD_FLAG_OPEN_RDONLY |
+			      VHD_FLAG_OPEN_NO_CACHE);
+	if (flags & TD_OPEN_STRICT)
+		vhd_flags |= VHD_FLAG_OPEN_STRICT;
+	if (flags & TD_OPEN_QUIET)
+		vhd_flags |= VHD_FLAG_OPEN_QUIET;
+	if (flags & TD_OPEN_RDONLY)
+		vhd_flags |= VHD_FLAG_OPEN_RDONLY;
+
+	/* pre-allocate for all but NFS and LVM storage */
+	preallocate = (driver->storage != TAPDISK_STORAGE_TYPE_NFS &&
+		       driver->storage != TAPDISK_STORAGE_TYPE_LVM);
+	if (preallocate)
+		vhd_flags |= VHD_FLAG_OPEN_PREALLOCATE;
+
+	return __vhd_open(driver, name, vhd_flags);
+}
+
+/*
+ * Log BAT statistics at close time: number of entries (b), allocated
+ * blocks (a), blocks flagged in the batmap (f) and the next free data
+ * block (n). Silent when VHD_FLAG_OPEN_QUIET is set.
+ */
+static void
+vhd_log_close(struct vhd_state *s)
+{
+	uint32_t blk;
+	uint32_t in_use = 0;
+	uint32_t complete = 0;
+
+	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+		return;
+
+	for (blk = 0; blk < s->bat.bat.entries; blk++) {
+		in_use += (bat_entry(s, blk) != DD_BLK_UNUSED);
+		complete += (test_batmap(s, blk) != 0);
+	}
+
+	DPRINTF("%s: b: %u, a: %u, f: %u, n: %"PRIu64"\n",
+		s->vhd.file, s->bat.bat.entries, in_use, complete, s->next_db);
+}
+
+/*
+ * tapdisk close entry point. For writable images that were opened
+ * strict or have been written to, the footer and, if present, the
+ * batmap are written back. All driver resources are then released and
+ * the state is zeroed.
+ *
+ * Write-back failures are logged but do not change the result; the
+ * function always returns 0.
+ */
+static int
+_vhd_close(td_driver_t *driver)
+{
+	int err;
+	struct vhd_state *s;
+
+	DBG(TLOG_WARN, "vhd_close\n");
+	s = (struct vhd_state *)driver->data;
+
+	/* don't write footer if tapdisk is read-only */
+	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY))
+		goto free;
+
+	/*
+	 * write footer if:
+	 * - we killed it on open (opened with strict)
+	 * - we've written data since opening
+	 */
+	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_STRICT) || s->writes) {
+		/* lend the cached BAT to the libvhd context for the
+		 * duration of the footer write, then detach it again
+		 * so vhd_close() does not see it */
+		memcpy(&s->vhd.bat, &s->bat.bat, sizeof(vhd_bat_t));
+		err = vhd_write_footer(&s->vhd, &s->vhd.footer);
+		memset(&s->vhd.bat, 0, sizeof(vhd_bat_t));
+
+		if (err)
+			EPRINTF("writing %s footer: %d\n", s->vhd.file, err);
+
+		if (!vhd_has_batmap(&s->vhd))
+			goto free;
+
+		err = vhd_write_batmap(&s->vhd, &s->bat.batmap);
+		if (err)
+			EPRINTF("writing %s batmap: %d\n", s->vhd.file, err);
+	}
+
+ free:
+	vhd_log_close(s);
+	vhd_free_bat(s);
+	vhd_free_bitmap_cache(s);
+	vhd_close(&s->vhd);
+	vhd_free(s);
+
+	memset(s, 0, sizeof(struct vhd_state));
+
+	return 0;
+}
+
+/*
+ * Check that @parent_driver is an acceptable parent for the image
+ * behind @child_driver.
+ *
+ * A non-VHD parent is accepted only when the child is a VHD
+ * differencing image whose parent is recorded as raw. A VHD parent is
+ * accepted when its footer UUID matches the parent UUID stored in the
+ * child's header. @flags is currently unused.
+ *
+ * Returns 0 if the parent is valid, -EINVAL otherwise.
+ */
+int
+vhd_validate_parent(td_driver_t *child_driver,
+		    td_driver_t *parent_driver, td_flag_t flags)
+{
+	struct vhd_state *child = (struct vhd_state *)child_driver->data;
+	struct vhd_state *parent;
+
+	if (parent_driver->type != DISK_TYPE_VHD) {
+		if (child_driver->type != DISK_TYPE_VHD)
+			return -EINVAL;
+		if (child->vhd.footer.type != HD_TYPE_DIFF)
+			return -EINVAL;
+		if (!vhd_parent_raw(&child->vhd))
+			return -EINVAL;
+		return 0;
+	}
+
+	parent = (struct vhd_state *)parent_driver->data;
+
+	/*
+	 * The parent file's modification time is deliberately not
+	 * compared against the timestamp recorded in the child. That
+	 * check was removed because of cases like:
+	 * - parent VHD marked as 'hidden'
+	 * - parent VHD modified during coalesce
+	 */
+
+	if (vhd_uuid_compare(&child->vhd.header.prt_uuid, &parent->vhd.footer.uuid)) {
+		DPRINTF("ERROR: %s: %s, %s: parent uuid has changed since "
+			"snapshot. Child image no longer valid.\n",
+			__func__, child->vhd.file, parent->vhd.file);
+		return -EINVAL;
+	}
+
+	/* TODO: compare sizes */
+
+	return 0;
+}
+
+/*
+ * Fill @id with the name and driver type of this image's parent.
+ *
+ * Returns TD_NO_PARENT when the image is not a differencing disk, the
+ * error from vhd_parent_locator_get() if the parent cannot be located,
+ * and 0 on success. On success id->name holds the string produced by
+ * vhd_parent_locator_get().
+ * NOTE(review): presumably the caller must free id->name -- confirm.
+ */
+int
+vhd_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+	struct vhd_state *s = (struct vhd_state *)driver->data;
+	char *parent_name;
+	int rc;
+
+	DBG(TLOG_DBG, "\n");
+	memset(id, 0, sizeof(*id));
+
+	if (s->vhd.footer.type != HD_TYPE_DIFF)
+		return TD_NO_PARENT;
+
+	rc = vhd_parent_locator_get(&s->vhd, &parent_name);
+	if (rc)
+		return rc;
+
+	id->name = parent_name;
+	if (vhd_parent_raw(&s->vhd)) {
+		DPRINTF("VHD: parent is raw\n");
+		id->drivertype = DISK_TYPE_AIO;
+	} else {
+		id->drivertype = DISK_TYPE_VHD;
+	}
+
+	return 0;
+}
+
+/* Reset @list to the empty state; the requests themselves are left
+ * untouched. */
+static inline void
+clear_req_list(struct vhd_req_list *list)
+{
+	list->tail = NULL;
+	list->head = NULL;
+}
+
+/*
+ * Append @e to @list. e->next is not modified here; the callers in
+ * this file clear it before queueing a request.
+ */
+static inline void
+add_to_tail(struct vhd_req_list *list, struct vhd_request *e)
+{
+	if (list->head) {
+		list->tail->next = e;
+		list->tail = e;
+		return;
+	}
+
+	list->head = e;
+	list->tail = e;
+}
+
+/*
+ * Unlink @e from @list.
+ *
+ * Returns 0 if @e was found and removed, -EINVAL if it is not on the
+ * list. An empty list is reported as -EINVAL rather than
+ * dereferencing the NULL head. The removed element's own next pointer
+ * is left as it was.
+ */
+static inline int
+remove_from_req_list(struct vhd_req_list *list, struct vhd_request *e)
+{
+	struct vhd_request *prev;
+
+	if (!list->head)
+		return -EINVAL;
+
+	if (list->head == e) {
+		if (list->tail == e)
+			clear_req_list(list);
+		else
+			list->head = list->head->next;
+		return 0;
+	}
+
+	for (prev = list->head; prev->next; prev = prev->next) {
+		if (prev->next != e)
+			continue;
+
+		if (list->tail == e) {
+			prev->next = NULL;
+			list->tail = prev;
+		} else
+			prev->next = prev->next->next;
+
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Zero @req and bind it to its owning state @s. */
+static inline void
+init_vhd_request(struct vhd_state *s, struct vhd_request *req)
+{
+	memset(req, 0, sizeof(*req));
+	req->state = s;
+}
+
+/* Reset transaction @tx: counters, status and request list all
+ * cleared. */
+static inline void
+init_tx(struct vhd_transaction *tx)
+{
+	memset(tx, 0, sizeof(*tx));
+}
+
+/*
+ * Attach request @r to the open transaction @tx: record the
+ * back-pointer, count it as started, append it to the transaction's
+ * request list and mark the transaction live. Adding to a closed
+ * transaction is an assertion failure.
+ */
+static inline void
+add_to_transaction(struct vhd_transaction *tx, struct vhd_request *r)
+{
+ ASSERT(!tx->closed);
+
+ r->tx = tx;
+ tx->started++;
+ add_to_tail(&tx->requests, r);
+ set_vhd_flag(tx->status, VHD_FLAG_TX_LIVE);
+
+ /* the block number is derived with the global SPB, i.e. the block
+ * size of the most recently opened image */
+ DBG(TLOG_DBG, "blk: 0x%04"PRIx64", lsec: 0x%08"PRIx64", tx: %p, "
+ "started: %d, finished: %d, status: %u\n",
+ r->treq.sec / SPB, r->treq.sec, tx,
+ tx->started, tx->finished, tx->status);
+}
+
+/* Non-zero once every request started in @tx has finished. */
+static inline int
+transaction_completed(struct vhd_transaction *tx)
+{
+	return (tx->finished == tx->started);
+}
+
+/*
+ * Reset the pending-BAT-write bookkeeping: status flags, pending block
+ * and offset, and the link/error fields of the BAT request. The cached
+ * table itself is not touched.
+ */
+static inline void
+init_bat(struct vhd_state *s)
+{
+	struct vhd_bat_state *bat = &s->bat;
+
+	bat->status = 0;
+	bat->pbw_blk = 0;
+	bat->pbw_offset = 0;
+
+	bat->req.error = 0;
+	bat->req.next = NULL;
+	bat->req.tx = NULL;
+}
+
+/* Mark the BAT as having an update in progress. */
+static inline void
+lock_bat(struct vhd_state *s)
+{
+	s->bat.status |= VHD_FLAG_BAT_LOCKED;
+}
+
+/* Clear the BAT lock flag. */
+static inline void
+unlock_bat(struct vhd_state *s)
+{
+	s->bat.status &= ~VHD_FLAG_BAT_LOCKED;
+}
+
+/* Non-zero while the BAT lock flag is set. */
+static inline int
+bat_locked(struct vhd_state *s)
+{
+	return (s->bat.status & VHD_FLAG_BAT_LOCKED);
+}
+
+/*
+ * Return bitmap object @bm to a pristine state: identity, LRU stamp
+ * and status cleared, transaction and both request lists emptied, map
+ * and shadow buffers zeroed, and the embedded request re-initialised.
+ * The buffers themselves are kept.
+ */
+static inline void
+init_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	bm->status = 0;
+	bm->seqno = 0;
+	bm->blk = 0;
+
+	init_tx(&bm->tx);
+	clear_req_list(&bm->queue);
+	clear_req_list(&bm->waiting);
+
+	memset(bm->map, 0, vhd_sectors_to_bytes(s->bm_secs));
+	memset(bm->shadow, 0, vhd_sectors_to_bytes(s->bm_secs));
+
+	init_vhd_request(s, &bm->req);
+}
+
+/*
+ * Look up the cached bitmap for @block.
+ * Returns the bitmap, or NULL if it is not installed in the cache.
+ */
+static inline struct vhd_bitmap *
+get_bitmap(struct vhd_state *s, uint32_t block)
+{
+	int slot;
+
+	for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+		struct vhd_bitmap *cand = s->bitmap[slot];
+
+		if (!cand)
+			continue;
+		if (cand->blk == block)
+			return cand;
+	}
+
+	return NULL;
+}
+
+/* Set the lock flag on @bm; remove_lru_bitmap() skips locked
+ * bitmaps. */
+static inline void
+lock_bitmap(struct vhd_bitmap *bm)
+{
+	bm->status |= VHD_FLAG_BM_LOCKED;
+}
+
+/* Clear the lock flag on @bm. */
+static inline void
+unlock_bitmap(struct vhd_bitmap *bm)
+{
+	bm->status &= ~VHD_FLAG_BM_LOCKED;
+}
+
+/* Non-zero while @bm carries the lock flag. */
+static inline int
+bitmap_locked(struct vhd_bitmap *bm)
+{
+	return (bm->status & VHD_FLAG_BM_LOCKED);
+}
+
+/* A bitmap is valid once no read of it is pending. */
+static inline int
+bitmap_valid(struct vhd_bitmap *bm)
+{
+	return !(bm->status & VHD_FLAG_BM_READ_PENDING);
+}
+
+/*
+ * Returns 1 if @bm still has work attached to it -- a pending read or
+ * write of the bitmap, a transaction that updates the BAT, or any
+ * request on its waiting, transaction or queue lists -- and 0
+ * otherwise.
+ */
+static inline int
+bitmap_in_use(struct vhd_bitmap *bm)
+{
+	if (bm->status & VHD_FLAG_BM_READ_PENDING)
+		return 1;
+	if (bm->status & VHD_FLAG_BM_WRITE_PENDING)
+		return 1;
+	if (bm->tx.status & VHD_FLAG_TX_UPDATE_BAT)
+		return 1;
+	if (bm->waiting.head)
+		return 1;
+	if (bm->tx.requests.head)
+		return 1;
+	if (bm->queue.head)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Returns 1 when every sector of the block is marked present in
+ * bm->map, i.e. the first spb / 8 bytes are all 0xFF, and 0 otherwise.
+ */
+static inline int
+bitmap_full(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	int byte;
+	int nbytes = s->spb >> 3;
+
+	for (byte = 0; byte < nbytes; byte++) {
+		if (bm->map[byte] == (char)0xFF)
+			continue;
+		return 0;
+	}
+
+	DBG(TLOG_DBG, "bitmap 0x%04x full\n", bm->blk);
+	return 1;
+}
+
+/*
+ * Evict the least recently used unlocked bitmap from the cache.
+ * Only bitmaps whose sequence number is below the current LRU counter
+ * are candidates. The victim's cache slot is cleared and the victim
+ * returned; NULL is returned when nothing can be evicted. The victim
+ * must not be in use.
+ */
+static struct vhd_bitmap *
+remove_lru_bitmap(struct vhd_state *s)
+{
+	int slot, victim_slot = 0;
+	u64 oldest = s->bm_lru;
+	struct vhd_bitmap *victim = NULL;
+
+	for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+		struct vhd_bitmap *cand = s->bitmap[slot];
+
+		if (!cand)
+			continue;
+		if (cand->seqno >= oldest || bitmap_locked(cand))
+			continue;
+
+		victim = cand;
+		victim_slot = slot;
+		oldest = cand->seqno;
+	}
+
+	if (!victim)
+		return NULL;
+
+	s->bitmap[victim_slot] = NULL;
+	ASSERT(!bitmap_in_use(victim));
+
+	return victim;
+}
+
+/*
+ * Obtain a clean bitmap object for block @blk, taking one from the
+ * free stack if possible and otherwise evicting the least recently
+ * used cached bitmap. The object is NOT installed in the cache.
+ *
+ * On success stores the object in *@bitmap and returns 0. Returns
+ * -EBUSY, with *@bitmap set to NULL, when no object is available.
+ */
+static int
+alloc_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap **bitmap, uint32_t blk)
+{
+	struct vhd_bitmap *fresh;
+
+	*bitmap = NULL;
+
+	if (s->bm_free_count > 0)
+		fresh = s->bitmap_free[--s->bm_free_count];
+	else
+		fresh = remove_lru_bitmap(s);
+
+	if (!fresh)
+		return -EBUSY;
+
+	init_vhd_bitmap(s, fresh);
+	fresh->blk = blk;
+	*bitmap = fresh;
+
+	return 0;
+}
+
+/*
+ * Produce the next LRU sequence number. When the counter reaches
+ * 0xffffffff, the sequence numbers of all cached bitmaps are halved
+ * and the counter restarts from the largest of them.
+ */
+static inline uint64_t
+__bitmap_lru_seqno(struct vhd_state *s)
+{
+	int slot;
+
+	if (s->bm_lru != 0xffffffff)
+		return ++s->bm_lru;
+
+	s->bm_lru = 0;
+	for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+		struct vhd_bitmap *cached = s->bitmap[slot];
+
+		if (!cached)
+			continue;
+
+		cached->seqno >>= 1;
+		if (cached->seqno > s->bm_lru)
+			s->bm_lru = cached->seqno;
+	}
+
+	return ++s->bm_lru;
+}
+
+/* Stamp @bm with a fresh LRU sequence number, marking it most
+ * recently used. */
+static inline void
+touch_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	uint64_t stamp = __bitmap_lru_seqno(s);
+
+	bm->seqno = stamp;
+}
+
+/*
+ * Place @bm in the first empty cache slot and mark it most recently
+ * used. Having no empty slot is an assertion failure.
+ */
+static inline void
+install_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	int slot = 0;
+
+	while (slot < VHD_CACHE_SIZE && s->bitmap[slot])
+		slot++;
+
+	if (slot == VHD_CACHE_SIZE) {
+		ASSERT(0);
+		return;
+	}
+
+	touch_bitmap(s, bm);
+	s->bitmap[slot] = bm;
+}
+
+/*
+ * Remove @bm from the cache and push it back on the free stack.
+ * The bitmap must be installed, unlocked and not in use.
+ */
+static inline void
+free_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	int slot = 0;
+
+	while (slot < VHD_CACHE_SIZE && s->bitmap[slot] != bm)
+		slot++;
+
+	ASSERT(!bitmap_locked(bm));
+	ASSERT(!bitmap_in_use(bm));
+	ASSERT(slot < VHD_CACHE_SIZE);
+
+	s->bitmap[slot] = NULL;
+	s->bitmap_free[s->bm_free_count++] = bm;
+}
+
+/*
+ * Classify logical @sector for operation @op (a VHD_OP_* value) using
+ * only cached metadata.
+ *
+ * Returns one of:
+ *   VHD_BM_BIT_SET      - sector is present (always, for fixed disks)
+ *   VHD_BM_BIT_CLEAR    - block allocated, sector not yet written
+ *   VHD_BM_BAT_CLEAR    - block not allocated
+ *   VHD_BM_BAT_LOCKED   - block not allocated and, for a data write,
+ *                         the BAT is locked on behalf of another block
+ *   VHD_BM_NOT_CACHED   - the block's bitmap is not in the cache
+ *   VHD_BM_READ_PENDING - the block's bitmap is still being read
+ *   -EINVAL             - sector lies beyond the end of the BAT
+ */
+static int
+read_bitmap_cache(struct vhd_state *s, uint64_t sector, uint8_t op)
+{
+	u32 blk, sec;
+	struct vhd_bitmap *bm;
+
+	/* in fixed disks, every block is present */
+	if (s->vhd.footer.type == HD_TYPE_FIXED)
+		return VHD_BM_BIT_SET;
+
+	blk = sector / s->spb;
+	sec = sector % s->spb;
+
+	/*
+	 * Valid BAT indices run from 0 to max_bat_size - 1, so
+	 * blk == max_bat_size must be rejected as well; otherwise the
+	 * bat_entry() lookup below reads one entry past the table.
+	 */
+	if (blk >= s->vhd.header.max_bat_size) {
+		DPRINTF("ERROR: sec %"PRIu64" out of range, op = %d\n",
+			sector, op);
+		return -EINVAL;
+	}
+
+	if (bat_entry(s, blk) == DD_BLK_UNUSED) {
+		if (op == VHD_OP_DATA_WRITE &&
+		    s->bat.pbw_blk != blk && bat_locked(s))
+			return VHD_BM_BAT_LOCKED;
+
+		return VHD_BM_BAT_CLEAR;
+	}
+
+	if (test_batmap(s, blk)) {
+		DBG(TLOG_DBG, "batmap set for 0x%04x\n", blk);
+		return VHD_BM_BIT_SET;
+	}
+
+	bm = get_bitmap(s, blk);
+	if (!bm)
+		return VHD_BM_NOT_CACHED;
+
+	/* bump lru count */
+	touch_bitmap(s, bm);
+
+	if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING))
+		return VHD_BM_READ_PENDING;
+
+	return ((vhd_bitmap_test(&s->vhd, bm->map, sec)) ?
+		VHD_BM_BIT_SET : VHD_BM_BIT_CLEAR);
+}
+
+/*
+ * Starting at logical @sector, count how many consecutive sectors (at
+ * most @nr_secs, and never past the end of the block) have a bitmap
+ * bit equal to @value.
+ *
+ * Fixed disks report the whole span. When the batmap bit is set for
+ * the block, the span runs to the end of the block or @nr_secs,
+ * whichever is smaller, regardless of @value. Otherwise the block's
+ * bitmap must already be cached and valid.
+ */
+static int
+read_bitmap_cache_span(struct vhd_state *s,
+ uint64_t sector, int nr_secs, int value)
+{
+ int ret;
+ u32 blk, sec;
+ struct vhd_bitmap *bm;
+
+ /* in fixed disks, every block is present */
+ if (s->vhd.footer.type == HD_TYPE_FIXED)
+ return nr_secs;
+
+ sec = sector % s->spb;
+ blk = sector / s->spb;
+
+ if (test_batmap(s, blk))
+ return MIN(nr_secs, s->spb - sec);
+
+ bm = get_bitmap(s, blk);
+
+ ASSERT(bm && bitmap_valid(bm));
+
+ /* walk the cached map until the bit differs from @value */
+ for (ret = 0; sec < s->spb && ret < nr_secs; sec++, ret++)
+ if (vhd_bitmap_test(&s->vhd, bm->map, sec) != value)
+ break;
+
+ return ret;
+}
+
+/*
+ * Pop a data request from the free stack and initialise it for @s.
+ * Returns NULL when the stack is empty. A request on the free stack
+ * is expected to have a zero sector count.
+ */
+static inline struct vhd_request *
+alloc_vhd_request(struct vhd_state *s)
+{
+	struct vhd_request *req;
+
+	if (s->vreq_free_count <= 0)
+		return NULL;
+
+	req = s->vreq_free[--s->vreq_free_count];
+	ASSERT(req->treq.secs == 0);
+	init_vhd_request(s, req);
+
+	return req;
+}
+
+/* Wipe @req and push it back on the free stack of @s. */
+static inline void
+free_vhd_request(struct vhd_state *s, struct vhd_request *req)
+{
+	memset(req, 0, sizeof(*req));
+	s->vreq_free[s->vreq_free_count] = req;
+	s->vreq_free_count++;
+}
+
+/*
+ * Queue an asynchronous read for @req: req->treq.secs sectors from
+ * byte @offset of the image file into req->treq.buf. vhd_complete()
+ * is registered as the completion callback with @req as its
+ * argument. Updates the queued/read statistics.
+ */
+static inline void
+aio_read(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
+{
+ struct tiocb *tiocb = &req->tiocb;
+
+ td_prep_read(tiocb, s->vhd.fd, req->treq.buf,
+ vhd_sectors_to_bytes(req->treq.secs),
+ offset, vhd_complete, req);
+ td_queue_tiocb(s->driver, tiocb);
+
+ s->queued++;
+ s->reads++;
+ /* read_size is accumulated in sectors */
+ s->read_size += req->treq.secs;
+ TRACE(s);
+}
+
+/*
+ * Queue an asynchronous write for @req: req->treq.secs sectors from
+ * req->treq.buf to byte @offset of the image file. vhd_complete() is
+ * registered as the completion callback with @req as its argument.
+ * Updates the queued/write statistics; a non-zero s->writes later
+ * makes _vhd_close() write the footer back.
+ */
+static inline void
+aio_write(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
+{
+ struct tiocb *tiocb = &req->tiocb;
+
+ td_prep_write(tiocb, s->vhd.fd, req->treq.buf,
+ vhd_sectors_to_bytes(req->treq.secs),
+ offset, vhd_complete, req);
+ td_queue_tiocb(s->driver, tiocb);
+
+ s->queued++;
+ s->writes++;
+ /* write_size is accumulated in sectors */
+ s->write_size += req->treq.secs;
+ TRACE(s);
+}
+
+/*
+ * Record block @blk as the target of the pending BAT write and choose
+ * its on-disk location: the next free sector, moved forward by
+ * whatever gap is needed for the data region (which follows the
+ * bitmap) to start on a page boundary.
+ *
+ * Returns the unpadded s->next_db, i.e. the sector where the gap
+ * begins. s->next_db itself is not advanced. Must not be called while
+ * a BAT write is already in flight.
+ */
+static inline uint64_t
+reserve_new_block(struct vhd_state *s, uint32_t blk)
+{
+	uint64_t misalign;
+	int gap = 0;
+
+	ASSERT(!test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));
+
+	/* data region of segment should begin on page boundary */
+	misalign = (s->next_db + s->bm_secs) % s->spp;
+	if (misalign)
+		gap = (s->spp - misalign);
+
+	s->bat.pbw_offset = s->next_db + gap;
+	s->bat.pbw_blk = blk;
+
+	return s->next_db;
+}
+
+/*
+ * Queue the write of the BAT sector holding the entry for the pending
+ * block (s->bat.pbw_blk). The BAT must be locked. Always returns 0.
+ *
+ * The 512-byte group of 128 cached entries containing the block is
+ * copied to the scratch buffer and the pending offset is patched into
+ * that copy; the cached table itself is not modified here.
+ */
+static int
+schedule_bat_write(struct vhd_state *s)
+{
+ int i;
+ u32 blk;
+ char *buf;
+ u64 offset;
+ struct vhd_request *req;
+
+ ASSERT(bat_locked(s));
+
+ req = &s->bat.req;
+ buf = s->bat.bat_buf;
+ blk = s->bat.pbw_blk;
+
+ init_vhd_request(s, req);
+ /* NOTE(review): copies a full 128-entry group starting at the
+ * group's first index; assumes the cached table is long enough
+ * to cover the last, possibly partial, group -- confirm */
+ memcpy(buf, &bat_entry(s, blk - (blk % 128)), 512);
+
+ /* pbw_offset is 64 bits wide; the BAT entry keeps its low 32 */
+ ((u32 *)buf)[blk % 128] = s->bat.pbw_offset;
+
+ /* presumably converts each entry to on-disk big-endian order */
+ for (i = 0; i < 128; i++)
+ BE32_OUT(&((u32 *)buf)[i]);
+
+ /* byte offset of the group within the on-disk table */
+ offset = s->vhd.header.table_offset + (blk - (blk % 128)) * 4;
+ req->treq.secs = 1;
+ req->treq.buf = buf;
+ req->op = VHD_OP_BAT_WRITE;
+ req->next = NULL;
+
+ aio_write(s, req, offset);
+ set_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED);
+
+ DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64", "
+ "table_offset: 0x%08"PRIx64"\n", blk, s->bat.pbw_offset, offset);
+
+ return 0;
+}
+
+/*
+ * Queue the write that zeroes the on-disk bitmap of the block being
+ * allocated, including any alignment gap in front of it. The write
+ * starts at sector @lb_end and covers the gap up to s->bat.pbw_offset
+ * plus the bitmap itself. The request is added to the transaction of
+ * @bm, which is locked.
+ */
+static void
+schedule_zero_bm_write(struct vhd_state *s,
+ struct vhd_bitmap *bm, uint64_t lb_end)
+{
+ uint64_t offset;
+ struct vhd_request *req = &s->bat.zero_req;
+
+ init_vhd_request(s, req);
+
+ offset = vhd_sectors_to_bytes(lb_end);
+ req->op = VHD_OP_ZERO_BM_WRITE;
+ req->treq.sec = s->bat.pbw_blk * s->spb;
+ /* gap sectors plus bitmap sectors */
+ req->treq.secs = (s->bat.pbw_offset - lb_end) + s->bm_secs;
+ /* data comes from the shared zero buffer, size-checked here */
+ req->treq.buf = vhd_zeros(vhd_sectors_to_bytes(req->treq.secs));
+ req->next = NULL;
+
+ DBG(TLOG_DBG, "blk: 0x%04x, writing zero bitmap at 0x%08"PRIx64"\n",
+ s->bat.pbw_blk, offset);
+
+ lock_bitmap(bm);
+ add_to_transaction(&bm->tx, req);
+ aio_write(s, req, offset);
+}
+
+/*
+ * Begin allocating block @blk without preallocating its data region:
+ * make sure an empty bitmap for the block is in the cache, lock the
+ * BAT, reserve the block's location and queue the zero-bitmap write.
+ * The bitmap's transaction is flagged as one that updates the BAT.
+ *
+ * If the BAT is already locked it must be for this same block, and
+ * nothing more is done.
+ *
+ * Returns 0 on success or the error from alloc_vhd_bitmap().
+ */
+static int
+update_bat(struct vhd_state *s, uint32_t blk)
+{
+ int err;
+ uint64_t lb_end;
+ struct vhd_bitmap *bm;
+
+ ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
+
+ if (bat_locked(s)) {
+ ASSERT(s->bat.pbw_blk == blk);
+ return 0;
+ }
+
+ /* empty bitmap could already be in
+ * cache if earlier bat update failed */
+ bm = get_bitmap(s, blk);
+ if (!bm) {
+ /* install empty bitmap in cache */
+ err = alloc_vhd_bitmap(s, &bm, blk);
+ if (err)
+ return err;
+
+ install_bitmap(s, bm);
+ }
+
+ lock_bat(s);
+ lb_end = reserve_new_block(s, blk);
+ schedule_zero_bm_write(s, bm, lb_end);
+ set_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT);
+
+ return 0;
+}
+
+/*
+ * Begin allocating block @blk with preallocation: synchronously write
+ * zeros over the alignment gap, bitmap and data region of the new
+ * block, make sure an empty bitmap for the block is in the cache,
+ * then lock the BAT and bitmap and queue the BAT write as part of the
+ * bitmap's transaction.
+ *
+ * If the BAT is already locked it must be for this same block; the
+ * call then returns 0, or -EBUSY if the pending BAT request has
+ * recorded an error.
+ *
+ * Returns 0 on success or a negative error code. -EIO indicates a
+ * short write.
+ * NOTE(review): s->next_db and the pending-block fields have already
+ * been updated when a failure is reported -- confirm callers tolerate
+ * that on retry.
+ */
+static int
+allocate_block(struct vhd_state *s, uint32_t blk)
+{
+	int err, gap;
+	ssize_t ret;
+	uint64_t offset, size;
+	struct vhd_bitmap *bm;
+
+	ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
+
+	if (bat_locked(s)) {
+		ASSERT(s->bat.pbw_blk == blk);
+		if (s->bat.req.error)
+			return -EBUSY;
+		return 0;
+	}
+
+	gap = 0;
+	s->bat.pbw_blk = blk;
+	/* zero-fill starts at the unpadded end of the last block */
+	offset = vhd_sectors_to_bytes(s->next_db);
+
+	/* data region of segment should begin on page boundary */
+	if ((s->next_db + s->bm_secs) % s->spp) {
+		gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp));
+		s->next_db += gap;
+	}
+
+	s->bat.pbw_offset = s->next_db;
+
+	DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64"\n",
+	    blk, s->bat.pbw_offset);
+
+	if (lseek(s->vhd.fd, offset, SEEK_SET) == (off_t)-1) {
+		/* capture errno before the logging call can change it */
+		err = -errno;
+		ERR(-err, "lseek failed\n");
+		return err;
+	}
+
+	size = vhd_sectors_to_bytes(s->spb + s->bm_secs + gap);
+	ret = write(s->vhd.fd, vhd_zeros(size), size);
+	if (ret < 0 || (uint64_t)ret != size) {
+		err = (ret == -1 ? -errno : -EIO);
+		ERR(err, "write failed");
+		return err;
+	}
+
+	/* empty bitmap could already be in
+	 * cache if earlier bat update failed */
+	bm = get_bitmap(s, blk);
+	if (!bm) {
+		/* install empty bitmap in cache */
+		err = alloc_vhd_bitmap(s, &bm, blk);
+		if (err)
+			return err;
+
+		install_bitmap(s, bm);
+	}
+
+	lock_bat(s);
+	lock_bitmap(bm);
+	schedule_bat_write(s);
+	add_to_transaction(&bm->tx, &s->bat.req);
+
+	return 0;
+}
+
+/*
+ * Queue an asynchronous read of the data described by @treq.
+ *
+ * For fixed disks the file offset is simply the sector address. For
+ * other types the block must be allocated and its presence known
+ * (batmap bit set, or bitmap cached and valid); the offset is then
+ * the block's BAT entry plus the bitmap size plus the sector's index
+ * within the block.
+ *
+ * Returns 0 on success or -EBUSY when no request slot is free.
+ */
+static int
+schedule_data_read(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
+{
+ u64 offset;
+ u32 blk = 0, sec = 0;
+ struct vhd_bitmap *bm;
+ struct vhd_request *req;
+
+ if (s->vhd.footer.type == HD_TYPE_FIXED) {
+ offset = vhd_sectors_to_bytes(treq.sec);
+ goto make_request;
+ }
+
+ blk = treq.sec / s->spb;
+ sec = treq.sec % s->spb;
+ bm = get_bitmap(s, blk);
+ offset = bat_entry(s, blk);
+
+ ASSERT(offset != DD_BLK_UNUSED);
+ ASSERT(test_batmap(s, blk) || (bm && bitmap_valid(bm)));
+
+ /* skip the bitmap that precedes the block's data */
+ offset += s->bm_secs + sec;
+ offset = vhd_sectors_to_bytes(offset);
+
+ make_request:
+ req = alloc_vhd_request(s);
+ if (!req)
+ return -EBUSY;
+
+ req->treq = treq;
+ req->flags = flags;
+ req->op = VHD_OP_DATA_READ;
+ req->next = NULL;
+
+ aio_read(s, req, offset);
+
+ DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
+ "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x, buf: %p\n",
+ s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags,
+ treq.buf);
+
+ return 0;
+}
+
+/*
+ * Queue an asynchronous write of the data described by @treq.
+ *
+ * @flags carries VHD_FLAG_REQ_* bits:
+ *  - UPDATE_BAT: the target block is unallocated; allocation is
+ *    started first (allocate_block() when the image was opened with
+ *    preallocation, update_bat() otherwise) and the data is written
+ *    at the pending block's location.
+ *  - UPDATE_BITMAP: the write changes the block's bitmap, so the
+ *    request joins the bitmap's current transaction or, if that one
+ *    is already closed, the queue for the next one.
+ *
+ * Returns 0 on success, the allocation error, or -EBUSY when no
+ * request slot is free.
+ */
+static int
+schedule_data_write(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
+{
+ int err;
+ u64 offset;
+ u32 blk = 0, sec = 0;
+ struct vhd_bitmap *bm = NULL;
+ struct vhd_request *req;
+
+ if (s->vhd.footer.type == HD_TYPE_FIXED) {
+ offset = vhd_sectors_to_bytes(treq.sec);
+ goto make_request;
+ }
+
+ blk = treq.sec / s->spb;
+ sec = treq.sec % s->spb;
+ offset = bat_entry(s, blk);
+
+ if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BAT)) {
+ if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
+ err = allocate_block(s, blk);
+ else
+ err = update_bat(s, blk);
+
+ if (err)
+ return err;
+
+ /* the BAT entry is not valid yet; use the pending location */
+ offset = s->bat.pbw_offset;
+ }
+
+ /* skip the bitmap that precedes the block's data */
+ offset += s->bm_secs + sec;
+ offset = vhd_sectors_to_bytes(offset);
+
+ make_request:
+ req = alloc_vhd_request(s);
+ if (!req)
+ return -EBUSY;
+
+ req->treq = treq;
+ req->flags = flags;
+ req->op = VHD_OP_DATA_WRITE;
+ req->next = NULL;
+
+ if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BITMAP)) {
+ bm = get_bitmap(s, blk);
+ ASSERT(bm && bitmap_valid(bm));
+ lock_bitmap(bm);
+
+ if (bm->tx.closed) {
+ add_to_tail(&bm->queue, req);
+ set_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED);
+ } else
+ add_to_transaction(&bm->tx, req);
+ }
+
+ aio_write(s, req, offset);
+
+ DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
+ "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x\n",
+ s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags);
+
+ return 0;
+}
+
+/*
+ * Start reading the on-disk bitmap of allocated block @blk into the
+ * cache. A bitmap object is obtained, the read is queued into its map
+ * buffer, and the bitmap is locked, installed in the cache and
+ * flagged as read-pending. The block must not already have a cached
+ * bitmap.
+ *
+ * Returns 0 on success or the error from alloc_vhd_bitmap().
+ */
+static int
+schedule_bitmap_read(struct vhd_state *s, uint32_t blk)
+{
+ int err;
+ u64 offset;
+ struct vhd_bitmap *bm;
+ struct vhd_request *req = NULL;
+
+ ASSERT(vhd_type_dynamic(&s->vhd));
+
+ offset = bat_entry(s, blk);
+
+ ASSERT(offset != DD_BLK_UNUSED);
+ ASSERT(!get_bitmap(s, blk));
+
+ /* the bitmap sits at the very start of the block */
+ offset = vhd_sectors_to_bytes(offset);
+
+ err = alloc_vhd_bitmap(s, &bm, blk);
+ if (err)
+ return err;
+
+ req = &bm->req;
+ init_vhd_request(s, req);
+
+ req->treq.sec = blk * s->spb;
+ req->treq.secs = s->bm_secs;
+ req->treq.buf = bm->map;
+ req->treq.cb = NULL;
+ req->op = VHD_OP_BITMAP_READ;
+ req->next = NULL;
+
+ aio_read(s, req, offset);
+ lock_bitmap(bm);
+ install_bitmap(s, bm);
+ set_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
+
+ DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, nr_secs: 0x%04x, "
+ "offset: 0x%08"PRIx64"\n", s->vhd.file, req->treq.sec, blk,
+ req->treq.secs, offset);
+
+ return 0;
+}
+
+/*
+ * Start an asynchronous write of the shadow bitmap of block @blk.
+ *
+ * The cached bitmap must be valid and must not already have a write pending
+ * (asserted). If the block has no BAT entry yet, the pending BAT write must
+ * be for this block and its offset (s->bat.pbw_offset) is used. The bitmap's
+ * embedded request writes s->bm_secs sectors from bm->shadow; the bitmap is
+ * then locked, touched and flagged VHD_FLAG_BM_WRITE_PENDING.
+ */
+static void
+schedule_bitmap_write(struct vhd_state *s, uint32_t blk)
+{
+ u64 offset;
+ struct vhd_bitmap *bm;
+ struct vhd_request *req;
+
+ bm = get_bitmap(s, blk);
+ offset = bat_entry(s, blk);
+
+ ASSERT(vhd_type_dynamic(&s->vhd));
+ ASSERT(bm && bitmap_valid(bm) &&
+ !test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));
+
+ /* block not in the BAT yet: it must be the one being allocated */
+ if (offset == DD_BLK_UNUSED) {
+ ASSERT(bat_locked(s) && s->bat.pbw_blk == blk);
+ offset = s->bat.pbw_offset;
+ }
+
+ offset = vhd_sectors_to_bytes(offset);
+
+ req = &bm->req;
+ init_vhd_request(s, req);
+
+ req->treq.sec = blk * s->spb;
+ req->treq.secs = s->bm_secs;
+ /* the shadow copy is written; bm->map is untouched by this function */
+ req->treq.buf = bm->shadow;
+ req->treq.cb = NULL;
+ req->op = VHD_OP_BITMAP_WRITE;
+ req->next = NULL;
+
+ aio_write(s, req, offset);
+ lock_bitmap(bm);
+ touch_bitmap(s, bm); /* bump lru count */
+ set_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);
+
+ DBG(TLOG_DBG, "%s: blk: 0x%04x, sec: 0x%08"PRIx64", nr_secs: 0x%04x, "
+ "offset: 0x%"PRIx64"\n", s->vhd.file, blk, req->treq.sec,
+ req->treq.secs, offset);
+}
+
+/*
+ * Park a data request until the bitmap of its block has been read.
+ *
+ * queued requests will be submitted once the bitmap
+ * describing them is read and the requests are validated.
+ *
+ * The block's bitmap must exist and have VHD_FLAG_BM_READ_PENDING set
+ * (asserted). The request is appended to bm->waiting and the bitmap is
+ * locked. @op records which operation (read or write) to replay later.
+ *
+ * Returns 0 on success or -EBUSY when no request could be allocated.
+ */
+static int
+__vhd_queue_request(struct vhd_state *s, uint8_t op, td_request_t treq)
+{
+ u32 blk;
+ struct vhd_bitmap *bm;
+ struct vhd_request *req;
+
+ ASSERT(vhd_type_dynamic(&s->vhd));
+
+ blk = treq.sec / s->spb;
+ bm = get_bitmap(s, blk);
+
+ ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));
+
+ req = alloc_vhd_request(s);
+ if (!req)
+ return -EBUSY;
+
+ req->treq = treq;
+ req->op = op;
+ req->next = NULL;
+
+ add_to_tail(&bm->waiting, req);
+ lock_bitmap(bm);
+
+ DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x nr_secs: 0x%04x, "
+ "op: %u\n", s->vhd.file, treq.sec, blk, treq.secs, op);
+
+ TRACE(s);
+ return 0;
+}
+
+/*
+ * Queue a read, splitting @treq into pieces by bitmap state.
+ *
+ * Each loop iteration handles the leading part of the remaining request
+ * (clone), sized either to the end of the current block or to a run of
+ * sectors with the same bitmap state:
+ *  - block absent from the BAT, or sector bit clear: td_forward_request()
+ *  - sector bit set: schedule_data_read()
+ *  - bitmap not cached: start reading it, then park the piece
+ *  - bitmap read already pending: park the piece
+ * On any error the whole remainder is completed with that error and the
+ * loop stops.
+ */
+static void
+vhd_queue_read(td_driver_t *driver, td_request_t treq)
+{
+ struct vhd_state *s = (struct vhd_state *)driver->data;
+
+ DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x (seg: %d)\n",
+ s->vhd.file, treq.sec, treq.secs, treq.sidx);
+
+ while (treq.secs) {
+ int err;
+ td_request_t clone;
+
+ err = 0;
+ clone = treq;
+
+ switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_READ)) {
+ case -EINVAL:
+ err = -EINVAL;
+ goto fail;
+
+ case VHD_BM_BAT_CLEAR:
+ /* clamp the piece so it does not cross the block boundary */
+ clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+ td_forward_request(clone);
+ break;
+
+ case VHD_BM_BIT_CLEAR:
+ clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
+ td_forward_request(clone);
+ break;
+
+ case VHD_BM_BIT_SET:
+ clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
+ err = schedule_data_read(s, clone, 0);
+ if (err)
+ goto fail;
+ break;
+
+ case VHD_BM_NOT_CACHED:
+ err = schedule_bitmap_read(s, clone.sec / s->spb);
+ if (err)
+ goto fail;
+
+ clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+ err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
+ if (err)
+ goto fail;
+ break;
+
+ case VHD_BM_READ_PENDING:
+ clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+ err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
+ if (err)
+ goto fail;
+ break;
+
+ case VHD_BM_BAT_LOCKED:
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ /* advance past the piece just handled */
+ treq.sec += clone.secs;
+ treq.secs -= clone.secs;
+ treq.buf += vhd_sectors_to_bytes(clone.secs);
+ continue;
+
+ fail:
+ /* fail everything that is still outstanding, not just this piece */
+ clone.secs = treq.secs;
+ td_complete_request(clone, err);
+ break;
+ }
+}
+
+/*
+ * Queue a write, splitting @treq into pieces by bitmap state.
+ *
+ * Each loop iteration handles the leading part of the remaining request
+ * (clone), sized either to the end of the current block or to a run of
+ * sectors with the same bitmap state:
+ *  - BAT locked: the remainder is completed with -EBUSY and marked blocked
+ *  - block absent from the BAT: write with BAT and bitmap updates
+ *  - sector bit clear: write with a bitmap update
+ *  - sector bit set: plain write
+ *  - bitmap not cached: start reading it, then park the piece
+ *  - bitmap read already pending: park the piece
+ * On any error the whole remainder is completed with that error and the
+ * loop stops.
+ */
+static void
+vhd_queue_write(td_driver_t *driver, td_request_t treq)
+{
+	struct vhd_state *s = (struct vhd_state *)driver->data;
+
+	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x, (seg: %d)\n",
+	    s->vhd.file, treq.sec, treq.secs, treq.sidx);
+
+	while (treq.secs) {
+		int err;
+		/* same type as schedule_data_write()'s flags parameter */
+		vhd_flag_t flags;
+		td_request_t clone;
+
+		err = 0;
+		flags = 0;
+		clone = treq;
+
+		switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_WRITE)) {
+		case -EINVAL:
+			err = -EINVAL;
+			goto fail;
+
+		case VHD_BM_BAT_LOCKED:
+			err = -EBUSY;
+			clone.blocked = 1;
+			goto fail;
+
+		case VHD_BM_BAT_CLEAR:
+			flags = (VHD_FLAG_REQ_UPDATE_BAT |
+				 VHD_FLAG_REQ_UPDATE_BITMAP);
+			/* clamp the piece so it does not cross the block boundary */
+			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+			err = schedule_data_write(s, clone, flags);
+			if (err)
+				goto fail;
+			break;
+
+		case VHD_BM_BIT_CLEAR:
+			flags = VHD_FLAG_REQ_UPDATE_BITMAP;
+			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
+			err = schedule_data_write(s, clone, flags);
+			if (err)
+				goto fail;
+			break;
+
+		case VHD_BM_BIT_SET:
+			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
+			err = schedule_data_write(s, clone, 0);
+			if (err)
+				goto fail;
+			break;
+
+		case VHD_BM_NOT_CACHED:
+			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+			err = schedule_bitmap_read(s, clone.sec / s->spb);
+			if (err)
+				goto fail;
+
+			err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
+			if (err)
+				goto fail;
+			break;
+
+		case VHD_BM_READ_PENDING:
+			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+			err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
+			if (err)
+				goto fail;
+			break;
+
+		default:
+			ASSERT(0);
+			break;
+		}
+
+		/* advance past the piece just handled */
+		treq.sec += clone.secs;
+		treq.secs -= clone.secs;
+		treq.buf += vhd_sectors_to_bytes(clone.secs);
+		continue;
+
+	fail:
+		/* fail everything that is still outstanding, not just this piece */
+		clone.secs = treq.secs;
+		td_complete_request(clone, err);
+		break;
+	}
+}
+
+/*
+ * Complete and release every request on the singly linked @list.
+ *
+ * Each request is completed with @error when that is non-zero, otherwise
+ * with its own recorded error. The request is then returned to the pool
+ * and the state's "returned" counter is bumped. A NULL list is a no-op.
+ */
+static inline void
+signal_completion(struct vhd_request *list, int error)
+{
+	struct vhd_state *s;
+	struct vhd_request *r, *next;
+
+	if (list == NULL)
+		return;
+
+	s = list->state;
+
+	for (r = list; r != NULL; r = next) {
+		int err = r->error;
+
+		if (error)
+			err = error;
+
+		/* fetch the link now: r is released further down */
+		next = r->next;
+
+		td_complete_request(r->treq, err);
+		DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", "
+		    "err: %d\n", r->treq.sec, r->treq.sec / s->spb, err);
+		free_vhd_request(s, r);
+
+		s->returned++;
+		TRACE(s);
+	}
+}
+
+/*
+ * Move every request waiting on @bm's queue into its (fresh) transaction.
+ *
+ * Does nothing when the queue is empty. If the block still has no BAT
+ * entry, the new transaction is marked failed with -EIO. Requests whose
+ * I/O already finished while they were queued are counted as finished and,
+ * if they succeeded, their sectors are set in the shadow bitmap. If that
+ * leaves the transaction complete, it is finished immediately.
+ */
+static void
+start_new_bitmap_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	int i;
+	struct vhd_transaction *tx;
+	struct vhd_request *r, *next;
+
+	if (!bm->queue.head)
+		return;
+
+	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
+
+	r = bm->queue.head;
+	tx = &bm->tx;
+	clear_req_list(&bm->queue);
+
+	/* r is non-NULL here: the empty-queue case returned above */
+	if (bat_entry(s, bm->blk) == DD_BLK_UNUSED)
+		tx->error = -EIO;
+
+	while (r) {
+		/* detach before re-linking the request into the transaction */
+		next = r->next;
+		r->next = NULL;
+		clear_vhd_flag(r->flags, VHD_FLAG_REQ_QUEUED);
+
+		add_to_transaction(tx, r);
+		if (test_vhd_flag(r->flags, VHD_FLAG_REQ_FINISHED)) {
+			tx->finished++;
+			if (!r->error) {
+				u32 sec = r->treq.sec % s->spb;
+				for (i = 0; i < r->treq.secs; i++)
+					vhd_bitmap_set(&s->vhd,
+						       bm->shadow, sec + i);
+			}
+		}
+		r = next;
+	}
+
+	/* perhaps all the queued writes already completed? */
+	if (tx->started && transaction_completed(tx))
+		finish_data_transaction(s, bm);
+}
+
+/*
+ * Release the BAT lock held on behalf of @bm's block, if appropriate.
+ *
+ * Nothing happens unless the BAT is locked for exactly this block. When
+ * the BAT request failed and the bitmap transaction is still live, the
+ * transaction is closed and the lock is kept. In every other case the BAT
+ * is unlocked and reinitialised.
+ */
+static void
+finish_bat_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	struct vhd_transaction *tx = &bm->tx;
+
+	if (!bat_locked(s) || s->bat.pbw_blk != bm->blk)
+		return;
+
+	if (s->bat.req.error && test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE)) {
+		tx->closed = 1;
+		return;
+	}
+
+	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
+	unlock_bat(s);
+	init_bat(s);
+}
+
+/*
+ * Conclude the bitmap transaction of @bm.
+ *
+ * @error is recorded in the transaction unless an earlier error is already
+ * stored. Outside preallocate mode, if the transaction is still flagged
+ * VHD_FLAG_TX_UPDATE_BAT, the transaction is parked in s->bat.req.tx and
+ * the function returns; finish_bat_write() calls back in once the BAT
+ * write is done. Otherwise the shadow bitmap is rolled back (on error) or
+ * committed to bm->map (on success), the member requests are completed,
+ * and any queued requests start a new transaction.
+ */
+static void
+finish_bitmap_transaction(struct vhd_state *s,
+ struct vhd_bitmap *bm, int error)
+{
+ int map_size;
+ struct vhd_transaction *tx = &bm->tx;
+
+ DBG(TLOG_DBG, "blk: 0x%04x, err: %d\n", bm->blk, error);
+ /* the first error seen wins */
+ tx->error = (tx->error ? tx->error : error);
+ map_size = vhd_sectors_to_bytes(s->bm_secs);
+
+ if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
+ if (test_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT)) {
+ /* still waiting for bat write */
+ ASSERT(bm->blk == s->bat.pbw_blk);
+ ASSERT(test_vhd_flag(s->bat.status,
+ VHD_FLAG_BAT_WRITE_STARTED));
+ s->bat.req.tx = tx;
+ return;
+ }
+ }
+
+ if (tx->error) {
+ /* undo changes to shadow */
+ memcpy(bm->shadow, bm->map, map_size);
+ } else {
+ /* complete atomic write */
+ memcpy(bm->map, bm->shadow, map_size);
+ if (!test_batmap(s, bm->blk) && bitmap_full(s, bm))
+ set_batmap(s, bm->blk);
+ }
+
+ /* transaction done; signal completions */
+ signal_completion(tx->requests.head, tx->error);
+ init_tx(tx);
+ start_new_bitmap_transaction(s, bm);
+
+ if (!bitmap_in_use(bm))
+ unlock_bitmap(bm);
+
+ finish_bat_transaction(s, bm);
+}
+
+/*
+ * Close the data phase of @bm's transaction.
+ *
+ * The transaction is marked closed so no further requests can join it.
+ * If no error was recorded, the updated bitmap is written out; otherwise
+ * the bitmap transaction is finished straight away.
+ */
+static void
+finish_data_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	struct vhd_transaction *tx = &bm->tx;
+
+	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
+
+	tx->closed = 1;
+
+	/* plain calls: "return <void expression>;" is not valid ISO C */
+	if (!tx->error)
+		schedule_bitmap_write(s, bm->blk);
+	else
+		finish_bitmap_transaction(s, bm, 0);
+}
+
+/*
+ * Completion handler for a BAT write.
+ *
+ * On success the pending offset becomes the block's BAT entry and
+ * s->next_db is advanced past the new block; on failure the error is
+ * recorded in the block's bitmap transaction. In preallocate mode the BAT
+ * request is a member of that transaction and is accounted as finished.
+ * Otherwise VHD_FLAG_TX_UPDATE_BAT is cleared and, if the bitmap
+ * transaction was parked waiting for this write (s->bat.req.tx set), it is
+ * finished now.
+ */
+static void
+finish_bat_write(struct vhd_request *req)
+{
+ struct vhd_bitmap *bm;
+ struct vhd_transaction *tx;
+ struct vhd_state *s = req->state;
+
+ s->returned++;
+ TRACE(s);
+
+ bm = get_bitmap(s, s->bat.pbw_blk);
+
+ DBG(TLOG_DBG, "blk 0x%04x, pbwo: 0x%08"PRIx64", err %d\n",
+ s->bat.pbw_blk, s->bat.pbw_offset, req->error);
+ ASSERT(bm && bitmap_valid(bm));
+ ASSERT(bat_locked(s) &&
+ test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));
+
+ tx = &bm->tx;
+ ASSERT(test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE));
+
+ if (!req->error) {
+ bat_entry(s, s->bat.pbw_blk) = s->bat.pbw_offset;
+ /* next free position: past this block's data and bitmap sectors */
+ s->next_db = s->bat.pbw_offset + s->spb + s->bm_secs;
+ } else
+ tx->error = req->error;
+
+ if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
+ tx->finished++;
+ remove_from_req_list(&tx->requests, req);
+ if (transaction_completed(tx))
+ finish_data_transaction(s, bm);
+ } else {
+ clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
+ if (s->bat.req.tx)
+ finish_bitmap_transaction(s, bm, req->error);
+ }
+
+ finish_bat_transaction(s, bm);
+}
+
+/*
+ * Completion handler for a VHD_OP_ZERO_BM_WRITE request.
+ *
+ * The request is accounted as finished and removed from its transaction.
+ * On error the BAT lock is dropped, the error is recorded in the
+ * transaction and VHD_FLAG_TX_UPDATE_BAT is cleared; on success the BAT
+ * write is scheduled. If the transaction is then complete, its data phase
+ * is finished.
+ */
+static void
+finish_zero_bm_write(struct vhd_request *req)
+{
+ u32 blk;
+ struct vhd_bitmap *bm;
+ struct vhd_transaction *tx = req->tx;
+ struct vhd_state *s = req->state;
+
+ s->returned++;
+ TRACE(s);
+
+ blk = req->treq.sec / s->spb;
+ bm = get_bitmap(s, blk);
+
+ DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
+ ASSERT(bat_locked(s));
+ ASSERT(s->bat.pbw_blk == blk);
+ ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));
+
+ tx->finished++;
+ remove_from_req_list(&tx->requests, req);
+
+ if (req->error) {
+ /* abandon the pending BAT update */
+ unlock_bat(s);
+ init_bat(s);
+ tx->error = req->error;
+ clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
+ } else
+ schedule_bat_write(s);
+
+ if (transaction_completed(tx))
+ finish_data_transaction(s, bm);
+}
+
+/*
+ * Completion handler for a bitmap read.
+ *
+ * The list of requests waiting on the bitmap is detached and
+ * VHD_FLAG_BM_READ_PENDING is cleared. If the read failed, the bitmap is
+ * unlocked and freed and every waiting request is completed with the read
+ * error. Otherwise the freshly read map is copied to the shadow bitmap and
+ * each waiting request is replayed through vhd_queue_read() or
+ * vhd_queue_write(); the bitmap is unlocked afterwards if nothing uses it.
+ */
+static void
+finish_bitmap_read(struct vhd_request *req)
+{
+	u32 blk;
+	struct vhd_bitmap *bm;
+	struct vhd_request *r, *next;
+	struct vhd_state *s = req->state;
+
+	s->returned++;
+	TRACE(s);
+
+	blk = req->treq.sec / s->spb;
+	bm = get_bitmap(s, blk);
+
+	DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
+	ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));
+
+	r = bm->waiting.head;
+	clear_req_list(&bm->waiting);
+	clear_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
+
+	if (req->error) {
+		/* save the error before the bitmap is freed (the original
+		 * did the same; req may be the bitmap's embedded request) */
+		int err = req->error;
+
+		unlock_bitmap(bm);
+		free_vhd_bitmap(s, bm);
+		signal_completion(r, err);
+		return;
+	}
+
+	memcpy(bm->shadow, bm->map, vhd_sectors_to_bytes(s->bm_secs));
+
+	while (r) {
+		struct vhd_request tmp;
+
+		/* copy what is needed, then release the request before
+		 * replaying it */
+		tmp = *r;
+		next = r->next;
+		free_vhd_request(s, r);
+
+		ASSERT(tmp.op == VHD_OP_DATA_READ ||
+		       tmp.op == VHD_OP_DATA_WRITE);
+
+		if (tmp.op == VHD_OP_DATA_READ)
+			vhd_queue_read(s->driver, tmp.treq);
+		else if (tmp.op == VHD_OP_DATA_WRITE)
+			vhd_queue_write(s->driver, tmp.treq);
+
+		r = next;
+	}
+
+	if (!bitmap_in_use(bm))
+		unlock_bitmap(bm);
+}
+
+/*
+ * Completion handler for a bitmap write.
+ *
+ * Clears VHD_FLAG_BM_WRITE_PENDING on the block's bitmap and finishes the
+ * bitmap transaction with the write's error status.
+ */
+static void
+finish_bitmap_write(struct vhd_request *req)
+{
+	u32 blk;
+	struct vhd_bitmap *bm;
+	struct vhd_transaction *tx;
+	struct vhd_state *s = req->state;
+
+	s->returned++;
+	TRACE(s);
+
+	blk = req->treq.sec / s->spb;
+	bm = get_bitmap(s, blk);
+
+	/* validate bm before anything dereferences it */
+	ASSERT(bm && bitmap_valid(bm));
+	tx = &bm->tx;
+
+	DBG(TLOG_DBG, "blk: 0x%04x, started: %d, finished: %d\n",
+	    blk, tx->started, tx->finished);
+	ASSERT(tx->closed);
+	ASSERT(test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));
+
+	clear_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);
+
+	finish_bitmap_transaction(s, bm, req->error);
+}
+
+/*
+ * Completion handler for a data read: log it and complete the request.
+ * Passing 0 makes signal_completion() report the request's own error.
+ */
+static void
+finish_data_read(struct vhd_request *req)
+{
+ struct vhd_state *s = req->state;
+
+ DBG(TLOG_DBG, "lsec 0x%08"PRIx64", blk: 0x%04"PRIx64"\n",
+ req->treq.sec, req->treq.sec / s->spb);
+ signal_completion(req, 0);
+}
+
+/*
+ * Completion handler for a data write.
+ *
+ * The request is always flagged VHD_FLAG_REQ_FINISHED. Then:
+ *  - if it belongs to a transaction, it is counted as finished, its
+ *    sectors are set in the shadow bitmap when the write succeeded, and
+ *    the data phase is finished once the transaction is complete;
+ *  - if it has no transaction and is not queued, it is completed now;
+ *  - if it is queued, nothing more happens here;
+ *    start_new_bitmap_transaction() accounts for it via the FINISHED flag.
+ */
+static void
+finish_data_write(struct vhd_request *req)
+{
+	int i;
+	struct vhd_transaction *tx = req->tx;
+	struct vhd_state *s = (struct vhd_state *)req->state;
+
+	set_vhd_flag(req->flags, VHD_FLAG_REQ_FINISHED);
+
+	if (tx) {
+		u32 blk, sec;
+		struct vhd_bitmap *bm;
+
+		blk = req->treq.sec / s->spb;
+		sec = req->treq.sec % s->spb;
+		bm = get_bitmap(s, blk);
+
+		ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));
+
+		tx->finished++;
+
+		/* format fixed: was "0x04%", which printed a literal "04" */
+		DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", "
+		    "tx->started: %d, tx->finished: %d\n", req->treq.sec,
+		    req->treq.sec / s->spb, tx->started, tx->finished);
+
+		if (!req->error)
+			for (i = 0; i < req->treq.secs; i++)
+				vhd_bitmap_set(&s->vhd, bm->shadow, sec + i);
+
+		if (transaction_completed(tx))
+			finish_data_transaction(s, bm);
+
+	} else if (!test_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED)) {
+		ASSERT(!req->next);
+		DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64"\n",
+		    req->treq.sec, req->treq.sec / s->spb);
+		signal_completion(req, 0);
+	}
+}
+
+/*
+ * I/O completion callback: @arg is the vhd_request that was submitted.
+ *
+ * Bumps the "completed" counter, stores @err in the request, logs failed
+ * requests, and hands the request to the finish_* handler for its op.
+ */
+void
+vhd_complete(void *arg, struct tiocb *tiocb, int err)
+{
+	struct vhd_request *req = (struct vhd_request *)arg;
+	struct vhd_state *s = req->state;
+
+	s->completed++;
+	TRACE(s);
+
+	req->error = err;
+
+	if (req->error) {
+		/* the iocb is only needed for the byte count in the log */
+		struct iocb *io = &tiocb->iocb;
+
+		ERR(req->error, "%s: op: %u, lsec: %"PRIu64", secs: %u, "
+		    "nbytes: %lu, blk: %"PRIu64", blk_offset: %u",
+		    s->vhd.file, req->op, req->treq.sec, req->treq.secs,
+		    io->u.c.nbytes, req->treq.sec / s->spb,
+		    bat_entry(s, req->treq.sec / s->spb));
+	}
+
+	switch (req->op) {
+	case VHD_OP_DATA_READ:
+		finish_data_read(req);
+		break;
+
+	case VHD_OP_DATA_WRITE:
+		finish_data_write(req);
+		break;
+
+	case VHD_OP_BITMAP_READ:
+		finish_bitmap_read(req);
+		break;
+
+	case VHD_OP_BITMAP_WRITE:
+		finish_bitmap_write(req);
+		break;
+
+	case VHD_OP_ZERO_BM_WRITE:
+		finish_zero_bm_write(req);
+		break;
+
+	case VHD_OP_BAT_WRITE:
+		finish_bat_write(req);
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+/*
+ * Dump driver state to the log at TLOG_WARN level: request counters and
+ * average I/O sizes, every request slot with a non-zero sector count,
+ * every cached bitmap with the lengths of its request lists, and the BAT
+ * write state.
+ */
+void
+vhd_debug(td_driver_t *driver)
+{
+	int i;
+	struct vhd_state *s = (struct vhd_state *)driver->data;
+
+	DBG(TLOG_WARN, "%s: QUEUED: 0x%08"PRIx64", COMPLETED: 0x%08"PRIx64", "
+	    "RETURNED: 0x%08"PRIx64"\n", s->vhd.file, s->queued, s->completed,
+	    s->returned);
+	DBG(TLOG_WARN, "WRITES: 0x%08"PRIx64", AVG_WRITE_SIZE: %f\n",
+	    s->writes, (s->writes ? ((float)s->write_size / s->writes) : 0.0));
+	DBG(TLOG_WARN, "READS: 0x%08"PRIx64", AVG_READ_SIZE: %f\n",
+	    s->reads, (s->reads ? ((float)s->read_size / s->reads) : 0.0));
+
+	DBG(TLOG_WARN, "ALLOCATED REQUESTS: (%lu total)\n", VHD_REQS_DATA);
+	for (i = 0; i < VHD_REQS_DATA; i++) {
+		struct vhd_request *r = &s->vreq_list[i];
+		td_request_t *t = &r->treq;
+
+		/* only slots with a non-zero sector count are listed */
+		if (!t->secs)
+			continue;
+
+		DBG(TLOG_WARN, "%d: id: 0x%04"PRIx64", err: %d, op: %d,"
+		    " lsec: 0x%08"PRIx64", flags: %d, this: %p, "
+		    "next: %p, tx: %p\n", i, t->id, r->error, r->op,
+		    t->sec, r->flags, r, r->next, r->tx);
+	}
+
+	DBG(TLOG_WARN, "BITMAP CACHE:\n");
+	for (i = 0; i < VHD_CACHE_SIZE; i++) {
+		int qnum = 0, wnum = 0, rnum = 0;
+		struct vhd_bitmap *bm = s->bitmap[i];
+		struct vhd_transaction *tx;
+		struct vhd_request *r;
+
+		if (!bm)
+			continue;
+
+		tx = &bm->tx;
+
+		/* lengths of the queued, waiting and in-transaction lists */
+		for (r = bm->queue.head; r; r = r->next)
+			qnum++;
+
+		for (r = bm->waiting.head; r; r = r->next)
+			wnum++;
+
+		for (r = tx->requests.head; r; r = r->next)
+			rnum++;
+
+		DBG(TLOG_WARN, "%d: blk: 0x%04x, status: 0x%08x, q: %p, qnum: %d, w: %p, "
+		    "wnum: %d, locked: %d, in use: %d, tx: %p, tx_error: %d, "
+		    "started: %d, finished: %d, status: %u, reqs: %p, nreqs: %d\n",
+		    i, bm->blk, bm->status, bm->queue.head, qnum, bm->waiting.head,
+		    wnum, bitmap_locked(bm), bitmap_in_use(bm), tx, tx->error,
+		    tx->started, tx->finished, tx->status, tx->requests.head, rnum);
+	}
+
+	DBG(TLOG_WARN, "BAT: status: 0x%08x, pbw_blk: 0x%04x, "
+	    "pbw_off: 0x%08"PRIx64", tx: %p\n", s->bat.status, s->bat.pbw_blk,
+	    s->bat.pbw_offset, s->bat.req.tx);
+
+/*
+	for (i = 0; i < s->hdr.max_bat_size; i++)
+		DPRINTF("%d: %u\n", i, s->bat.bat[i]);
+*/
+}
+
+/*
+ * Driver descriptor for the VHD backend: names the disk type, gives the size
+ * of the per-disk private state, and wires up the open/close, read/write
+ * queueing, parent lookup/validation and debug entry points.
+ */
+struct tap_disk tapdisk_vhd = {
+ .disk_type = "tapdisk_vhd",
+ .flags = 0,
+ .private_data_size = sizeof(struct vhd_state),
+ .td_open = _vhd_open,
+ .td_close = _vhd_close,
+ .td_queue_read = vhd_queue_read,
+ .td_queue_write = vhd_queue_write,
+ .td_get_parent_id = vhd_get_parent_id,
+ .td_validate_parent = vhd_validate_parent,
+ .td_debug = vhd_debug,
+};
--- /dev/null
+#ifndef BSWAP_H
+#define BSWAP_H
+
+//#include "config-host.h"
+
+#include <inttypes.h>
+
+#if defined(__NetBSD__)
+#include <sys/endian.h>
+#include <sys/types.h>
+#elif defined(__OpenBSD__)
+#include <machine/endian.h>
+#define bswap_16(x) swap16(x)
+#define bswap_32(x) swap32(x)
+#define bswap_64(x) swap64(x)
+#elif defined(__linux__)
+
+#include <endian.h>
+#include <byteswap.h>
+
+/*
+ * Typed wrappers around the bswap_16/32/64 macros from <byteswap.h>; each
+ * returns its argument with the byte order reversed.
+ */
+static inline uint16_t bswap16(uint16_t x)
+{
+ return bswap_16(x);
+}
+
+static inline uint32_t bswap32(uint32_t x)
+{
+ return bswap_32(x);
+}
+
+static inline uint64_t bswap64(uint64_t x)
+{
+ return bswap_64(x);
+}
+
+/* In-place variants: reverse the byte order of the value that s points to. */
+static inline void bswap16s(uint16_t *s)
+{
+ *s = bswap16(*s);
+}
+
+static inline void bswap32s(uint32_t *s)
+{
+ *s = bswap32(*s);
+}
+
+static inline void bswap64s(uint64_t *s)
+{
+ *s = bswap64(*s);
+}
+
+#endif
+
+/*
+ * Pick the swap helpers for the host byte order. With WORDS_BIGENDIAN
+ * defined, big-endian conversions are no-ops and little-endian ones swap;
+ * otherwise it is the other way round. The *_bswaps forms work in place
+ * through a pointer and already carry their trailing semicolon (or expand
+ * to nothing), so callers must not add one.
+ */
+#if defined(WORDS_BIGENDIAN)
+#define be_bswap(v, size) (v)
+#define le_bswap(v, size) bswap ## size(v)
+#define be_bswaps(v, size)
+#define le_bswaps(p, size) *p = bswap ## size(*p);
+#else
+#define le_bswap(v, size) (v)
+#define be_bswap(v, size) bswap ## size(v)
+#define le_bswaps(v, size)
+#define be_bswaps(p, size) *p = bswap ## size(*p);
+#endif
+
+/*
+ * CPU_CONVERT(endian, size, type) defines six inline helpers for one
+ * endianness/width pair (shown for endian = be, size = 32):
+ *   be32_to_cpu(v)      convert a value to host order
+ *   cpu_to_be32(v)      convert a host-order value
+ *   be32_to_cpus(p)     convert *p in place to host order
+ *   cpu_to_be32s(p)     convert *p in place from host order
+ *   be32_to_cpup(p)     load *p and convert it to host order
+ *   cpu_to_be32w(p, v)  convert v and store it at p
+ * Both directions use the same swap, and the in-place forms rely on the
+ * *_bswaps macros supplying their own semicolon.
+ */
+#define CPU_CONVERT(endian, size, type)\
+static inline type endian ## size ## _to_cpu(type v)\
+{\
+ return endian ## _bswap(v, size);\
+}\
+\
+static inline type cpu_to_ ## endian ## size(type v)\
+{\
+ return endian ## _bswap(v, size);\
+}\
+\
+static inline void endian ## size ## _to_cpus(type *p)\
+{\
+ endian ## _bswaps(p, size)\
+}\
+\
+static inline void cpu_to_ ## endian ## size ## s(type *p)\
+{\
+ endian ## _bswaps(p, size)\
+}\
+\
+static inline type endian ## size ## _to_cpup(const type *p)\
+{\
+ return endian ## size ## _to_cpu(*p);\
+}\
+\
+static inline void cpu_to_ ## endian ## size ## w(type *p, type v)\
+{\
+ *p = cpu_to_ ## endian ## size(v);\
+}
+
+/* instantiate the helpers for 16/32/64-bit big- and little-endian values */
+CPU_CONVERT(be, 16, uint16_t)
+CPU_CONVERT(be, 32, uint32_t)
+CPU_CONVERT(be, 64, uint64_t)
+
+CPU_CONVERT(le, 16, uint16_t)
+CPU_CONVERT(le, 32, uint32_t)
+CPU_CONVERT(le, 64, uint64_t)
+
+/* unaligned versions (optimized for frequent unaligned accesses)*/
+
+#if defined(__i386__) || defined(__powerpc__)
+
+/* on these targets the "unaligned" accessors simply alias the regular ones */
+#define cpu_to_le16wu(p, v) cpu_to_le16w(p, v)
+#define cpu_to_le32wu(p, v) cpu_to_le32w(p, v)
+#define le16_to_cpupu(p) le16_to_cpup(p)
+#define le32_to_cpupu(p) le32_to_cpup(p)
+
+#define cpu_to_be16wu(p, v) cpu_to_be16w(p, v)
+#define cpu_to_be32wu(p, v) cpu_to_be32w(p, v)
+
+#else
+
+/* Store v at p as two little-endian bytes, one byte at a time. */
+static inline void cpu_to_le16wu(uint16_t *p, uint16_t v)
+{
+	uint8_t *bytes = (uint8_t *)p;
+
+	bytes[0] = (uint8_t)(v & 0xff);
+	bytes[1] = (uint8_t)(v >> 8);
+}
+
+/* Store v at p as four little-endian bytes, one byte at a time. */
+static inline void cpu_to_le32wu(uint32_t *p, uint32_t v)
+{
+	uint8_t *bytes = (uint8_t *)p;
+	int i;
+
+	/* least significant byte first */
+	for (i = 0; i < 4; i++)
+		bytes[i] = (uint8_t)(v >> (8 * i));
+}
+
+/* Load a little-endian 16-bit value from p, one byte at a time. */
+static inline uint16_t le16_to_cpupu(const uint16_t *p)
+{
+	const uint8_t *bytes = (const uint8_t *)p;
+	unsigned int lo = bytes[0];
+	unsigned int hi = bytes[1];
+
+	return (uint16_t)(lo | (hi << 8));
+}
+
+/*
+ * Load a little-endian 32-bit value from p, one byte at a time.
+ *
+ * Each byte is widened to uint32_t before shifting: a uint8_t promotes to
+ * (signed) int, and shifting a byte >= 0x80 left by 24 would overflow it.
+ */
+static inline uint32_t le32_to_cpupu(const uint32_t *p)
+{
+	const uint8_t *p1 = (const uint8_t *)p;
+
+	return (uint32_t)p1[0] |
+	       ((uint32_t)p1[1] << 8) |
+	       ((uint32_t)p1[2] << 16) |
+	       ((uint32_t)p1[3] << 24);
+}
+
+/* Store v at p as two big-endian bytes, one byte at a time. */
+static inline void cpu_to_be16wu(uint16_t *p, uint16_t v)
+{
+	uint8_t *bytes = (uint8_t *)p;
+
+	bytes[0] = (uint8_t)(v >> 8);
+	bytes[1] = (uint8_t)(v & 0xff);
+}
+
+/* Store v at p as four big-endian bytes, one byte at a time. */
+static inline void cpu_to_be32wu(uint32_t *p, uint32_t v)
+{
+	uint8_t *bytes = (uint8_t *)p;
+	int i;
+
+	/* most significant byte first */
+	for (i = 0; i < 4; i++)
+		bytes[i] = (uint8_t)(v >> (24 - 8 * i));
+}
+
+#endif
+
+/* cpu_to_32wu: unaligned 32-bit store in the host's own byte order */
+#ifdef WORDS_BIGENDIAN
+#define cpu_to_32wu cpu_to_be32wu
+#else
+#define cpu_to_32wu cpu_to_le32wu
+#endif
+
+#undef le_bswap
+#undef be_bswap
+#undef le_bswaps
+#undef be_bswaps
+
+#endif /* BSWAP_H */
--- /dev/null
+#!/bin/sh
+#
+# Report whether a program using libgcrypt can be built with the compiler
+# command given as $1. Prints "yes" when a small probe program compiles and
+# links against -lgcrypt, "no" otherwise. Scratch files named .gcrypt* are
+# created in the current directory and removed again.
+
+# Probe source: one libgcrypt call, so both the header and the library are
+# needed for the build to succeed.
+cat > .gcrypt.c << EOF
+#include <gcrypt.h>
+int main(void)
+{
+ gcry_md_hash_buffer(GCRY_MD_MD5, NULL, NULL, 0);
+ return 0;
+}
+EOF
+
+# $1 is left unquoted, so the shell word-splits it; compiler diagnostics are
+# discarded.
+answer="no"
+if $1 -o .gcrypt .gcrypt.c -lgcrypt 2>/dev/null ; then
+	answer="yes"
+fi
+echo "$answer"
+
+rm -f .gcrypt*
--- /dev/null
+/* Copyright (C) 2004 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+/*
+ * There are duplicates of this code in:
+ * - tools/xenstore/hashtable.c
+ */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/*
+Credit for primes table: Aaron Krowne
+ http://br.endernet.org/~akrowne/
+ http://planetmath.org/encyclopedia/GoodHashTablePrimes.html
+*/
+/* Table sizes, in increasing order; each expansion moves to the next entry. */
+static const unsigned int primes[] = {
+53, 97, 193, 389,
+769, 1543, 3079, 6151,
+12289, 24593, 49157, 98317,
+196613, 393241, 786433, 1572869,
+3145739, 6291469, 12582917, 25165843,
+50331653, 100663319, 201326611, 402653189,
+805306457, 1610612741
+};
+/* number of entries in primes[] */
+const unsigned int prime_table_length = sizeof(primes)/sizeof(primes[0]);
+/* loadlimit is ceil(tablelength * max_load_factor); exceeding it on insert
+ * triggers an expansion attempt */
+const float max_load_factor = 0.65;
+
+/*****************************************************************************/
+/*
+ * Allocate an empty hashtable.
+ *
+ * The bucket count is the first entry of primes[] that is strictly greater
+ * than minsize. hashf and eqf are stored and used for every later lookup.
+ *
+ * Returns the new table, or NULL if minsize exceeds 2^30 or memory cannot
+ * be allocated.
+ */
+struct hashtable *
+create_hashtable(unsigned int minsize,
+                 unsigned int (*hashf) (void*),
+                 int (*eqf) (void*,void*))
+{
+	struct hashtable *h;
+	unsigned int pindex, size = primes[0];
+
+	/* Check requested hashtable isn't too large */
+	if (minsize > (1u << 30)) return NULL;
+
+	/* Enforce size as prime */
+	for (pindex = 0; pindex < prime_table_length; pindex++) {
+		if (primes[pindex] > minsize) { size = primes[pindex]; break; }
+	}
+
+	h = malloc(sizeof(*h));
+	if (NULL == h) return NULL; /*oom*/
+
+	/* calloc zero-fills the buckets and fails cleanly if
+	 * size * sizeof(pointer) would overflow size_t */
+	h->table = calloc(size, sizeof(*h->table));
+	if (NULL == h->table) { free(h); return NULL; } /*oom*/
+
+	h->tablelength  = size;
+	h->primeindex   = pindex;
+	h->entrycount   = 0;
+	h->hashfn       = hashf;
+	h->eqfn         = eqf;
+	h->loadlimit    = (unsigned int) ceil(size * max_load_factor);
+	return h;
+}
+
+/*****************************************************************************/
+/*
+ * Hash key k with the table's hash function, then scramble the result.
+ * The extra mixing aims to protect against poor hash functions
+ * (logic taken from java 1.4 hashtable source).
+ */
+unsigned int
+hash(struct hashtable *h, void *k)
+{
+	unsigned int mixed = h->hashfn(k);
+
+	mixed += ~(mixed << 9);
+	mixed ^= (mixed >> 14) | (mixed << 18); /* >>> */
+	mixed += mixed << 4;
+	mixed ^= (mixed >> 10) | (mixed << 22); /* >>> */
+	return mixed;
+}
+
+/*****************************************************************************/
+/*
+ * Grow the table to the next size in primes[] and redistribute all entries.
+ *
+ * A fresh bucket array is tried first; if that allocation fails, the
+ * existing array is grown with realloc() and entries are rehashed in place.
+ *
+ * Returns non-zero (-1) on success, 0 if the table is already at its
+ * maximum size or no memory is available (the table is left unchanged).
+ */
+static int
+hashtable_expand(struct hashtable *h)
+{
+	/* Grow the table to accommodate more entries */
+	struct entry **newtable;
+	struct entry *e;
+	struct entry **pE;
+	unsigned int newsize, i, index;
+
+	/* Check we're not hitting max capacity */
+	if (h->primeindex == (prime_table_length - 1)) return 0;
+	newsize = primes[++(h->primeindex)];
+
+	newtable = (struct entry **)malloc(sizeof(struct entry*) * newsize);
+	if (NULL != newtable)
+	{
+		memset(newtable, 0, newsize * sizeof(struct entry *));
+		/* This algorithm is not 'stable'. ie. it reverses the list
+		 * when it transfers entries between the tables */
+		for (i = 0; i < h->tablelength; i++) {
+			while (NULL != (e = h->table[i])) {
+				h->table[i] = e->next;
+				index = indexFor(newsize,e->h);
+				e->next = newtable[index];
+				newtable[index] = e;
+			}
+		}
+		free(h->table);
+		h->table = newtable;
+	}
+	/* Plan B: realloc instead */
+	else
+	{
+		newtable = (struct entry **)
+			   realloc(h->table, newsize * sizeof(struct entry *));
+		if (NULL == newtable) { (h->primeindex)--; return 0; }
+		h->table = newtable;
+		/* Zero only the newly added buckets: take the address of the
+		 * first new slot and give the length in bytes, not elements. */
+		memset(&newtable[h->tablelength], 0,
+		       (newsize - h->tablelength) * sizeof(struct entry *));
+		for (i = 0; i < h->tablelength; i++) {
+			for (pE = &(newtable[i]), e = *pE; e != NULL; e = *pE) {
+				index = indexFor(newsize,e->h);
+				if (index == i)
+				{
+					/* entry stays in this bucket */
+					pE = &(e->next);
+				}
+				else
+				{
+					/* unlink and push onto its new bucket */
+					*pE = e->next;
+					e->next = newtable[index];
+					newtable[index] = e;
+				}
+			}
+		}
+	}
+	h->tablelength = newsize;
+	h->loadlimit   = (unsigned int) ceil(newsize * max_load_factor);
+	return -1;
+}
+
+/*****************************************************************************/
+/* Return the number of entries currently stored in the table. */
+unsigned int
+hashtable_count(struct hashtable *h)
+{
+ return h->entrycount;
+}
+
+/*****************************************************************************/
+/*
+ * Insert the pair (k, v) at the head of its bucket.
+ *
+ * Duplicate keys are not detected - they shouldn't be used. Returns
+ * non-zero (-1) on success and 0 if the entry could not be allocated.
+ */
+int
+hashtable_insert(struct hashtable *h, void *k, void *v)
+{
+	struct entry *node;
+	unsigned int slot;
+
+	h->entrycount++;
+	if (h->entrycount > h->loadlimit) {
+		/* The result is deliberately ignored: if growing fails we
+		 * still try to fit this one entry into the existing table,
+		 * and the next insert will attempt the expansion again. */
+		hashtable_expand(h);
+	}
+
+	node = (struct entry *)malloc(sizeof(struct entry));
+	if (node == NULL) {
+		/* oom: undo the count bump */
+		h->entrycount--;
+		return 0;
+	}
+
+	node->h = hash(h, k);
+	node->k = k;
+	node->v = v;
+
+	slot = indexFor(h->tablelength, node->h);
+	node->next = h->table[slot];
+	h->table[slot] = node;
+	return -1;
+}
+
+/*****************************************************************************/
+/*
+ * Look up key k; returns the value stored with it, or NULL when no entry
+ * matches.
+ */
+void *
+hashtable_search(struct hashtable *h, void *k)
+{
+	unsigned int wanted = hash(h, k);
+	struct entry *cur;
+
+	for (cur = h->table[indexFor(h->tablelength, wanted)];
+	     cur != NULL;
+	     cur = cur->next) {
+		/* compare hashes first to skip the heavier key comparison */
+		if (cur->h != wanted)
+			continue;
+		if (h->eqfn(k, cur->k))
+			return cur->v;
+	}
+
+	return NULL;
+}
+
+/*****************************************************************************/
+/*
+ * Remove the first entry matching key k.
+ *
+ * The entry and its stored key are freed; the value is handed back to the
+ * caller. Returns the value, or NULL when no entry matches.
+ */
+void * /* returns value associated with key */
+hashtable_remove(struct hashtable *h, void *k)
+{
+	/* TODO: consider compacting the table when the load factor drops enough,
+	 *       or provide a 'compact' method. */
+
+	struct entry *e;
+	struct entry **pE;
+	void *v;
+	unsigned int hashvalue, index;
+
+	/* hash once and reuse it for both the bucket index and the compare */
+	hashvalue = hash(h,k);
+	index = indexFor(h->tablelength,hashvalue);
+	pE = &(h->table[index]);
+	e = *pE;
+	while (NULL != e)
+	{
+		/* Check hash value to short circuit heavier comparison */
+		if ((hashvalue == e->h) && (h->eqfn(k, e->k)))
+		{
+			/* unlink via the pointer that referenced this entry */
+			*pE = e->next;
+			h->entrycount--;
+			v = e->v;
+			freekey(e->k);
+			free(e);
+			return v;
+		}
+		pE = &(e->next);
+		e = e->next;
+	}
+	return NULL;
+}
+
+/*****************************************************************************/
+/* destroy */
+/*
+ * Free the table, every entry and every stored key. Stored values are
+ * freed as well when free_values is non-zero.
+ */
+void
+hashtable_destroy(struct hashtable *h, int free_values)
+{
+	unsigned int i;
+
+	for (i = 0; i < h->tablelength; i++) {
+		struct entry *cur = h->table[i];
+
+		while (cur != NULL) {
+			struct entry *dead = cur;
+
+			/* step forward before the node is released */
+			cur = cur->next;
+			freekey(dead->k);
+			if (free_values)
+				free(dead->v);
+			free(dead);
+		}
+	}
+
+	free(h->table);
+	free(h);
+}
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
--- /dev/null
+/* Copyright (C) 2002 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+/*
+ * There are duplicates of this code in:
+ * - tools/xenstore/hashtable.h
+ */
+
+#ifndef __HASHTABLE_CWC22_H__
+#define __HASHTABLE_CWC22_H__
+
+struct hashtable;
+
+/* Example of use:
+ *
+ * struct hashtable *h;
+ * struct some_key *k;
+ * struct some_value *v;
+ *
+ * static unsigned int hash_from_key_fn( void *k );
+ * static int keys_equal_fn ( void *key1, void *key2 );
+ *
+ * h = create_hashtable(16, hash_from_key_fn, keys_equal_fn);
+ * k = (struct some_key *) malloc(sizeof(struct some_key));
+ * v = (struct some_value *) malloc(sizeof(struct some_value));
+ *
+ * (initialise k and v to suitable values)
+ *
+ * if (! hashtable_insert(h,k,v) )
+ * { exit(-1); }
+ *
+ * if (NULL == (found = hashtable_search(h,k) ))
+ * { printf("not found!"); }
+ *
+ * if (NULL == (found = hashtable_remove(h,k) ))
+ * { printf("Not found\n"); }
+ *
+ */
+
+/* Macros may be used to define type-safe(r) hashtable access functions, with
+ * methods specialized to take known key and value types as parameters.
+ *
+ * Example:
+ *
+ * Insert this at the start of your file:
+ *
+ * DEFINE_HASHTABLE_INSERT(insert_some, struct some_key, struct some_value);
+ * DEFINE_HASHTABLE_SEARCH(search_some, struct some_key, struct some_value);
+ * DEFINE_HASHTABLE_REMOVE(remove_some, struct some_key, struct some_value);
+ *
+ * This defines the functions 'insert_some', 'search_some' and 'remove_some'.
+ * These operate just like hashtable_insert etc., with the same parameters,
+ * but their function signatures have 'struct some_key *' rather than
+ * 'void *', and hence can generate compile time errors if your program is
+ * supplying incorrect data as a key (and similarly for value).
+ *
+ * Note that the hash and key equality functions passed to create_hashtable
+ * still take 'void *' parameters instead of 'struct some_key *'. This
+ * shouldn't be a difficult issue as they're only defined and passed once,
+ * and the other functions will ensure that only valid keys are supplied to
+ * them.
+ *
+ * The cost for this checking is increased code size and runtime overhead
+ * - if performance is important, it may be worth switching back to the
+ * unsafe methods once your program has been debugged with the safe methods.
+ * This just requires switching to some simple alternative defines - eg:
+ * #define insert_some hashtable_insert
+ *
+ */
+
+/*****************************************************************************
+ * create_hashtable
+ 
+ * @name create_hashtable
+ * @param minsize minimum initial size of hashtable
+ * @param hashfunction function for hashing keys
+ * @param key_eq_fn function for determining key equality
+ * @return newly created hashtable or NULL on failure
+ *
+ * The implementation in hashtable.c also returns NULL when minsize is
+ * larger than 2^30.
+ */
+
+struct hashtable *
+create_hashtable(unsigned int minsize,
+ unsigned int (*hashfunction) (void*),
+ int (*key_eq_fn) (void*,void*));
+
+/*****************************************************************************
+ * hashtable_insert
+
+ * @name hashtable_insert
+ * @param h the hashtable to insert into
+ * @param k the key - hashtable claims ownership and will free on removal
+ * @param v the value - does not claim ownership
+ * @return non-zero for successful insertion
+ *
+ * This function will cause the table to expand if the insertion would take
+ * the ratio of entries to table size over the maximum load factor.
+ *
+ * This function does not check for repeated insertions with a duplicate key.
+ * The value returned when using a duplicate key is undefined -- when
+ * the hashtable changes size, the order of retrieval of duplicate key
+ * entries is reversed.
+ * If in doubt, remove before insert.
+ */
+
+int
+hashtable_insert(struct hashtable *h, void *k, void *v);
+
+/* Expands to a function `fnname` that forwards to hashtable_insert() but
+ * takes `keytype *` / `valuetype *` instead of `void *`, so mismatched
+ * pointer types are diagnosed at compile time. */
+#define DEFINE_HASHTABLE_INSERT(fnname, keytype, valuetype) \
+int fnname (struct hashtable *h, keytype *k, valuetype *v) \
+{ \
+    int inserted = hashtable_insert(h, k, v); \
+    return inserted; \
+}
+
+/*****************************************************************************
+ * hashtable_search
+
+ * @name hashtable_search
+ * @param h the hashtable to search
+ * @param k the key to search for - does not claim ownership
+ * @return the value associated with the key, or NULL if none found
+ */
+
+void *
+hashtable_search(struct hashtable *h, void *k);
+
+/* Expands to a function `fnname` that forwards to hashtable_search() with a
+ * `keytype *` key and returns the result cast to `valuetype *`. */
+#define DEFINE_HASHTABLE_SEARCH(fnname, keytype, valuetype) \
+valuetype * fnname (struct hashtable *h, keytype *k) \
+{ \
+    valuetype *found = (valuetype *) hashtable_search(h, k); \
+    return found; \
+}
+
+/*****************************************************************************
+ * hashtable_remove
+
+ * @name hashtable_remove
+ * @param h the hashtable to remove the item from
+ * @param k the key to search for - does not claim ownership
+ * @return the value associated with the key, or NULL if none found
+ */
+
+void * /* returns value */
+hashtable_remove(struct hashtable *h, void *k);
+
+/* Expands to a function `fnname` that forwards to hashtable_remove() with a
+ * `keytype *` key and returns the removed value cast to `valuetype *`. */
+#define DEFINE_HASHTABLE_REMOVE(fnname, keytype, valuetype) \
+valuetype * fnname (struct hashtable *h, keytype *k) \
+{ \
+    valuetype *removed = (valuetype *) hashtable_remove(h, k); \
+    return removed; \
+}
+
+
+/*****************************************************************************
+ * hashtable_count
+
+ * @name hashtable_count
+ * @param h the hashtable
+ * @return the number of items stored in the hashtable
+ */
+unsigned int
+hashtable_count(struct hashtable *h);
+
+
+/*****************************************************************************
+ * hashtable_destroy
+
+ * @name hashtable_destroy
+ * @param h the hashtable
+ * @param free_values whether to call 'free' on the remaining values
+ */
+
+void
+hashtable_destroy(struct hashtable *h, int free_values);
+
+#endif /* __HASHTABLE_CWC22_H__ */
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
--- /dev/null
+/* Copyright (C) 2002, 2004 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include "hashtable_itr.h"
+#include <stdlib.h> /* defines NULL */
+
+struct hashtable_itr { /* cursor over the entries of one hashtable */
+ struct hashtable *h; /* table being iterated */
+ struct entry *e; /* current entry; NULL when there is no current entry (empty table or end reached) */
+ struct entry *parent; /* entry preceding e in its bucket chain; NULL when e is the chain head */
+ unsigned int index; /* bucket holding e; set to tablelength when there is no current entry */
+};
+
+/*****************************************************************************/
+/* hashtable_iterator - iterator constructor */
+
+/* Allocate (with malloc) an iterator positioned on the first entry of h.
+ * Returns NULL if the allocation fails.  For a table with no entries the
+ * iterator is returned with e == NULL and index == tablelength. */
+struct hashtable_itr *
+hashtable_iterator(struct hashtable *h)
+{
+    struct hashtable_itr *itr = malloc(sizeof(*itr));
+    unsigned int nbuckets, bucket;
+
+    if (itr == NULL)
+        return NULL;
+
+    nbuckets = h->tablelength;
+    itr->h = h;
+    itr->e = NULL;
+    itr->parent = NULL;
+    itr->index = nbuckets;
+
+    if (h->entrycount != 0) {
+        /* Position on the head of the first non-empty bucket. */
+        for (bucket = 0; bucket < nbuckets; bucket++) {
+            if (h->table[bucket] == NULL)
+                continue;
+            itr->e = h->table[bucket];
+            itr->index = bucket;
+            break;
+        }
+    }
+    return itr;
+}
+
+/*****************************************************************************/
+/* key - return the key of the (key,value) pair at the current position */
+/* value - return the value of the (key,value) pair at the current position */
+
+/* Return the key of the entry the iterator is positioned on.
+ * i->e is dereferenced without a NULL check. */
+void *
+hashtable_iterator_key(struct hashtable_itr *i)
+{
+    struct entry *current = i->e;
+
+    return current->k;
+}
+
+/* Return the value of the entry the iterator is positioned on.
+ * i->e is dereferenced without a NULL check. */
+void *
+hashtable_iterator_value(struct hashtable_itr *i)
+{
+    struct entry *current = i->e;
+
+    return current->v;
+}
+
+/*****************************************************************************/
+/* advance - advance the iterator to the next element
+ * returns zero if advanced to end of table */
+
+/* Move the iterator to the next entry: first along the current bucket
+ * chain, then to the head of the next non-empty bucket.
+ * Returns -1 (non-zero) if the iterator now points at an entry, 0 if it was
+ * already at the end or has just run off the end of the table. */
+int
+hashtable_iterator_advance(struct hashtable_itr *itr)
+{
+    struct entry *cur = itr->e;
+    struct entry **buckets;
+    unsigned int nbuckets, bucket;
+
+    if (cur == NULL)
+        return 0; /* no current entry: nothing to advance from */
+
+    if (cur->next != NULL) {
+        /* Stay in the same chain; the old entry becomes the parent. */
+        itr->parent = cur;
+        itr->e = cur->next;
+        return -1;
+    }
+
+    /* Chain exhausted: continue from the following bucket. */
+    nbuckets = itr->h->tablelength;
+    itr->parent = NULL;
+    bucket = ++(itr->index);
+
+    if (bucket < nbuckets) {
+        buckets = itr->h->table;
+        for (; bucket < nbuckets; bucket++) {
+            if (buckets[bucket] != NULL) {
+                itr->index = bucket;
+                itr->e = buckets[bucket];
+                return -1;
+            }
+        }
+        /* Every remaining bucket was empty. */
+        itr->index = nbuckets;
+    }
+
+    itr->e = NULL;
+    return 0;
+}
+
+/*****************************************************************************/
+/* remove - remove the entry at the current iterator position
+ * and advance the iterator, if there is a successive
+ * element.
+ * If you want the value, read it before you remove:
+ * beware memory leaks if you don't.
+ * Returns zero if end of iteration. */
+
+/* Unlink the entry at the iterator position from the table, free its key
+ * (via freekey) and the entry itself, and advance the iterator.
+ * The value is NOT freed: read it with hashtable_iterator_value() first.
+ * Returns 0 if the iterator has no current entry or reached the end of the
+ * table, non-zero if it now points at the following entry. */
+int
+hashtable_iterator_remove(struct hashtable_itr *itr)
+{
+    struct entry *remember_e, *remember_parent;
+    int ret;
+
+    /* No current entry (empty table or iteration already finished):
+     * there is nothing to remove and itr->e must not be dereferenced. */
+    if (NULL == itr->e) return 0;
+
+    /* Do the removal */
+    if (NULL == (itr->parent))
+    {
+        /* element is head of a chain */
+        itr->h->table[itr->index] = itr->e->next;
+    } else {
+        /* element is mid-chain */
+        itr->parent->next = itr->e->next;
+    }
+    /* itr->e is now outside the hashtable */
+    remember_e = itr->e;
+    itr->h->entrycount--;
+    freekey(remember_e->k);
+
+    /* Advance the iterator, correcting the parent: advance() records the
+     * entry it moved away from as the parent, but that entry is the one
+     * being removed, so the real predecessor is the old parent. */
+    remember_parent = itr->parent;
+    ret = hashtable_iterator_advance(itr);
+    if (itr->parent == remember_e) { itr->parent = remember_parent; }
+    free(remember_e);
+    return ret;
+}
+
+/*****************************************************************************/
+/* Point the supplied iterator at the entry of h whose key equals k.
+ * On a match all four iterator fields are overwritten and -1 (non-zero) is
+ * returned; if no entry matches, the iterator is left untouched and 0 is
+ * returned. */
+int
+hashtable_iterator_search(struct hashtable_itr *itr,
+                          struct hashtable *h, void *k)
+{
+    unsigned int hashvalue = hash(h, k);
+    unsigned int bucket = indexFor(h->tablelength, hashvalue);
+    struct entry *prev = NULL;
+    struct entry *cur;
+
+    for (cur = h->table[bucket]; cur != NULL; prev = cur, cur = cur->next)
+    {
+        /* Compare the stored hash first to skip the heavier key comparison */
+        if (hashvalue != cur->h)
+            continue;
+        if (!h->eqfn(k, cur->k))
+            continue;
+
+        itr->index = bucket;
+        itr->e = cur;
+        itr->parent = prev;
+        itr->h = h;
+        return -1;
+    }
+    return 0;
+}
+
+
+/*
+ * Copyright (c) 2002, 2004, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
--- /dev/null
+/* Copyright (C) 2002, 2004 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#ifndef __HASHTABLE_ITR_CWC22__
+#define __HASHTABLE_ITR_CWC22__
+#include "hashtable.h"
+#include "hashtable_private.h" /* needed to enable inlining */
+
+struct hashtable_itr;
+
+/*****************************************************************************/
+/* hashtable_iterator
+ */
+
+struct hashtable_itr *
+hashtable_iterator(struct hashtable *h);
+
+/*****************************************************************************/
+/* hashtable_iterator_key
+ * - return the key of the (key,value) pair at the current position */
+
+void *
+hashtable_iterator_key(struct hashtable_itr *i);
+
+/*****************************************************************************/
+/* value - return the value of the (key,value) pair at the current position */
+
+void *
+hashtable_iterator_value(struct hashtable_itr *i);
+
+/*****************************************************************************/
+/* advance - advance the iterator to the next element
+ * returns zero if advanced to end of table */
+
+int
+hashtable_iterator_advance(struct hashtable_itr *itr);
+
+/*****************************************************************************/
+/* remove - remove current element and advance the iterator to the next element
+ * NB: if you need the value to free it, read it before
+ * removing. ie: beware memory leaks!
+ * returns zero if advanced to end of table */
+
+int
+hashtable_iterator_remove(struct hashtable_itr *itr);
+
+/*****************************************************************************/
+/* search - overwrite the supplied iterator, to point to the entry
+ * matching the supplied key.
+ * h points to the hashtable to be searched.
+ * returns zero if not found. */
+int
+hashtable_iterator_search(struct hashtable_itr *itr,
+ struct hashtable *h, void *k);
+
+/* Expands to a function `fnname` that forwards to
+ * hashtable_iterator_search() but takes the key as `keytype *`. */
+#define DEFINE_HASHTABLE_ITERATOR_SEARCH(fnname, keytype) \
+int fnname (struct hashtable_itr *i, struct hashtable *h, keytype *k) \
+{ \
+    int found = hashtable_iterator_search(i, h, k); \
+    return found; \
+}
+
+
+
+#endif /* __HASHTABLE_ITR_CWC22__*/
+
+/*
+ * Copyright (c) 2002, 2004, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
--- /dev/null
+/* Copyright (C) 2002, 2004 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+/*
+ * There are duplicates of this code in:
+ * - tools/xenstore/hashtable_private.h
+ */
+
+#ifndef __HASHTABLE_PRIVATE_CWC22_H__
+#define __HASHTABLE_PRIVATE_CWC22_H__
+
+#include "hashtable.h"
+
+/*****************************************************************************/
+struct entry /* one (key,value) node in a bucket chain */
+{
+ void *k, *v; /* key and value pointers; the key is released with freekey() when an entry is removed through an iterator */
+ unsigned int h; /* hash stored with the entry; lookups compare it before calling eqfn */
+ struct entry *next; /* next entry in the same bucket, NULL at the end of the chain */
+};
+
+struct hashtable {
+ unsigned int tablelength; /* number of buckets in table; indexFor() reduces hashes modulo this */
+ struct entry **table; /* array of bucket chain heads, NULL for an empty bucket */
+ unsigned int entrycount; /* number of entries currently stored */
+ unsigned int loadlimit; /* presumably the entry count that triggers expansion - not used in this header; confirm in hashtable.c */
+ unsigned int primeindex; /* presumably an index into a table of prime sizes - confirm in hashtable.c */
+ unsigned int (*hashfn) (void *k); /* user-supplied key hash function */
+ int (*eqfn) (void *k1, void *k2); /* user-supplied key equality test; non-zero means equal */
+};
+
+/*****************************************************************************/
+unsigned int
+hash(struct hashtable *h, void *k);
+
+/*****************************************************************************/
+/* indexFor */
+/* Map a hash value to a bucket index in [0, tablelength).
+ * tablelength must be non-zero (it is used as a modulus). */
+static inline unsigned int
+indexFor(unsigned int tablelength, unsigned int hashvalue) {
+    return (hashvalue % tablelength);
+}
+
+/* Only works if tablelength == 2^N */
+/*static inline unsigned int
+indexFor(unsigned int tablelength, unsigned int hashvalue)
+{
+ return (hashvalue & (tablelength - 1u));
+}
+*/
+
+/*****************************************************************************/
+#define freekey(X) free(X)
+/*define freekey(X) ; */
+
+
+/*****************************************************************************/
+
+#endif /* __HASHTABLE_PRIVATE_CWC22_H__*/
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
--- /dev/null
+/* Copyright (C) 2002 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include "hashtable_utility.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/*****************************************************************************/
+/* hashtable_change
+ *
+ * function to change the value associated with a key, where there already
+ * exists a value bound to the key in the hashtable.
+ * Source due to Holger Schemel.
+ *
+ * */
+/* Replace the value bound to an existing key.
+ * The previous value is released with free() and v is stored in its place.
+ * k is only used for the lookup.
+ * Returns -1 (non-zero) if the key was found and its value replaced,
+ * 0 if the key is not in the table (nothing is changed). */
+int
+hashtable_change(struct hashtable *h, void *k, void *v)
+{
+    unsigned int hashvalue = hash(h, k);
+    unsigned int bucket = indexFor(h->tablelength, hashvalue);
+    struct entry *cur;
+
+    for (cur = h->table[bucket]; cur != NULL; cur = cur->next)
+    {
+        /* Compare the stored hash first to skip the heavier key comparison */
+        if (hashvalue != cur->h)
+            continue;
+        if (!h->eqfn(k, cur->k))
+            continue;
+
+        free(cur->v);
+        cur->v = v;
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
--- /dev/null
+/* Copyright (C) 2002 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#ifndef __HASHTABLE_CWC22_UTILITY_H__
+#define __HASHTABLE_CWC22_UTILITY_H__
+
+/*****************************************************************************
+ * hashtable_change
+ *
+ * function to change the value associated with a key, where there already
+ * exists a value bound to the key in the hashtable.
+ * Source due to Holger Schemel.
+ *
+ * @name hashtable_change
+ * @param h the hashtable
+ * @param key
+ * @param value
+ *
+ */
+int
+hashtable_change(struct hashtable *h, void *k, void *v);
+
+#endif /* __HASHTABLE_CWC22_UTILITY_H__ */
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
--- /dev/null
+/* img2qcow.c
+ *
+ * Generates a qcow format disk and fills it from an existing image.
+ *
+ * (c) 2006 Julian Chesterfield and Andrew Warfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <zlib.h>
+#include <inttypes.h>
+#include <libaio.h>
+
+#include "bswap.h"
+#include "aes.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-disktype.h"
+#include "qcow.h"
+#include "blk.h"
+
+
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* *BSD has no O_LARGEFILE */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+#define BLOCK_PROCESSSZ 4096
+#define QCOW_VBD 0
+#define PROGRESS_QUANT 2
+
+static int running = 1, complete = 0;
+static int returned_events = 0, submit_events = 0;
+static uint32_t read_idx = 0;
+td_driver_t *ddqcow;
+td_vbd_t* qcow_vbd;
+static uint64_t prev = 0, written = 0;
+static char output[(100/PROGRESS_QUANT) + 5];
+
+extern tapdisk_server_t server;
+
+
+/* Debug helper: hex-dump `length` bytes starting at `ptr` through DFPRINTF.
+ * Bytes are printed as two hex digits each, grouped in pairs, 16 per line. */
+static void print_bytes(void *ptr, int length)
+{
+    const unsigned char *p = ptr;
+    int k;
+
+    DFPRINTF("Buf dump, length %d:\n",length);
+    for (k = 0; k < length; k++) {
+        /* Zero-padded so adjacent bytes cannot be misread (e.g. 01 23 vs 12 03) */
+        DFPRINTF("%02x", p[k]);
+        /* Separators are keyed on the count of bytes printed so far (k + 1),
+         * so the first line break falls after 16 bytes, not after the first. */
+        if ((k + 1) % 16 == 0) DFPRINTF("\n");
+        else if ((k + 1) % 2 == 0) DFPRINTF(" ");
+    }
+    DFPRINTF("\n");
+}
+
+/* Advance the textual progress bar held in `output` by at most one step.
+ * progress: amount transferred so far; size: total amount (same units).
+ * The bar has 100/PROGRESS_QUANT steps; `prev` counts the steps drawn. */
+static void debug_output(uint64_t progress, uint64_t size)
+{
+    const uint64_t steps = 100/PROGRESS_QUANT;
+    /* Amount of progress represented by one step of the bar */
+    uint64_t blocks = size/steps;
+
+    /* Images smaller than `steps` units would otherwise divide by zero */
+    if (blocks == 0)
+        blocks = 1;
+
+    /* The bar is full: drawing another step would overwrite the closing
+     * ']' and the terminator, and eventually run past `output`. */
+    if (prev >= steps)
+        return;
+
+    if (progress/blocks > prev) {
+        memcpy(output+prev+1,"=>",2);
+        prev++;
+        DFPRINTF("\r%s %"PRIi64"%%",
+                 output, (int64_t)((prev-1)*PROGRESS_QUANT));
+    }
+}
+
+/* Fill in driver->size (in sectors) and driver->sector_size for the image
+ * open on fd.  Block devices are queried through blk_getimagesize() /
+ * blk_getsectorsize(); anything else is sized from fstat().
+ * Returns 0 on success, -EINVAL if the image cannot be stat'ed or sized. */
+static int get_image_info(td_disk_info_t *driver, int fd)
+{
+    struct stat st;
+    uint64_t sector_size = DEFAULT_SECTOR_SIZE;
+
+    if (fstat(fd, &st) != 0) {
+        DFPRINTF("ERROR: fstat failed, Couldn't stat image");
+        return -EINVAL;
+    }
+
+    if (S_ISBLK(st.st_mode)) {
+        /*Accessing block device directly*/
+        if (blk_getimagesize(fd, &driver->size) != 0)
+            return -EINVAL;
+
+        DFPRINTF("Image size: \n\tpre sector_shift  [%"PRIu64"]\n\tpost "
+                 "sector_shift [%"PRIu64"]\n",
+                 (uint64_t)(driver->size << SECTOR_SHIFT),
+                 (uint64_t)driver->size);
+
+        /*Get the sector size; fall back to the default if the query
+         * fails so that sector_size is never left unset*/
+        if (blk_getsectorsize(fd, &sector_size) != 0)
+            sector_size = DEFAULT_SECTOR_SIZE;
+        driver->sector_size = sector_size;
+
+    } else {
+        /*Local file? try fstat instead*/
+        driver->size = (st.st_size >> SECTOR_SHIFT);
+        driver->sector_size = DEFAULT_SECTOR_SIZE;
+        DFPRINTF("Image size: [%"PRIu64"]\n",
+                 (uint64_t)driver->size);
+    }
+
+    return 0;
+}
+
+/* Completion callback for the write requests queued by this tool.
+ * treq.buf is the data buffer that was handed to the driver; it is
+ * released here.  On success the completion is counted in
+ * returned_events.  A failed request (err < 0) is fatal: the output image
+ * would be incomplete, and returning without counting the completion
+ * would leave the submitter waiting for an event that never arrives. */
+void send_responses(td_request_t treq, int err)
+{
+    if (err < 0) {
+        DFPRINTF("AIO FAILURE: res [%d]!\n",err);
+        free(treq.buf);
+        exit(-1);
+    }
+
+    returned_events++;
+
+    free(treq.buf);
+}
+
+/* img2qcow <QCOW FILENAME> <SRC IMAGE>
+ *
+ * Creates the qcow file argv[1], sized from the source image argv[2], opens
+ * it through a tapdisk VBD and copies the source into it in
+ * BLOCK_PROCESSSZ-byte writes, waiting for each write to complete before
+ * issuing the next.  Exits with -1 on setup or I/O errors, returns the
+ * tapdisk error code if the server/VBD cannot be set up, 0 on success. */
+int main(int argc, const char *argv[])
+{
+    int ret = -1, fd, len, err;
+    uint64_t i;
+    char *buf = NULL;
+    td_request_t treq;
+    td_disk_info_t info;
+    td_vbd_request_t* vreq;
+
+    if (argc != 3) {
+        fprintf(stderr, "Qcow-utils: v1.0.0\n");
+        fprintf(stderr, "usage: %s <QCOW FILENAME> <SRC IMAGE>\n",
+                argv[0]);
+        exit(-1);
+    }
+
+
+    /*Open image*/
+    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+
+    if (fd == -1) {
+        DFPRINTF("Unable to open [%s], (err %d)!\n",argv[2],0 - errno);
+        exit(-1);
+    }
+
+    /* info is left uninitialised on failure, so it must not be used */
+    if (get_image_info(&info, fd) != 0) {
+        DFPRINTF("Unable to get image info for [%s]\n", argv[2]);
+        exit(-1);
+    }
+
+    /*Create qcow file*/
+    ret = qcow_create(argv[1],info.size<<SECTOR_SHIFT,NULL,0);
+
+    if (ret < 0) {
+        DFPRINTF("Unable to create QCOW file\n");
+        exit(-1);
+    } else DFPRINTF("Qcow file created: size %"PRIu64" sectors\n",
+                    (uint64_t)info.size);
+
+    /* Open Qcow image*/
+    err = tapdisk_server_initialize();
+    if( err ) {
+        DPRINTF("img2qcow Couldn't initialize server instance.\n");
+        return err;
+    }
+
+    err=tapdisk_vbd_initialize(QCOW_VBD);
+    if( err ) {
+        DPRINTF("img2qcow Couldn't initialize qcow vbd.\n");
+        return err;
+    }
+
+    qcow_vbd = tapdisk_server_get_vbd(QCOW_VBD);
+    if (!qcow_vbd) {
+        err = -ENODEV;
+        DPRINTF("img2qcow Couldn't create qcow vbd.\n");
+        return err;
+    }
+
+    err = tapdisk_vbd_open_vdi(qcow_vbd, argv[1], DISK_TYPE_QCOW,
+                               TAPDISK_STORAGE_TYPE_DEFAULT,
+                               0);
+    if( err ) {
+        DPRINTF("img2qcow Couldn't open qcow file.\n");
+        return err;
+    }
+
+    ddqcow=(tapdisk_vbd_first_image(qcow_vbd))->driver;
+
+    /*Initialise the output string*/
+    memset(output,0x20,(100/PROGRESS_QUANT)+5);
+    output[0] = '[';
+    output[(100/PROGRESS_QUANT)+2] = ']';
+    output[(100/PROGRESS_QUANT)+3] = '\0';
+    DFPRINTF("%s",output);
+
+    /* i is the next source sector to copy */
+    i = 0;
+    while (running) {
+
+        if (!complete) {
+            /*Read sector from image*/
+            if (lseek(fd, i*512, SEEK_SET) == (off_t)-1) {
+                DFPRINTF("Unable to access file offset %"PRIu64"\n",
+                         (uint64_t)i*512);
+                exit(-1);
+            }
+
+            if( (ret = posix_memalign((void **)&buf,
+                                      BLOCK_PROCESSSZ,
+                                      BLOCK_PROCESSSZ)) != 0) {
+                DFPRINTF("Unable to read memalign buf (%d)\n",ret);
+                exit(-1);
+            }
+
+            /*We attempt to read 4k sized blocks*/
+            len = read(fd, buf, BLOCK_PROCESSSZ);
+            if (len < 512) {
+                DFPRINTF("Unable to read sector %"PRIu64"\n",
+                         (uint64_t) (i));
+                /* The buffer was never handed to the driver, so the
+                 * completion callback will not free it */
+                free(buf);
+                buf = NULL;
+                complete = 1;
+                continue;
+            }
+
+            /* bytes read -> whole sectors */
+            len = (len >> 9);
+
+            treq.op = TD_OP_WRITE;
+            treq.buf = buf;
+            treq.sec = i;
+            treq.secs = len;
+            treq.image = 0;
+            treq.cb = send_responses;
+            treq.cb_data = buf;
+            treq.id = 0;
+            treq.sidx = 0;
+            vreq = calloc(1, sizeof(td_vbd_request_t));
+            if (!vreq) {
+                DFPRINTF("Unable to allocate vbd request\n");
+                exit(-1);
+            }
+            treq.private = vreq;
+
+            vreq->submitting = 1;
+            INIT_LIST_HEAD(&vreq->next);
+            tapdisk_vbd_move_request(treq.private,
+                                     &qcow_vbd->pending_requests);
+
+            ddqcow->ops->td_queue_write(ddqcow,treq);
+            --vreq->submitting;
+
+            submit_events++;
+
+            i += len;
+
+            if (i == info.size)
+                complete = 1;
+
+            tapdisk_submit_all_tiocbs(&server.aio_queue);
+            debug_output(i,info.size);
+        }
+
+        /* Wait until every submitted write has been completed */
+        while(returned_events != submit_events) {
+            ret = scheduler_wait_for_events(&server.scheduler);
+            if (ret < 0) {
+                DFPRINTF("server wait returned %d\n", ret);
+                sleep(2);
+            }
+        }
+
+        if (complete && (returned_events == submit_events))
+            running = 0;
+    }
+    memcpy(output+prev+1,"=",1);
+    DFPRINTF("\r%s     100%%\nTRANSFER COMPLETE\n\n", output);
+
+    ddqcow->ops->td_close(ddqcow);
+    free(ddqcow->data);
+
+    return 0;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include "io-optimize.h"
+#include "tapdisk-log.h"
+
+#if (!defined(TEST) && defined(DEBUG))
+#define DBG(ctx, f, a...) tlog_write(TLOG_DBG, f, ##a)
+#elif defined(TEST)
+#define DBG(ctx, f, a...) printf(f, ##a)
+#else
+#define DBG(ctx, f, a...) ((void)0)
+#endif
+
+static void print_merged_iocbs(struct opioctx *ctx,
+ struct iocb **iocbs, int num_iocbs);
+
+/* Release the four arrays owned by ctx and reset the pointers to NULL.
+ * The ctx structure itself is not freed. */
+void
+opio_free(struct opioctx *ctx)
+{
+    free(ctx->event_queue);
+    free(ctx->iocb_queue);
+    free(ctx->free_opios);
+    free(ctx->opios);
+
+    /* Clear the pointers so a later opio_free() is harmless */
+    ctx->event_queue = NULL;
+    ctx->iocb_queue = NULL;
+    ctx->free_opios = NULL;
+    ctx->opios = NULL;
+}
+
+/* Initialise ctx for up to num_iocbs optimised requests.
+ * ctx is zeroed, then four arrays of num_iocbs elements are allocated
+ * (opios, free_opios, iocb_queue, event_queue) and every opio is placed on
+ * the free list.
+ * Returns 0 on success; on allocation failure everything is released via
+ * opio_free() and -ENOMEM is returned. */
+int
+opio_init(struct opioctx *ctx, int num_iocbs)
+{
+    int i;
+
+    memset(ctx, 0, sizeof(struct opioctx));
+
+    ctx->num_opios     = num_iocbs;
+    ctx->free_opio_cnt = num_iocbs;
+    /* Pass the element count to calloc rather than multiplying by hand,
+     * so an overflowing total size fails the allocation instead of
+     * silently wrapping to a smaller one */
+    ctx->opios         = calloc(num_iocbs, sizeof(struct opio));
+    ctx->free_opios    = calloc(num_iocbs, sizeof(struct opio *));
+    ctx->iocb_queue    = calloc(num_iocbs, sizeof(struct iocb *));
+    ctx->event_queue   = calloc(num_iocbs, sizeof(struct io_event));
+
+    if (!ctx->opios || !ctx->free_opios ||
+        !ctx->iocb_queue || !ctx->event_queue)
+        goto fail;
+
+    /* Initially every opio is free */
+    for (i = 0; i < num_iocbs; i++)
+        ctx->free_opios[i] = &ctx->opios[i];
+
+    return 0;
+
+ fail:
+    opio_free(ctx);
+    return -ENOMEM;
+}
+
+/*
+ * Pop one descriptor off the free stack; NULL when the stack is empty.
+ */
+static inline struct opio *
+alloc_opio(struct opioctx *ctx)
+{
+	struct opio *slot = NULL;
+
+	if (ctx->free_opio_cnt > 0) {
+		ctx->free_opio_cnt--;
+		slot = ctx->free_opios[ctx->free_opio_cnt];
+	}
+
+	return slot;
+}
+
+/*
+ * Zero a descriptor and push it back onto the free stack.
+ */
+static inline void
+free_opio(struct opioctx *ctx, struct opio *op)
+{
+	int top = ctx->free_opio_cnt;
+
+	memset(op, 0, sizeof(*op));
+	ctx->free_opios[top] = op;
+	ctx->free_opio_cnt   = top + 1;
+}
+
+/*
+ * Copy the saved data pointer, buffer and byte count from the descriptor
+ * back into the iocb it is attached to.
+ */
+static inline void
+restore_iocb(struct opio *op)
+{
+	struct iocb *target = op->iocb;
+
+	target->u.c.nbytes = op->nbytes;
+	target->u.c.buf    = op->buf;
+	target->data       = op->data;
+}
+
+/*
+ * Report whether io->data points into this context's opio pool, which is
+ * how an iocb that has been taken over by the optimizer is recognised.
+ * Returns 1 if so, 0 otherwise.
+ */
+static inline int
+iocb_optimized(struct opioctx *ctx, struct iocb *io)
+{
+	unsigned long pool_lo = (unsigned long)ctx->opios;
+	unsigned long pool_hi = pool_lo + (ctx->num_opios * sizeof(struct opio));
+	unsigned long tag     = (unsigned long)io->data;
+
+	if (tag < pool_lo)
+		return 0;
+
+	return (tag < pool_hi);
+}
+
+/*
+ * 1 when request r starts at the file offset where request l ends.
+ */
+static inline int
+contiguous_sectors(struct iocb *l, struct iocb *r)
+{
+	if (l->u.c.offset + l->u.c.nbytes != r->u.c.offset)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * 1 when r's buffer begins at the address where l's buffer ends.
+ */
+static inline int
+contiguous_buffers(struct iocb *l, struct iocb *r)
+{
+	if (l->u.c.buf + l->u.c.nbytes != r->u.c.buf)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Two requests are contiguous when they target the same file descriptor
+ * and are adjacent both on disk and in memory.
+ */
+static inline int
+contiguous_iocbs(struct iocb *l, struct iocb *r)
+{
+	if (l->aio_fildes != r->aio_fildes)
+		return 0;
+
+	if (!contiguous_sectors(l, r))
+		return 0;
+
+	if (!contiguous_buffers(l, r))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Start a chain that consists of the descriptor alone.
+ */
+static inline void
+init_opio_list(struct opio *op)
+{
+	op->list.tail = op;
+	op->list.head = op;
+}
+
+/*
+ * Attach a fresh descriptor to an iocb: save the fields the optimizer may
+ * overwrite (buf, nbytes, offset, data), then redirect io->data to the
+ * descriptor.  Returns NULL when the pool is exhausted.
+ */
+static struct opio *
+opio_iocb_init(struct opioctx *ctx, struct iocb *io)
+{
+	struct opio *desc = alloc_opio(ctx);
+
+	if (desc == NULL)
+		return NULL;
+
+	desc->iocb   = io;
+	desc->offset = io->u.c.offset;
+	desc->nbytes = io->u.c.nbytes;
+	desc->buf    = io->u.c.buf;
+
+	/* keep the caller's cookie before replacing it with the descriptor */
+	desc->data = io->data;
+	io->data   = desc;
+
+	init_opio_list(desc);
+
+	return desc;
+}
+
+/*
+ * Return the descriptor already attached to io, or attach a new one.
+ * May return NULL when a new descriptor is needed but none is free.
+ */
+static inline struct opio *
+opio_get(struct opioctx *ctx, struct iocb *io)
+{
+	return iocb_optimized(ctx, io)
+		? (struct opio *)io->data
+		: opio_iocb_init(ctx, io);
+}
+
+/*
+ * Append io to the chain led by head: head's byte count grows by io's
+ * and io's descriptor becomes the new tail of the chain.
+ * Returns 0, or -ENOMEM when a descriptor could not be obtained.
+ */
+static int
+merge_tail(struct opioctx *ctx, struct iocb *head, struct iocb *io)
+{
+	struct opio *lead, *member, *old_tail;
+
+	lead = opio_get(ctx, head);
+	if (lead == NULL)
+		return -ENOMEM;
+
+	member = opio_get(ctx, io);
+	if (member == NULL)
+		return -ENOMEM;
+
+	member->head = lead;
+	head->u.c.nbytes += io->u.c.nbytes;
+
+	old_tail        = lead->list.tail;
+	old_tail->next  = member;
+	lead->list.tail = member;
+
+	return 0;
+}
+
+/*
+ * Try to fold io into head.  Both must use the same opcode and be
+ * contiguous; otherwise -EINVAL.  Returns merge_tail()'s result otherwise.
+ */
+static int
+merge(struct opioctx *ctx, struct iocb *head, struct iocb *io)
+{
+	if (head->aio_lio_opcode != io->aio_lio_opcode ||
+	    !contiguous_iocbs(head, io))
+		return -EINVAL;
+
+	return merge_tail(ctx, head, io);
+}
+
+/*
+ * Coalesce adjacent requests in queue[0..num) in place.
+ *
+ * Each request is offered to the most recent surviving entry; if the
+ * merge is refused (different opcode, not contiguous, or no descriptor
+ * left) it becomes the next surviving entry.  Returns the number of
+ * entries left at the front of queue.
+ *
+ * A non-positive num yields 0.  If num exceeds the context capacity the
+ * scratch queue cannot hold a copy, so the queue is left unmerged and num
+ * is returned.
+ */
+int
+io_merge(struct opioctx *ctx, struct iocb **queue, int num)
+{
+	int i, on_queue;
+	struct iocb **q;
+
+	if (num <= 0)
+		return 0;
+
+	/* iocb_queue was allocated with num_opios slots in opio_init() */
+	if (num > ctx->num_opios)
+		return num;
+
+	on_queue = 0;
+	q = ctx->iocb_queue;
+	memcpy(q, queue, num * sizeof(struct iocb *));
+
+	for (i = 1; i < num; i++) {
+		struct iocb *io = q[i];
+
+		if (merge(ctx, queue[on_queue], io) != 0)
+			queue[++on_queue] = io;
+	}
+
+#if (defined(TEST) || defined(DEBUG))
+	print_merged_iocbs(ctx, queue, on_queue + 1);
+#endif
+
+	return on_queue + 1;
+}
+
+/*
+ * Undo a merge: walk the descriptor chain hanging off io, restore every
+ * member iocb, store it into queue[] and recycle its descriptor.
+ * Returns the number of iocbs written to queue.
+ */
+static int
+expand_iocb(struct opioctx *ctx, struct iocb **queue, struct iocb *io)
+{
+	int count = 0;
+	struct opio *cur, *following;
+
+	for (cur = (struct opio *)io->data; cur != NULL; cur = following) {
+		/* free_opio() zeroes the descriptor, so grab next first */
+		following = cur->next;
+
+		restore_iocb(cur);
+		queue[count] = cur->iocb;
+		count++;
+		free_opio(ctx, cur);
+	}
+
+	return count;
+}
+
+/*
+ * Rebuild queue from entries idx..num-1, replacing every merged iocb by
+ * the original requests it was built from.  Entries before idx are not
+ * carried over.  Returns the new number of entries at the front of queue.
+ */
+int
+io_expand_iocbs(struct opioctx *ctx, struct iocb **queue, int idx, int num)
+{
+	int pos, filled = 0;
+	struct iocb **snapshot = ctx->iocb_queue;
+
+	if (num == 0)
+		return 0;
+
+	memcpy(snapshot, queue, num * sizeof(struct iocb *));
+
+	for (pos = idx; pos < num; pos++) {
+		struct iocb *cur = snapshot[pos];
+
+		if (iocb_optimized(ctx, cur)) {
+			filled += expand_iocb(ctx, queue + filled, cur);
+			continue;
+		}
+
+		queue[filled++] = cur;
+	}
+
+	return filled;
+}
+
+/*
+ * Turn the completion of a merged iocb into one completion per member.
+ *
+ * The merged result is all-or-nothing: a full transfer gives every member
+ * its own byte count; a negative result is propagated to all members; any
+ * other short result is reported as -EIO.  Output events are written to
+ * queue[] starting at idx; only obj and res are filled in.  Each member
+ * iocb is restored and its descriptor recycled.
+ *
+ * Returns the index following the last event written.
+ */
+static int
+expand_event(struct opioctx *ctx,
+	     struct io_event *event, struct io_event *queue, int idx)
+{
+	int err;
+	struct iocb *merged = event->obj;
+	struct opio *cur, *following;
+
+	if (event->res == merged->u.c.nbytes)
+		err = 0;
+	else if ((int)event->res < 0)
+		err = (int)event->res;
+	else
+		err = -EIO;
+
+	for (cur = (struct opio *)merged->data; cur != NULL; cur = following) {
+		struct io_event *out = &queue[idx];
+
+		following = cur->next;
+		idx++;
+
+		out->obj = cur->iocb;
+		out->res = (err ? err : cur->nbytes);
+
+		restore_iocb(cur);
+		free_opio(ctx, cur);
+	}
+
+	return idx;
+}
+
+/*
+ * Rewrite events[] in place so that every completion of a merged iocb is
+ * replaced by the completions of its members; other events are copied
+ * through unchanged.  Returns the resulting number of events.
+ */
+int
+io_split(struct opioctx *ctx, struct io_event *events, int num)
+{
+	int i, filled = 0;
+	struct io_event *snapshot = ctx->event_queue;
+
+	if (num == 0)
+		return 0;
+
+	memcpy(snapshot, events, num * sizeof(struct io_event));
+
+	for (i = 0; i < num; i++) {
+		struct io_event *cur = &snapshot[i];
+		struct iocb *io = cur->obj;
+
+		if (iocb_optimized(ctx, io))
+			filled = expand_event(ctx, cur, events, filled);
+		else
+			events[filled++] = *cur;
+	}
+
+	return filled;
+}
+
+/******************************************************************************
+debug print functions
+******************************************************************************/
+/*
+ * Emit one DBG line describing an iocb, preceded by prefix.
+ */
+static inline void
+__print_iocb(struct opioctx *ctx, struct iocb *io, char *prefix)
+{
+	char *type;
+
+	if (io->aio_lio_opcode == IO_CMD_PREAD)
+		type = "read";
+	else
+		type = "write";
+
+	DBG(ctx, "%soff: %08llx, nbytes: %04lx, buf: %p, type: %s, data: %08lx,"
+	    " optimized: %d\n", prefix, io->u.c.offset, io->u.c.nbytes,
+	    io->u.c.buf, type, (unsigned long)io->data,
+	    iocb_optimized(ctx, io));
+}
+
+/* print_iocb(ctx, io): __print_iocb() with an empty prefix. */
+static char *null_prefix = "";
+#define print_iocb(ctx, io) __print_iocb(ctx, io, null_prefix)
+
+/*
+ * Dump num_iocbs queue entries, each prefixed with its index.
+ */
+static void
+print_iocbs(struct opioctx *ctx, struct iocb **iocbs, int num_iocbs)
+{
+	int n = 0;
+	char label[10];
+
+	DBG(ctx, "iocbs:\n");
+
+	while (n < num_iocbs) {
+		snprintf(label, sizeof(label), "%d: ", n);
+		__print_iocb(ctx, iocbs[n], label);
+		n++;
+	}
+}
+
+/*
+ * Dump every iocb on a descriptor chain starting at op, numbering the
+ * lines from *cnt and advancing *cnt once per line.
+ */
+static void
+print_optimized_iocbs(struct opioctx *ctx, struct opio *op, int *cnt)
+{
+	char label[10];
+	struct opio *cur;
+
+	for (cur = op; cur != NULL; cur = cur->next) {
+		snprintf(label, sizeof(label), " %d: ", *cnt);
+		(*cnt)++;
+		__print_iocb(ctx, cur->iocb, label);
+	}
+}
+
+/*
+ * Dump a merged queue: every entry is printed, and for entries owned by
+ * the optimizer the chain members after the head are printed as well.
+ * One running counter numbers all lines.
+ */
+static void
+print_merged_iocbs(struct opioctx *ctx, struct iocb **iocbs, int num_iocbs)
+{
+	int slot;
+	int line = 0;
+	char label[10];
+
+	DBG(ctx, "merged iocbs:\n");
+
+	for (slot = 0; slot < num_iocbs; slot++) {
+		struct iocb *cur = iocbs[slot];
+		struct opio *lead;
+
+		snprintf(label, sizeof(label), "%d: ", line);
+		line++;
+		__print_iocb(ctx, cur, label);
+
+		if (!iocb_optimized(ctx, cur))
+			continue;
+
+		lead = (struct opio *)cur->data;
+		print_optimized_iocbs(ctx, lead->next, &line);
+	}
+}
+
+/*
+ * Dump the iocb referenced by each of the first num_events events.
+ */
+static void
+print_events(struct opioctx *ctx, struct io_event *events, int num_events)
+{
+	struct io_event *ev  = events;
+	struct io_event *end = events + num_events;
+
+	for (; ev < end; ev++) {
+		struct iocb *io = ev->obj;
+
+		print_iocb(ctx, io);
+	}
+}
+/******************************************************************************
+end debug print functions
+******************************************************************************/
+
+#if defined(TEST)
+
+/*
+ * Self-test tagging: each test iocb's data pointer carries its index in
+ * the iocb array plus two flag bits.
+ *   hmask - set on the first iocb of a generated group
+ *   smask - set when every iocb of the group has its own buffer
+ * data_idx() recovers the index from the low 28 bits.
+ */
+#define hmask 0x80000000UL
+#define smask 0x40000000UL
+#define make_data(idx, is_head, sparse) \
+ (void *)((idx) | ((is_head) ? hmask : 0) | ((sparse) ? smask : 0))
+#define data_idx(data) (int)((unsigned long)(data) & (0x0fffffff))
+#define data_is_head(data) (((unsigned long)(data) & hmask) ? 1 : 0)
+#define data_is_sparse(data) (((unsigned long)(data) & smask) ? 1 : 0)
+
+/*
+ * Print the command-line synopsis on stderr and terminate with status -1.
+ */
+static void
+usage(void)
+{
+	fputs("usage: io_optimize [-n num_runs] [-i num_iocbs] "
+	      "[-s num_secs] [-r random_seed]\n", stderr);
+	exit(-1);
+}
+
+/* allocation / release counters used by the self-test to detect leaks */
+static int xalloc_cnt, xfree_cnt;
+
+/*
+ * malloc() wrapper for the self-test: exits with ENOMEM on failure and
+ * counts every successful allocation.
+ */
+static inline char *
+xalloc(int size)
+{
+	char *mem = malloc(size);
+
+	if (mem == NULL) {
+		fprintf(stderr, "xalloc failed\n");
+		exit(ENOMEM);
+	}
+
+	++xalloc_cnt;
+	return mem;
+}
+
+/*
+ * free() wrapper for the self-test that counts every release.
+ */
+static inline void
+xfree(void *buf)
+{
+	++xfree_cnt;
+	free(buf);
+}
+
+/*
+ * Self-test helper: fill iocbs[0..num_iocbs) with random requests.
+ *
+ * Requests are generated in groups of "segs" iocbs that share an opcode
+ * and cover consecutive file offsets.  A group either shares one buffer
+ * of segs * nbytes bytes (memory-contiguous, hence mergeable) or, when
+ * sparse_mem is set, gets a separate allocation per iocb.  Each iocb is
+ * tagged through make_data() with its index, whether it leads its group
+ * and whether the group is sparse.
+ *
+ * The sequence of random() calls determines the generated workload, so
+ * their order must not change.
+ */
+static void
+randomize_iocbs(struct iocb **iocbs, int num_iocbs, int num_secs)
+{
+ int i, j;
+
+ i = 0;
+ while (i < num_iocbs) {
+ char *buf;
+ short type;
+ int segs, sparse_mem;
+ uint64_t offset, nbytes;
+
+ /* half reads, half writes; offset is a sector number times 512 */
+ type = (random() % 10 < 5 ? IO_CMD_PREAD : IO_CMD_PWRITE);
+ offset = ((random() % num_secs) << 9);
+
+ /* 40%: a single request of 1-7 sectors; else 1-10 requests of 4096 bytes */
+ if (random() % 10 < 4) {
+ segs = 1;
+ nbytes = (((random() % 7) + 1) << 9);
+ } else {
+ segs = (random() % 10) + 1;
+ nbytes = 4096;
+ }
+
+ /* clip the last group to the end of the array */
+ if (i + segs > num_iocbs)
+ segs = (num_iocbs - i);
+
+ sparse_mem = (random() % 10 < 2 ? 1 : 0);
+
+ if (sparse_mem)
+ buf = xalloc(nbytes);
+ else
+ buf = xalloc(segs * nbytes);
+
+ for (j = 0; j < segs; j++) {
+ struct iocb *io = iocbs[i + j];
+ io->aio_lio_opcode = type;
+ io->u.c.nbytes = nbytes;
+ io->u.c.offset = offset;
+ io->u.c.buf = buf;
+ offset += nbytes;
+
+ io->data = make_data(i + j, (j == 0), sparse_mem);
+
+ /* sparse: fresh buffer for the next iocb; otherwise advance
+ * inside the shared buffer */
+ if (j + 1 < segs && sparse_mem)
+ buf = xalloc(nbytes);
+ else
+ buf += nbytes;
+ }
+
+ i += segs;
+ }
+}
+
+/*
+ * Self-test helper: pretend that a random non-empty prefix of iocbs[]
+ * completed.  Each completion either reports the full byte count (80%)
+ * or zero.  Fills events[] and returns how many were produced.
+ */
+static int
+simulate_io(struct iocb **iocbs, struct io_event *events, int num_iocbs)
+{
+	int n;
+	int done = num_iocbs;
+
+	if (num_iocbs > 1)
+		done = (random() % (num_iocbs - 1)) + 1;
+
+	for (n = 0; n < done; n++) {
+		struct iocb *io = iocbs[n];
+		struct io_event *ev = &events[n];
+
+		ev->obj = io;
+		if (random() % 10 < 8)
+			ev->res = io->u.c.nbytes;
+		else
+			ev->res = 0;
+	}
+
+	return done;
+}
+
+/*
+ * Self-test helper: consume num split events.
+ *
+ * For every event the iocb's data tag must still encode the iocb's index
+ * in iocb_list, which proves io_split() restored the original data
+ * pointer; otherwise the test aborts.  Buffers are released for group
+ * heads and for sparse iocbs (the only iocbs owning an allocation), then
+ * the iocb is cleared.
+ */
+static inline void
+process_events(struct opioctx *ctx,
+	       struct iocb *iocb_list, struct io_event *events, int num)
+{
+	int i;
+	struct iocb *io;
+
+	for (i = 0; i < num; i++) {
+		int pos;
+
+		io  = events[i].obj;
+		/* pointer difference is ptrdiff_t; narrow it for "%d" */
+		pos = (int)(io - iocb_list);
+
+		print_iocb(ctx, io);
+		if (data_idx(io->data) != pos) {
+			printf("corrupt data! data_idx = %d, io = %d\n",
+			       data_idx(io->data), pos);
+			exit(-1);
+		}
+		if (data_is_head(io->data) || data_is_sparse(io->data))
+			xfree(io->u.c.buf);
+		memset(io, 0, sizeof(struct iocb));
+	}
+}
+
+/*
+ * Self-test helper: clear the iocb and event arrays and point iocbs[i]
+ * at iocb_list[i].
+ */
+static inline void
+init_optest(struct iocb *iocb_list,
+	    struct iocb **iocbs, struct io_event *events, int num)
+{
+	int n = 0;
+
+	memset(events, 0, num * sizeof(struct io_event));
+	memset(iocb_list, 0, num * sizeof(struct iocb));
+
+	while (n < num) {
+		iocbs[n] = iocb_list + n;
+		n++;
+	}
+}
+
+/*
+ * Self-test driver (built with -DTEST).
+ *
+ * Options: -n runs, -i iocbs per run, -s disk size in sectors,
+ *          -r random seed, -h help.
+ *
+ * Every run generates a random workload, merges it, then repeatedly
+ * simulates completions, splits them and validates the restored iocbs
+ * until all requests are accounted for.  A run fails (exit -1) when the
+ * number of buffer allocations and releases differ.
+ */
+int
+main(int argc, char **argv)
+{
+	uint64_t num_secs;
+	struct opioctx ctx;
+	struct io_event *events;
+	int i, c, num_runs, num_iocbs, seed;
+	struct iocb *iocb_list, **iocbs, **ioqueue;
+
+	num_runs  = 1;
+	num_iocbs = 300;
+	seed      = time(NULL);
+	num_secs  = ((4ULL << 20) >> 9); /* 8192 sectors: a 4MB disk */
+
+	while ((c = getopt(argc, argv, "n:i:s:r:h")) != -1) {
+		switch (c) {
+		case 'n':
+			num_runs = atoi(optarg);
+			break;
+		case 'i':
+			num_iocbs = atoi(optarg);
+			break;
+		case 's':
+			num_secs = strtoull(optarg, NULL, 10);
+			break;
+		case 'r':
+			seed = atoi(optarg);
+			break;
+		case 'h':
+			usage();	/* does not return */
+		case '?':
+			fprintf(stderr, "Unrecognized option: -%c\n", optopt);
+			usage();	/* does not return */
+		}
+	}
+
+	/* num_secs is used as a modulus and num_iocbs as an array size */
+	if (num_iocbs <= 0 || num_secs == 0) {
+		fprintf(stderr, "num_iocbs and num_secs must be positive\n");
+		usage();
+	}
+
+	printf("Running %d tests with %d iocbs on %" PRIu64
+	       " sectors, seed = %d\n",
+	       num_runs, num_iocbs, num_secs, seed);
+
+	/* the test draws from random(), which is seeded by srandom() */
+	srandom(seed);
+
+	iocb_list = malloc(num_iocbs * sizeof(struct iocb));
+	iocbs     = malloc(num_iocbs * sizeof(struct iocb *));
+	events    = malloc(num_iocbs * sizeof(struct io_event));
+
+	if (!iocb_list || !iocbs || !events || opio_init(&ctx, num_iocbs)) {
+		fprintf(stderr, "initialization failed\n");
+		exit(ENOMEM);
+	}
+
+	for (i = 0; i < num_runs; i++) {
+		int op_rem, op_done, num_split, num_events, num_done;
+
+		ioqueue = iocbs;
+		init_optest(iocb_list, ioqueue, events, num_iocbs);
+		randomize_iocbs(ioqueue, num_iocbs, num_secs);
+		print_iocbs(&ctx, ioqueue, num_iocbs);
+
+		op_done  = 0;
+		num_done = 0;
+		op_rem   = io_merge(&ctx, ioqueue, num_iocbs);
+		print_iocbs(&ctx, ioqueue, op_rem);
+		print_merged_iocbs(&ctx, ioqueue, op_rem);
+
+		/* op_* count merged requests, num_done counts originals */
+		while (num_done < num_iocbs) {
+			DBG(&ctx, "optimized remaining: %d\n", op_rem);
+
+			DBG(&ctx, "simulating\n");
+			num_events = simulate_io(ioqueue + op_done, events, op_rem);
+			print_events(&ctx, events, num_events);
+
+			DBG(&ctx, "splitting %d\n", num_events);
+			num_split = io_split(&ctx, events, num_events);
+			print_events(&ctx, events, num_split);
+
+			DBG(&ctx, "processing %d\n", num_split);
+			process_events(&ctx, iocb_list, events, num_split);
+
+			op_rem   -= num_events;
+			op_done  += num_events;
+			num_done += num_split;
+		}
+
+		DBG(&ctx, "run %d: processed: %d, xallocs: %d, xfrees: %d\n",
+		    i, num_done, xalloc_cnt, xfree_cnt);
+		if (xalloc_cnt != xfree_cnt)
+			exit(-1);
+		xalloc_cnt = xfree_cnt = 0;
+	}
+
+	free(iocbs);
+	free(events);
+	free(iocb_list);
+	opio_free(&ctx);
+
+	return 0;
+}
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __IO_OPTIMIZE_H__
+#define __IO_OPTIMIZE_H__
+
+#include <libaio.h>
+
+struct opio;
+
+/* First and last descriptor of a chain of merged requests. */
+struct opio_list {
+ struct opio *head;
+ struct opio *tail;
+};
+
+/*
+ * Per-iocb bookkeeping used while an iocb is owned by the optimizer.
+ * buf, nbytes, offset and data hold the iocb's original values so they
+ * can be restored after the merged request completes.
+ */
+struct opio {
+ /* original iocb->u.c.buf */
+ char *buf;
+ /* original iocb->u.c.nbytes */
+ unsigned long nbytes;
+ /* original iocb->u.c.offset */
+ long long offset;
+ /* original iocb->data (the submitter's cookie) */
+ void *data;
+ /* the iocb this descriptor is attached to */
+ struct iocb *iocb;
+ /* NOTE(review): not referenced by io-optimize.c - confirm it is still needed */
+ struct io_event event;
+ /* descriptor of the request this one was merged into */
+ struct opio *head;
+ /* next descriptor in the merged chain, NULL at the end */
+ struct opio *next;
+ /* chain head/tail; maintained on the leading descriptor */
+ struct opio_list list;
+};
+
+/* Optimizer state: a fixed pool of descriptors plus scratch queues. */
+struct opioctx {
+ /* number of entries in opios[] and in each queue below */
+ int num_opios;
+ /* number of valid entries on the free_opios[] stack */
+ int free_opio_cnt;
+ /* descriptor pool */
+ struct opio *opios;
+ /* stack of pointers to currently unused descriptors */
+ struct opio **free_opios;
+ /* scratch copy of the caller's iocb queue */
+ struct iocb **iocb_queue;
+ /* scratch copy of the caller's event array */
+ struct io_event *event_queue;
+};
+
+/* Allocate pool and queues for up to num_iocbs requests; 0 or -ENOMEM. */
+int opio_init(struct opioctx *ctx, int num_iocbs);
+/* Release everything opio_init() allocated. */
+void opio_free(struct opioctx *ctx);
+/* Merge contiguous requests in queue in place; returns the new length. */
+int io_merge(struct opioctx *ctx, struct iocb **queue, int num);
+/* Expand completions of merged requests in place; returns the new count. */
+int io_split(struct opioctx *ctx, struct io_event *events, int num);
+/* Undo merges for queue[idx..num) in place; returns the new length. */
+int io_expand_iocbs(struct opioctx *ctx, struct iocb **queue, int idx, int num);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2010, XenSource Inc.
+ * All rights reserved.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * kernel 2.6.21 added eventfd(2) support, kernel 2.6.22 eventfds for
+ * aio. libaio 0.3.107 updated the header file, but few systems have
+ * it. define a custom iocb_common struct instead, and work around a
+ * potentially missing sys/eventfd.h. this header should vanish over
+ * time.
+ */
+
+#ifndef __LIBAIO_COMPAT
+#define __LIBAIO_COMPAT
+
+#include "../../config.h"
+#include <libaio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+/*
+ * Overlay for iocb->u.c that exposes the flags and resfd members which,
+ * per the note above, older libaio headers do not declare.  The two
+ * padding arrays stand in for the 8-byte buf and nbytes slots.
+ */
+struct __compat_io_iocb_common {
+ char __pad_buf[8];
+ char __pad_nbytes[8];
+ long long offset;
+ long long __pad3;
+ unsigned flags;
+ unsigned resfd;
+};
+
+/*
+ * Ask for completion of iocb to be signalled on eventfd: stores the
+ * descriptor in resfd and sets flag bit 0 (IOCB_FLAG_RESFD in the kernel
+ * AIO ABI) through the compat overlay.
+ */
+static inline void __io_set_eventfd(struct iocb *iocb, int eventfd)
+{
+	struct __compat_io_iocb_common *compat =
+		(struct __compat_io_iocb_common *)&iocb->u.c;
+
+	compat->resfd  = eventfd;
+	compat->flags |= (1 << 0);
+}
+
+#ifdef HAVE_SYS_EVENTFD_H
+
+#include <sys/eventfd.h>
+
+/* Create an eventfd with the given initial counter and no flags. */
+static inline int tapdisk_sys_eventfd(int initval)
+{
+ return eventfd(initval, 0);
+}
+
+#else /* Fallback */
+/*
+ * Provide SYS_eventfd when the system headers do not.  If __NR_eventfd is
+ * already known it is used as is; otherwise it is taken from the
+ * per-architecture table below, and the build stops with an explicit
+ * error for architectures the table does not cover.
+ */
+#ifndef SYS_eventfd
+#ifndef __NR_eventfd
+# if defined(__alpha__)
+#  define __NR_eventfd		478
+# elif defined(__arm__)
+#  define __NR_eventfd		(__NR_SYSCALL_BASE+351)
+# elif defined(__ia64__)
+#  define __NR_eventfd		1309
+# elif defined(__i386__)
+#  define __NR_eventfd		323
+# elif defined(__m68k__)
+#  define __NR_eventfd		319
+# elif 0 && defined(__mips__)
+#  error __NR_eventfd?
+#  define __NR_eventfd		(__NR_Linux + 319)
+#  define __NR_eventfd		(__NR_Linux + 278)
+#  define __NR_eventfd		(__NR_Linux + 282)
+# elif defined(__hppa__)
+#  define __NR_eventfd		(__NR_Linux + 304)
+# elif defined(__PPC__) || defined(__powerpc64__)
+#  define __NR_eventfd		307
+# elif defined(__s390__) || defined(__s390x__)
+#  define __NR_eventfd		318
+# elif defined(__sparc__)
+#  define __NR_eventfd		313
+# elif defined(__x86_64__)
+#  define __NR_eventfd		284
+# else
+#  error __NR_eventfd?
+# endif
+#endif
+#define SYS_eventfd __NR_eventfd
+#endif
+
+/* Fallback without <sys/eventfd.h>: invoke the eventfd system call directly. */
+static inline int tapdisk_sys_eventfd(int initval)
+{
+ return syscall(SYS_eventfd, initval, 0);
+}
+#endif
+
+#endif /* __LIBAIO_COMPAT */
--- /dev/null
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This module implements a "dot locking" style advisory file locking algorithm.
+ */
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <dirent.h>
+#include <limits.h>
+#include "lock.h"
+
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+/* format: xenlk.hostname.uuid.<xf><rw>*/
+#define LF_POSTFIX ".xenlk"
+#define LFXL_FORMAT LF_POSTFIX ".%s.%s.x%s"
+#define LFFL_FORMAT LF_POSTFIX ".%s.%s.f%s"
+#define RETRY_MAX 16
+
+/*
+ * LOG(fmt, ...): with LOGS defined, print the source line number followed
+ * by the formatted message; otherwise expand to nothing.  The logging
+ * variant is wrapped in do/while(0) so it behaves as a single statement,
+ * e.g. as the body of an unbraced if.
+ */
+#if defined(LOGS)
+#define LOG(format, args...) \
+	do { printf("%d: ", __LINE__); printf(format, ## args); } while (0)
+#else
+#define LOG(format, args...)
+#endif
+
+/* random wait - up to .5 seconds */
+#define XSLEEP usleep(random() & 0x7ffff)
+
+typedef int (*eval_func)(char *name, int readonly);
+
+/*
+ * Build the name of the exclusive lock file: fn_to_lock followed by
+ * LF_POSTFIX.  Returns a malloc()ed string the caller must free, or 0
+ * when allocation fails.
+ */
+static char *create_lockfn(char *fn_to_lock)
+{
+	size_t base_len = strlen(fn_to_lock);
+	char *lockfn = malloc(base_len + strlen(LF_POSTFIX) + 1);
+
+	if (unlikely(!lockfn))
+		return 0;
+
+	/* file name first, then the postfix right behind it */
+	strcpy(lockfn, fn_to_lock);
+	strcpy(lockfn + base_len, LF_POSTFIX);
+
+	return lockfn;
+}
+
+/*
+ * Build a per-owner lock file name:
+ *     <fn_to_lock>.xenlk.<hostname>.<uuid>.<x|f><r|w>
+ * where format (LFXL_FORMAT or LFFL_FORMAT) selects 'x' or 'f' and
+ * readonly selects 'r' or 'w'.
+ *
+ * Returns a malloc()ed string the caller must free, or 0 if the host name
+ * cannot be obtained or memory is exhausted.
+ */
+static char *create_lockfn_link(char *fn_to_lock, char *format,
+                                char *uuid, int readonly)
+{
+	char hostname[128];
+	char *lockfn_link;
+	char *ptr;
+
+	/* get hostname */
+	if (unlikely(gethostname(hostname, sizeof(hostname)) == -1)) {
+		return 0;
+	}
+	/* a truncated name is not guaranteed to be NUL-terminated */
+	hostname[sizeof(hostname) - 1] = '\0';
+
+	/*
+	 * Besides the four strings the format adds two '.' separators, the
+	 * ".x"/".f" marker, the mode letter and the terminator (6 bytes);
+	 * 8 leaves a little slack.
+	 */
+	lockfn_link = malloc(strlen(fn_to_lock) + strlen(LF_POSTFIX) +
+	                     strlen(hostname) + strlen(uuid) + 8);
+	if (unlikely(!lockfn_link)) {
+		return 0;
+	}
+
+	/* construct lock file link with specific format */
+	strcpy(lockfn_link, fn_to_lock);
+	ptr = lockfn_link + strlen(lockfn_link);
+	sprintf(ptr, format, hostname, uuid, readonly ? "r" : "w");
+
+	return lockfn_link;
+}
+
+/*
+ * Obtain "now" as seen by the file system that holds fn: create a
+ * scratch file "<fn>.xenNNNNNNNN.tmp" next to it, lstat() it into
+ * *statnow and remove it again.
+ *
+ * Returns LOCK_OK on success, LOCK_ENOMEM, LOCK_EOPEN or LOCK_ESTAT on
+ * failure.  *reterrno receives the errno of a failed open()/lstat() and
+ * is 0 otherwise.
+ */
+static int NFSnormalizedStatTime(char *fn, struct stat *statnow, int *reterrno)
+{
+	int result = LOCK_OK;
+	int uniq;
+	char *buf;
+	int fd;
+	int pid = (int)getpid();
+	int clstat;
+
+	*reterrno = 0;
+
+	/* create file to normalize time */
+	srandom((int)time(0) ^ pid);
+	uniq = random() % 0xffffff;
+	buf = malloc(strlen(fn) + 24);
+	if (unlikely(!buf)) { result = LOCK_ENOMEM; goto finish; }
+
+	strcpy(buf, fn);
+	sprintf(buf + strlen(buf), ".xen%08d.tmp", uniq);
+
+	fd = open(buf, O_WRONLY | O_CREAT, 0644);
+	if (fd == -1) { *reterrno = errno; result = LOCK_EOPEN; goto finish; }
+	clstat = close(fd);
+	if (unlikely(clstat == -1)) {
+		LOG("fail on close\n");
+	}
+	if (lstat(buf, statnow) == -1) {
+		/* capture errno before unlink() can overwrite it */
+		*reterrno = errno;
+		unlink(buf);
+		result = LOCK_ESTAT;
+		goto finish;
+	}
+	unlink(buf);
+
+finish:
+	free(buf);	/* NULL when the allocation itself failed */
+	return result;
+}
+
+/*
+ * Lock-file predicate: true when name ends in 'w', i.e. the file is a
+ * writer lock.  readonly is ignored.  An empty name never matches.
+ */
+static int writer_eval(char *name, int readonly)
+{
+	size_t len = strlen(name);
+
+	(void)readonly;
+	return len > 0 && name[len - 1] == 'w';
+}
+
+/*
+ * Lock-file predicate: true when name ends in 'r' (a reader lock) and
+ * the requester is not itself read-only.  An empty name never matches.
+ */
+static int reader_eval(char *name, int readonly)
+{
+	size_t len = strlen(name);
+
+	return len > 0 && name[len - 1] == 'r' && !readonly;
+}
+
+/*
+ * Scan the directory containing lockfn for final lock files that belong
+ * to fn, i.e. entries whose name starts with fn's base name and is none
+ * of fn, lockfn or lockfn_link.
+ *
+ *  - The first such file that can be parsed provides the established
+ *    lease time (the number after the last '.' of its content) in *elt.
+ *  - With force set, every such file is unlinked, *stole is set to 1 and
+ *    *elt is reset to 0.
+ *  - Without force, eval() decides whether the entry counts as a
+ *    conflicting holder.
+ *
+ * Returns 1 when a conflicting holder exists or an error occurred (then
+ * *ioerror is non-zero), 0 otherwise.
+ */
+static int lock_holder(char *fn, char *lockfn, char *lockfn_link,
+                       int force, int readonly, int *stole, eval_func eval,
+                       int *elt, int *ioerror)
+{
+	int status = 0;
+	int ustat;
+	DIR *pd = 0;
+	struct dirent *dptr;
+	char *ptr;
+	/* room for a proper prefix of lockfn, "." or "/" */
+	char *dirname = malloc(strlen(lockfn) + 2);
+	char *uname = 0;
+	size_t uname_size = 0;
+	int elt_established = 0;
+	int fd;
+	char tmpbuf[4096];
+	char *fn_base, *lockfn_base, *link_base;
+
+	*stole = 0;
+	*ioerror = 0;
+	*elt = 0;
+
+	/* without memory the scan cannot run: report an error, not "free" */
+	if (!dirname) {
+		*ioerror = ENOMEM;
+		goto finish;
+	}
+
+	/* get directory */
+	ptr = strrchr(lockfn, '/');
+	if (!ptr) {
+		strcpy(dirname, ".");
+	} else if (ptr == lockfn) {
+		/* lock file lives directly under the root directory */
+		strcpy(dirname, "/");
+	} else {
+		size_t numbytes = ptr - lockfn;
+		memcpy(dirname, lockfn, numbytes);
+		dirname[numbytes] = '\0';
+	}
+
+	/* base names the directory entries are compared against */
+	ptr = strrchr(fn, '/');
+	fn_base = ptr ? ptr + 1 : fn;
+	ptr = strrchr(lockfn, '/');
+	lockfn_base = ptr ? ptr + 1 : lockfn;
+	ptr = strrchr(lockfn_link, '/');
+	link_base = ptr ? ptr + 1 : lockfn_link;
+
+	pd = opendir(dirname);
+	if (!pd) {
+		*ioerror = errno ? errno : EIO;
+		goto finish;
+	}
+
+	/*
+	 * Scan through the directory entries and use the eval function on
+	 * every match (i.e. reader or writer lock).  Note that when
+	 * forcing, all locks found for the target are removed, regardless
+	 * of whether a reader or a writer owns them.
+	 *
+	 * NOTE(review): errno is cleared only once, so an errno left behind
+	 * by a failed unlink() below is still reported as EIO when the scan
+	 * ends.  Kept as is - confirm whether that is intended.
+	 */
+	errno = 0;
+	dptr = readdir(pd);
+	if (!dptr) {
+		*ioerror = EIO;
+	}
+	while (dptr) {
+		if (strcmp(dptr->d_name, fn_base) &&
+		    strcmp(dptr->d_name, lockfn_base) &&
+		    strcmp(dptr->d_name, link_base) &&
+		    !strncmp(dptr->d_name, fn_base, strlen(fn_base))) {
+			/* entry names are arbitrary: size the path per entry */
+			size_t need = strlen(dirname) + 1 +
+			              strlen(dptr->d_name) + 1;
+
+			if (need > uname_size) {
+				char *grown = realloc(uname, need);
+				if (!grown) {
+					*ioerror = ENOMEM;
+					status = 1;
+					goto finish;
+				}
+				uname = grown;
+				uname_size = need;
+			}
+			snprintf(uname, uname_size, "%s/%s",
+			         dirname, dptr->d_name);
+
+			if (!elt_established) {
+				/* read final lock file and extract lease time */
+				fd = open(uname, O_RDONLY);
+				if (fd == -1) {
+					*ioerror = errno ? errno : EIO;
+					status = 1;
+					goto finish;
+				}
+				memset(tmpbuf, 0, sizeof(tmpbuf));
+				/* keep the last byte zero for strrchr() */
+				if (read(fd, tmpbuf, sizeof(tmpbuf) - 1) < 0) {
+					*ioerror = errno;
+					status = 1;
+					close(fd);
+					goto finish;
+				}
+				close(fd);
+				ptr = strrchr(tmpbuf, '.');
+				if (ptr) {
+					*elt = atoi(ptr+1);
+					elt_established = 1;
+				}
+			}
+			if (force) {
+				ustat = unlink(uname);
+				if (ustat == -1) {
+					LOG("failed to unlink %s\n", uname);
+				}
+				*stole = 1;
+				*elt = 0;
+			} else {
+				if ((*eval)(dptr->d_name, readonly)) {
+					status = 1;
+					goto finish;
+				}
+			}
+		}
+		dptr = readdir(pd);
+		if (!dptr && errno) {
+			*ioerror = EIO;
+		}
+	}
+
+finish:
+	/* single exit: the directory stream is closed on every path */
+	if (pd)
+		closedir(pd);
+	free(dirname);
+	free(uname);
+
+	/* if IO error, force a taken status */
+	return (*ioerror) ? 1 : status;
+}
+
+/*
+ * Acquire (or reassert) a reader or writer lock on fn_to_lock for the
+ * owner identified by uuid.
+ *
+ * Phase 1 takes a short-lived exclusive lock: "<fn>.xenlk" is created
+ * with O_EXCL, the owner's xlink name is written into it and a hard link
+ * plus inode comparison confirms ownership.  Attempts are retried with a
+ * random back-off; an exclusive lock older than DEFAULT_LEASE_TIME_SECS
+ * is removed once the retries are exhausted.
+ *
+ * Phase 2, under the exclusive lock, looks for final lock files of other
+ * owners via lock_holder() (stealing them when force is set), then writes
+ * or refreshes this owner's final lock file, which records the lease
+ * time.  The exclusive lock is removed before returning.
+ *
+ * force       - steal conflicting locks; after a steal the call sleeps
+ *               for *lease_time seconds.
+ * readonly    - request a reader lock instead of a writer lock.
+ * lease_time  - in/out; replaced by an already established lease time
+ *               when one is found, set to -1 when *retstatus is negative.
+ * retstatus   - out; a LOCK_* code, or 1 when an existing lock of this
+ *               owner was reasserted.
+ *
+ * Returns 0 or an errno value.
+ */
+int lock(char *fn_to_lock, char *uuid, int force, int readonly, int *lease_time, int *retstatus)
+{
+	char *lockfn = 0;
+	char *lockfn_xlink = 0;
+	char *lockfn_flink = 0;
+	char *buf = 0;
+	int fd;
+	int status = 0;
+	struct stat stat1, stat2;
+	int retry_attempts = 0;
+	int clstat;
+	int tmpstat;
+	int stealx = 0;
+	int stealw = 0;
+	int stealr = 0;
+	int established_lease_time = 0;
+	char tmpbuf[4096];
+	int ioerr;
+
+	if (!fn_to_lock || !uuid) {
+		*retstatus = LOCK_EBADPARM;
+		return EINVAL;
+	}
+
+	*retstatus = 0;
+
+	/* seed random with time/pid combo */
+	srandom((int)time(0) ^ getpid());
+
+	/* build lock file strings */
+	lockfn = create_lockfn(fn_to_lock);
+	if (unlikely(!lockfn)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; }
+
+	lockfn_xlink = create_lockfn_link(fn_to_lock, LFXL_FORMAT,
+	                                  uuid, readonly);
+	if (unlikely(!lockfn_xlink)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; }
+
+	lockfn_flink = create_lockfn_link(fn_to_lock, LFFL_FORMAT, uuid,
+	                                  readonly);
+	if (unlikely(!lockfn_flink)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; }
+
+try_again:
+	if (retry_attempts++ > RETRY_MAX) {
+		if (*retstatus == LOCK_EXLOCK_OPEN) {
+			struct stat statnow, stat_exlock;
+			int diff;
+
+			if (lstat(lockfn, &stat_exlock) == -1) {
+				goto finish;
+			}
+
+			if (NFSnormalizedStatTime(fn_to_lock, &statnow, &ioerr)) {
+				goto finish;
+			}
+
+			/* break an exclusive lock that outlived the lease */
+			diff = (int)statnow.st_mtime - (int)stat_exlock.st_mtime;
+			if (diff > DEFAULT_LEASE_TIME_SECS) {
+				unlink(lockfn);
+				retry_attempts = 0;
+				goto try_again;
+			}
+		}
+		goto finish;
+	}
+
+	/* try to open exclusive lockfile */
+	fd = open(lockfn, O_WRONLY | O_CREAT | O_EXCL, 0644);
+	if (fd == -1) {
+		LOG("Initial lockfile creation failed %s force=%d, errno=%d\n",
+		    lockfn, force, errno);
+		if (errno == EIO) {
+			*retstatus = LOCK_EXLOCK_OPEN;
+			status = EIO;
+			goto finish;
+		}
+		/* already owned? (hostname & uuid match, skip time bits) */
+		errno = 0;
+		fd = open(lockfn, O_RDWR, 0644);
+		if (fd != -1) {
+			buf = malloc(strlen(lockfn_xlink)+1);
+			if (!buf) {
+				clstat = close(fd);
+				if (unlikely(clstat == -1)) {
+					LOG("fail on close\n");
+				}
+				*retstatus = LOCK_ENOMEM;
+				status = ENOMEM;
+				goto finish;
+			}
+			if (read(fd, buf, strlen(lockfn_xlink)) !=
+			    (strlen(lockfn_xlink))) {
+				clstat = close(fd);
+				if (unlikely(clstat == -1)) {
+					LOG("fail on close\n");
+				}
+				free(buf);
+				goto force_lock;
+			}
+			if (!strncmp(buf, lockfn_xlink, strlen(lockfn_xlink)-1)) {
+				LOG("lock owned by us, reasserting\n");
+				/* our lock, reassert by rewriting below */
+				if (lseek(fd, 0, SEEK_SET) == -1) {
+					clstat = close(fd);
+					if (unlikely(clstat == -1)) {
+						LOG("fail on close\n");
+					}
+					/* buf is not needed on the retry path */
+					free(buf);
+					goto force_lock;
+				}
+				free(buf);
+				goto skip;
+			}
+			free(buf);
+			clstat = close(fd);
+			if (unlikely(clstat == -1)) {
+				LOG("fail on close\n");
+			}
+		}
+force_lock:
+		if (errno == EIO) {
+			*retstatus = LOCK_EXLOCK_OPEN;
+			status = EIO;
+			goto finish;
+		}
+		if (force) {
+			/* remove lock file, we are forcing lock, try again */
+			status = unlink(lockfn);
+			if (unlikely(status == -1)) {
+				if (errno == EIO) {
+					*retstatus = LOCK_EXLOCK_OPEN;
+					status = EIO;
+					goto finish;
+				}
+				LOG("force removal of %s lockfile failed, "
+				    "errno=%d, trying again\n", lockfn, errno);
+			}
+			stealx = 1;
+		}
+		XSLEEP;
+		*retstatus = LOCK_EXLOCK_OPEN;
+		goto try_again;
+	}
+
+	LOG("lockfile created %s\n", lockfn);
+
+skip:
+	/*
+	 * write into the temporary xlock
+	 */
+	if (write(fd, lockfn_xlink, strlen(lockfn_xlink)) !=
+	    strlen(lockfn_xlink)) {
+		if (errno == EIO) {
+			*retstatus = LOCK_EXLOCK_WRITE;
+			status = EIO;
+			/* do not leak the descriptor on this exit */
+			close(fd);
+			goto finish;
+		}
+		status = errno;
+		clstat = close(fd);
+		if (unlikely(clstat == -1)) {
+			LOG("fail on close\n");
+		}
+		XSLEEP;
+		*retstatus = LOCK_EXLOCK_WRITE;
+		if (unlink(lockfn) == -1) {
+			LOG("removal of %s lockfile failed, "
+			    "errno=%d, trying again\n", lockfn, errno);
+		}
+		goto try_again;
+	}
+	clstat = close(fd);
+	if (unlikely(clstat == -1)) {
+		LOG("fail on close\n");
+	}
+
+	while (retry_attempts++ < RETRY_MAX) {
+		tmpstat = link(lockfn, lockfn_xlink);
+		LOG("linking %s and %s\n", lockfn, lockfn_xlink);
+		if ((tmpstat == -1) && (errno != EEXIST)) {
+			LOG("link status is %d, errno=%d\n", tmpstat, errno);
+		}
+
+		if ((lstat(lockfn, &stat1) == -1) ||
+		    (lstat(lockfn_xlink, &stat2) == -1)) {
+			/* try again, cleanup first */
+			tmpstat = unlink(lockfn);
+			if (unlikely(tmpstat == -1)) {
+				LOG("error removing lock file %s", lockfn);
+			}
+			tmpstat = unlink(lockfn_xlink);
+			if (unlikely(tmpstat == -1)) {
+				LOG("error removing linked lock file %s",
+				    lockfn_xlink);
+			}
+			XSLEEP;
+			status = LOCK_ESTAT;
+			goto finish;
+		}
+
+		/* compare inodes */
+		if (stat1.st_ino == stat2.st_ino) {
+			/* success, inodes are the same */
+			/* should we check that st_nlink's are also 2?? */
+			*retstatus = LOCK_OK;
+			status = 0;
+			tmpstat = unlink(lockfn_xlink);
+			if (unlikely(tmpstat == -1)) {
+				LOG("error removing linked lock file %s",
+				    lockfn_xlink);
+			}
+			goto finish;
+		} else {
+			status = errno;
+			/* try again, cleanup first */
+			tmpstat = unlink(lockfn);
+			if (unlikely(tmpstat == -1)) {
+				LOG("error removing lock file %s", lockfn);
+			}
+			tmpstat = unlink(lockfn_xlink);
+			if (unlikely(tmpstat == -1)) {
+				LOG("error removing linked lock file %s",
+				    lockfn_xlink);
+			}
+			XSLEEP;
+			*retstatus = LOCK_EINODE;
+			goto try_again;
+		}
+	}
+
+finish:
+	if (!*retstatus) {
+
+		/* we have exclusive lock */
+
+		status = 0;
+
+		/* fast check, see if we own a final lock and are reasserting */
+		if (!lstat(lockfn_flink, &stat1)) {
+			char *ptr;
+
+			/* set the return value to notice this is a reassert */
+			*retstatus = 1;
+
+			/* read existing lock file and extract
+			   established lease time */
+			fd = open(lockfn_flink, O_RDONLY, 0644);
+			memset(tmpbuf, 0, sizeof(tmpbuf));
+			/* keep the last byte zero for strrchr() */
+			if (read(fd, tmpbuf, sizeof(tmpbuf) - 1) < 0) {
+				if (errno == EIO) {
+					close(fd);
+					*retstatus = LOCK_EINODE;
+					status = EIO;
+					goto skip_scan;
+				}
+			}
+			close(fd);
+			ptr = strrchr(tmpbuf, '.');
+			if (ptr) {
+				*lease_time = atoi(ptr+1);
+			} else {
+				*lease_time = 10; /* wkchack */
+			}
+			goto skip_scan;
+		} else {
+			if (errno == EIO) {
+				*retstatus = LOCK_EINODE;
+				status = EIO;
+				goto skip_scan;
+			}
+		}
+
+		/* we allow exclusive writer, or multiple readers */
+		if (lock_holder(fn_to_lock, lockfn, lockfn_flink, force,
+		                readonly, &stealw, writer_eval,
+		                &established_lease_time, &ioerr)) {
+			if (ioerr) {
+				*retstatus = LOCK_EREAD;
+				status = ioerr;
+				goto skip_scan;
+			}
+			*retstatus = LOCK_EHELD_WR;
+		} else if (lock_holder(fn_to_lock, lockfn, lockfn_flink, force,
+		                       readonly, &stealr, reader_eval,
+		                       &established_lease_time, &ioerr)) {
+			if (ioerr) {
+				*retstatus = LOCK_EREAD;
+				status = ioerr;
+				goto skip_scan;
+			}
+			*retstatus = LOCK_EHELD_RD;
+		}
+		if (established_lease_time) *lease_time =
+			established_lease_time;
+	}
+
+skip_scan:
+	if (*retstatus >= 0) {
+		/* update file, changes last modify time */
+		fd = open(lockfn_flink, O_WRONLY | O_CREAT, 0644);
+		if (fd == -1) {
+			*retstatus = LOCK_EOPEN;
+			status = errno;
+		} else {
+			char tmpbuf[32];
+			int failed_write;
+			memset(tmpbuf, 0, sizeof(tmpbuf));
+			sprintf(tmpbuf, ".%d", *lease_time);
+			failed_write = write(fd, lockfn_flink,
+			                     strlen(lockfn_flink)) !=
+			               strlen(lockfn_flink);
+			if (failed_write) status = errno;
+			failed_write |= write(fd, tmpbuf, strlen(tmpbuf)) !=
+			                strlen(tmpbuf);
+			if (failed_write) status = errno;
+			if (failed_write) {
+				clstat = close(fd);
+				if (unlikely(clstat == -1)) {
+					LOG("fail on close\n");
+				}
+				XSLEEP;
+				*retstatus = LOCK_EUPDATE;
+				goto try_again;
+			}
+		}
+		clstat = close(fd);
+		if (unlikely(clstat == -1)) {
+			LOG("fail on close\n");
+		}
+	}
+
+	if (!*retstatus && force && (stealx || stealw || stealr)) {
+		struct timeval timeout;
+
+		/* enforce quiet time on steal */
+		timeout.tv_sec = *lease_time;
+		timeout.tv_usec = 0;
+		select(0, 0, 0, 0, &timeout);
+	}
+
+	/* remove exclusive lock, final read/write locks will hold */
+	if (lockfn) {	/* NULL when building the name failed */
+		tmpstat = unlink(lockfn);
+		if (unlikely(tmpstat == -1)) {
+			LOG("error removing exclusive lock file %s",
+			    lockfn);
+		}
+	}
+
+	free(lockfn);
+	free(lockfn_xlink);
+	free(lockfn_flink);
+
+	/* set lease time to -1 if error, so no one is apt to use it */
+	if (*retstatus < 0) *lease_time = -1;
+
+	LOG("returning status %d, errno=%d\n", status, errno);
+	return status;
+}
+
+
+/*
+ * unlock - drop the final lock link that @uuid holds on @fn_to_unlock.
+ *
+ * @fn_to_unlock: path of the file that was locked
+ * @uuid:         identity that took the lock
+ * @readonly:     non-zero if the lock being released is a reader lock
+ * @status:       receives LOCK_OK, LOCK_EBADPARM, LOCK_ENOMEM or
+ *                LOCK_ENOLOCK (the link could not be removed)
+ *
+ * Returns 0, or the errno reported by unlink() when removal fails.
+ */
+int unlock(char *fn_to_unlock, char *uuid, int readonly, int *status)
+{
+    char *lockfn_link;
+    int reterrno = 0;
+
+    if (!fn_to_unlock || !uuid) {
+        *status = LOCK_EBADPARM;
+        return 0;
+    }
+
+    lockfn_link = create_lockfn_link(fn_to_unlock, LFFL_FORMAT, uuid,
+                                     readonly);
+    if (unlikely(!lockfn_link)) {
+        *status = LOCK_ENOMEM;
+        return 0;
+    }
+
+    if (unlink(lockfn_link) == -1) {
+        /* save errno first: logging may perform calls that overwrite it */
+        reterrno = errno;
+        LOG("error removing linked lock file %s", lockfn_link);
+        *status = LOCK_ENOLOCK;
+    } else {
+        *status = LOCK_OK;
+    }
+
+    free(lockfn_link);
+    return reterrno;
+}
+
+/*
+ * lock_delta - report how recently any lock on @fn was refreshed.
+ *
+ * Scans the directory containing @fn for entries whose name starts with
+ * (but is not equal to) the file's base name, and compares each entry's
+ * mtime against the "now" timestamp produced by NFSnormalizedStatTime().
+ *
+ * @fn:        path of the file whose locks are inspected
+ * @ret_lease: receives the smallest age in seconds found, or a negative
+ *             lock_error (LOCK_ENOLOCK when no lock file was seen)
+ * @max_lease: receives the lease time parsed from the text after the last
+ *             '.' in the first readable lock file; forced to -1 on error
+ *
+ * Returns 0 or an errno value from the failing system call.
+ */
+int lock_delta(char *fn, int *ret_lease, int *max_lease)
+{
+    int reterrno = 0;
+    int result = INT_MAX;
+    int elt_established = 0;
+    DIR *pd = NULL;
+    struct dirent *dptr;
+    struct stat statbuf, statnow;
+    char *dirname;
+    char *base;
+    char *slash;
+    size_t baselen;
+    char tmpbuf[4096];
+
+    /* validate before touching fn: strlen(NULL) is undefined */
+    if (!fn) {
+        *ret_lease = LOCK_EBADPARM;
+        *max_lease = -1;
+        return 0;
+    }
+
+    /* +2: room for "." or "/" plus the terminator even when fn is 1 char */
+    dirname = malloc(strlen(fn) + 2);
+    if (!dirname) {
+        *ret_lease = LOCK_ENOMEM;
+        *max_lease = -1;
+        return 0;
+    }
+
+    if (NFSnormalizedStatTime(fn, &statnow, &reterrno)) {
+        result = LOCK_ESTAT;
+        goto finish;
+    }
+
+    /* split fn into directory and base name */
+    slash = strrchr(fn, '/');
+    if (!slash) {
+        strcpy(dirname, ".");
+        base = fn;
+    } else if (slash == fn) {
+        /* file lives directly under the root directory */
+        strcpy(dirname, "/");
+        base = slash + 1;
+    } else {
+        size_t numbytes = (size_t)(slash - fn);
+
+        memcpy(dirname, fn, numbytes);
+        dirname[numbytes] = '\0';
+        base = slash + 1;
+    }
+    baselen = strlen(base);
+
+    pd = opendir(dirname);
+    if (!pd) {
+        reterrno = errno;
+        goto finish;
+    }
+
+    while ((dptr = readdir(pd)) != NULL) {
+        char *fpath;
+        int diff;
+
+        /* only entries that extend the base name are lock files */
+        if (!strcmp(dptr->d_name, base) ||
+            strncmp(dptr->d_name, base, baselen))
+            continue;
+
+        fpath = malloc(strlen(dirname) + strlen(dptr->d_name) + 2);
+        if (!fpath) {
+            result = LOCK_ENOMEM;
+            goto finish;
+        }
+        sprintf(fpath, "%s/%s", dirname, dptr->d_name);
+
+        if (lstat(fpath, &statbuf) == -1) {
+            reterrno = errno;
+            free(fpath);
+            goto finish;
+        }
+
+        /* clamp: the lock may have been refreshed after "now" was taken */
+        diff = (int)statnow.st_mtime - (int)statbuf.st_mtime;
+        if (diff < 0)
+            diff = 0;
+        if (diff < result)
+            result = diff;
+
+        if (!elt_established) {
+            /* read the lock file and extract the lease time */
+            int fd = open(fpath, O_RDONLY);
+
+            if (fd != -1) {
+                char *dotptr;
+                ssize_t got = read(fd, tmpbuf, sizeof(tmpbuf) - 1);
+
+                close(fd);
+                /* always terminate, even on error or a full read */
+                tmpbuf[got > 0 ? got : 0] = '\0';
+                dotptr = strrchr(tmpbuf, '.');
+                if (dotptr) {
+                    *max_lease = atoi(dotptr + 1);
+                    elt_established = 1;
+                }
+            }
+        }
+
+        free(fpath);
+    }
+
+finish:
+    if (pd)
+        closedir(pd);
+    free(dirname);
+
+    /* returns smallest lock time, or error */
+    if (result == INT_MAX)
+        result = LOCK_ENOLOCK;
+
+    /* set lease time to -1 if error, so no one is apt to use it */
+    if ((result < 0) || reterrno)
+        *max_lease = -1;
+    *ret_lease = result;
+    return reterrno;
+}
+
+#if defined(TEST)
+/*
+ * the following is for sanity testing.
+ */
+
+/* Print the command summary of the TEST build; @prg is the program name. */
+static void usage(char *prg)
+{
+    printf("usage %s\n"
+           " d|t|r <filename>\n"
+           " p <filename> [num iterations]\n"
+           " u <filename> [0|1] [<uniqid>]\n"
+           " l <filename> [0|1] [0|1] [<uniqid>] [<leasetime>]\n", prg);
+    printf(" p : perf test lock take and reassert\n");
+    printf(" d : delta lock time\n");
+    printf(" t : test the file (after random locks)\n");
+    printf(" r : random lock tests (must ^C)\n");
+    printf(" u : unlock, readonly? uniqID (default is PID)\n");
+    printf(" l : lock, readonly? force?, uniqID (default is PID), lease time\n");
+}
+
+/*
+ * Verify a file written by random_locks(): each record is
+ * "count pid time" and counts must increase by one starting at 0.
+ * Every gap is logged; parsing stops at the first malformed record.
+ */
+static void test_file(char *fn)
+{
+    FILE *fptr;
+    int expected = 0;
+    int count, pid, stamp;
+
+    fptr = fopen(fn, "r");
+    if (!fptr) {
+        LOG("ERROR on file %s open, errno=%d\n", fn, errno);
+        return;
+    }
+
+    /* stop on a short match so uninitialised values are never compared */
+    while (fscanf(fptr, "%d %d %d\n", &count, &pid, &stamp) == 3) {
+        if (expected != count) {
+            LOG("ERROR: prev_count=%d, count=%d, pid=%d, time=%d\n",
+                expected, count, pid, stamp);
+        }
+        expected = count + 1;
+    }
+
+    if (!feof(fptr)) {
+        LOG("ERROR: malformed record in file %s\n", fn);
+    }
+
+    fclose(fptr);
+}
+
+/*
+ * Stress test: forever take a randomly chosen reader or writer lock on
+ * @fn; writers append a "count pid time" record whose count continues
+ * from the last record in the file. Never returns; kill to exit.
+ */
+static void random_locks(char *fn)
+{
+    int pid = getpid();
+    int status;
+    int sysstatus;
+    char filebuf[257];  /* 256 data bytes + guaranteed terminator */
+    int count = 0;
+    int dummy;
+    int clstat;
+    char uuid[12];
+    int readonly;
+    int lease = DEFAULT_LEASE_TIME_SECS;
+    int err;
+
+    /* this will never return, kill to exit */
+
+    srandom((int)time(0) ^ pid);
+
+    LOG("pid: %d using file %s\n", pid, fn);
+    sprintf(uuid, "%08d", pid);
+
+    while (1) {
+        int fd;
+
+        XSLEEP;
+        readonly = random() & 1;
+        sysstatus = lock(fn, uuid, 0, readonly, &lease, &status);
+        if (status != LOCK_OK)
+            continue;
+
+        /* got lock, open, read, modify write close file */
+        fd = open(fn, O_RDWR, 0644);
+        if (fd == -1) {
+            LOG("pid: %d ERROR on file %s open, errno=%d\n",
+                pid, fn, errno);
+        } else {
+            if (!readonly) {
+                /* format is "%d %d %d" 'count pid time' */
+                struct stat statbuf;
+                int bytes;
+
+                if (stat(fn, &statbuf) != -1) {
+                    if (statbuf.st_size > 256) {
+                        lseek(fd, -256, SEEK_END);
+                    }
+                    memset(filebuf, 0, sizeof(filebuf));
+                    bytes = read(fd, filebuf, 256);
+                    /* need at least 2 bytes to index bytes-2 safely */
+                    if (bytes > 1) {
+                        int bw = bytes - 2;
+
+                        /* back up to the start of the last record */
+                        while (bw && filebuf[bw] != '\n')
+                            bw--;
+                        if (!bw)
+                            bw = -1;
+                        sscanf(&filebuf[bw + 1], "%d %d %d",
+                               &count, &dummy, &dummy);
+                        count += 1;
+                    }
+                    lseek(fd, 0, SEEK_END);
+                    sprintf(filebuf, "%d %d %d\n",
+                            count, pid, (int)time(0));
+                    if (write(fd, filebuf, strlen(filebuf)) !=
+                        (ssize_t)strlen(filebuf)) {
+                        LOG("pid: %d ERROR on file %s write, "
+                            "errno=%d\n", pid, fn, errno);
+                    }
+                } else {
+                    LOG("pid: %d ERROR on file %s stat, "
+                        "errno=%d\n", pid, fn, errno);
+                }
+            }
+            clstat = close(fd);
+            if (unlikely(clstat == -1)) {
+                LOG("fail on close\n");
+            }
+        }
+        XSLEEP;
+        err = unlock(fn, uuid, readonly, &status);
+        LOG("unlock status is %d (err=%d)\n", status, err);
+    }
+}
+
+/*
+ * Take (and then repeatedly re-assert) a writer lock on @fn @loops times,
+ * stopping at the first failure, then release it. Meant to be timed
+ * externally.
+ */
+static void perf_lock(char *fn, int loops)
+{
+    int status = LOCK_OK;
+    int sysstatus;
+    char buf[12];   /* large enough for any int printed with %08d */
+    int iter;
+    int lease = DEFAULT_LEASE_TIME_SECS;
+
+    sprintf(buf, "%08d", getpid());
+
+    for (iter = 1; iter <= loops; iter++) {
+        sysstatus = lock(fn, buf, 0, 0, &lease, &status);
+        if (status < 0) {
+            /* report lock()'s own errno value, not a possibly stale errno */
+            printf("failed to get lock at iteration %d errno=%d\n",
+                   iter, sysstatus);
+            return;
+        }
+    }
+    unlock(fn, buf, 0, &status);
+}
+
+/*
+ * TEST build entry point. argv[1] selects the operation (see usage()),
+ * argv[2] is the file to operate on; remaining arguments are optional.
+ * Returns the system status of the operation, or 0.
+ */
+int main(int argc, char *argv[])
+{
+    int status = 0;
+    char *ptr;
+    char uuid[12];
+    int force = 0;
+    int readonly = 0;
+    int max_lease = -1, cur_lease = 0;
+    int intstatus = 0;
+    int lease = DEFAULT_LEASE_TIME_SECS;
+
+    if (argc < 3) {
+        usage(argv[0]);
+        return 0;
+    }
+
+    /* default unique id is the PID */
+    sprintf(uuid, "%08d", getpid());
+    ptr = uuid;
+
+    if (!strcmp(argv[1], "d")) {
+        status = lock_delta(argv[2], &cur_lease, &max_lease);
+
+        printf("lock delta for %s is %d seconds, max lease is %d\n",
+               argv[2], cur_lease, max_lease);
+    } else if (!strcmp(argv[1], "t")) {
+        test_file(argv[2]);
+    } else if (!strcmp(argv[1], "r")) {
+        random_locks(argv[2]);
+    } else if (!strcmp(argv[1], "p")) {
+        /* argv[3] only exists when argc >= 4 */
+        perf_lock(argv[2], argc < 4 ? 100000 : atoi(argv[3]));
+    } else if (!strcmp(argv[1], "l")) {
+        /* order on the command line: readonly, force, uniqid, lease */
+        if (argc >= 4) readonly = atoi(argv[3]);
+        if (argc >= 5) force = atoi(argv[4]);
+        if (argc >= 6) ptr = argv[5];
+        if (argc >= 7) lease = atoi(argv[6]);
+        status = lock(argv[2], ptr, force, readonly, &lease, &intstatus);
+        printf("lock status = %d\n", status);
+    } else if (!strcmp(argv[1], "u")) {
+        if (argc >= 4) readonly = atoi(argv[3]);
+        if (argc >= 5) ptr = argv[4];
+        status = unlock(argv[2], ptr, readonly, &intstatus);
+        printf("unlock status = %d\n", intstatus);
+    } else {
+        usage(argv[0]);
+    }
+
+    return status;
+}
+#elif defined(UTIL)
+/*
+ * The following builds a standalone (non-library) command-line
+ * utility intended to be driven from shell scripts.
+ */
+
+/* Print the UTIL build's command summary to stdout; @prg is argv[0]. */
+static void usage(char *prg)
+{
+    static const char *const notes[] = {
+        " delta : get time since lock last refreshed\n",
+        " returns delta time and max lease time in seconds\n",
+        " unlock: unlock request filename, r|w, uniqID\n",
+        " returns status (success is 0)\n",
+        " lock : lock request filename, r|w, force?, uniqID, lease time request\n",
+        " returns status (success is 0) and established lease time in seconds\n",
+    };
+    size_t line;
+
+    printf("usage %s\n"
+           " delta <filename>\n"
+           " unlock <filename> <r|w> <uniqid>\n"
+           " lock <filename> <r|w> <0|1> <uniqid> <leasetime>\n", prg);
+
+    for (line = 0; line < sizeof(notes) / sizeof(notes[0]); line++)
+        fputs(notes[line], stdout);
+}
+
+/*
+ * UTIL build entry point: a scriptable front end to lock(), unlock() and
+ * lock_delta(). Results are printed as bare integers on stdout; the exit
+ * status is either 0 or a system defined errno.
+ */
+int main(int argc, char *argv[])
+{
+    int rc = 0;
+    int lease = DEFAULT_LEASE_TIME_SECS;
+    int cur_lease, max_lease, intstatus;
+    const char *cmd;
+
+    if (argc < 3) {
+        if (argc == 2 && strcmp(argv[1], "-h") == 0)
+            usage(argv[0]);
+        else
+            printf("%d\n", LOCK_EUSAGE);
+        return 0;
+    }
+
+    cmd = argv[1];
+
+    if (argc == 3 && strcmp(cmd, "delta") == 0) {
+        rc = lock_delta(argv[2], &cur_lease, &max_lease);
+        printf("%d %d\n", cur_lease, max_lease);
+    } else if (argc == 7 && strcmp(cmd, "lock") == 0) {
+        int readonly = (strcmp(argv[3], "r") == 0);
+        int force = atoi(argv[4]);
+
+        lease = atoi(argv[6]);
+        rc = lock(argv[2], argv[5], force, readonly, &lease, &intstatus);
+        printf("%d %d\n", intstatus, lease);
+    } else if (argc == 5 && strcmp(cmd, "unlock") == 0) {
+        int readonly = (strcmp(argv[3], "r") == 0);
+
+        rc = unlock(argv[2], argv[4], readonly, &intstatus);
+        printf("%d\n", intstatus);
+    } else {
+        printf("%d\n", LOCK_EUSAGE);
+    }
+
+    return rc;
+}
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Lease time, in seconds, used by the bundled tools when none is given. */
+#define DEFAULT_LEASE_TIME_SECS 30
+
+/*
+ * All three entry points return a system status (0 or an errno value) and
+ * report the lock-level outcome through an int out-parameter holding a
+ * lock_error code.
+ *
+ * lock():       *retstat is negative on failure; the implementation also
+ *               stores 1 there when an existing lock is re-asserted.
+ *               *lease_time is set to -1 whenever *retstat is negative.
+ * unlock():     *retstat is LOCK_OK, LOCK_EBADPARM, LOCK_ENOMEM or
+ *               LOCK_ENOLOCK.
+ * lock_delta(): *cur_lease_time is the smallest age in seconds of any lock
+ *               file found, or a negative lock_error; *max_lease_time is
+ *               the lease time read from a lock file, or -1 on error.
+ */
+int lock(char *fn_to_lock, char *uuid, int force, int readonly, int *lease_time, int *retstat);
+int unlock(char *fn_to_unlock, char *uuid, int readonly, int *retstat);
+int lock_delta(char *fn_to_check, int *cur_lease_time, int *max_lease_time);
+
+/*
+ * Lock-level result codes: zero is success, every failure is negative.
+ * As used by the implementation: EHELD_WR / EHELD_RD mean another writer /
+ * reader holds the lock, EREAD reports an I/O error while examining
+ * holders, EOPEN / EUPDATE report failure to open / rewrite the final lock
+ * file, ENOLOCK means no lock file was found or could be removed, and
+ * EUSAGE is printed by the command-line utility for bad arguments.
+ */
+typedef enum {
+ LOCK_OK = 0,
+ LOCK_EBADPARM = -1,
+ LOCK_ENOMEM = -2,
+ LOCK_ESTAT = -3,
+ LOCK_EHELD_WR = -4,
+ LOCK_EHELD_RD = -5,
+ LOCK_EOPEN = -6,
+ LOCK_EXLOCK_OPEN = -7,
+ LOCK_EXLOCK_WRITE= -8,
+ LOCK_EINODE = -9,
+ LOCK_EUPDATE = -10,
+ LOCK_EREAD = -11,
+ LOCK_EREMOVE = -12,
+ LOCK_ENOLOCK = -13,
+ LOCK_EUSAGE = -14,
+} lock_error;
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* log.h: API for writelog communication */
+
+#ifndef __LOG_H__
+#define __LOG_H__ 1
+
+#include <inttypes.h>
+
+#include <xen/io/ring.h>
+/* for wmb et al */
+#include <xenctrl.h>
+
+/* Four-character command codes; they fit struct log_ctlmsg's msg[4]
+ * exactly, without a terminating NUL. */
+#define LOGCMD_SHMP "shmp"
+#define LOGCMD_PEEK "peek"
+#define LOGCMD_CLEAR "clrw"
+#define LOGCMD_GET "getw"
+#define LOGCMD_KICK "kick"
+
+/* NOTE(review): presumably the response payload length in bytes for each
+ * command above - confirm against the server implementation. */
+#define CTLRSPLEN_SHMP 256
+#define CTLRSPLEN_PEEK 4
+#define CTLRSPLEN_CLEAR 4
+#define CTLRSPLEN_GET 4
+#define CTLRSPLEN_KICK 0
+
+/* shmregion is arbitrarily capped at 8 megs for a minimum of
+ * 64 MB of data per read (if there are no contiguous regions)
+ * In the off-chance that there is more dirty data, multiple
+ * reads must be done */
+#define SHMSIZE (8 * 1024 * 1024)
+/* Bytes reserved for the request ring that sits between the bitmap half
+ * and the data area (see sdatastart()). */
+#define SRINGSIZE 4096
+
+/* The shared memory region is split up into 3 subregions:
+ * The first half is reserved for the dirty bitmap log.
+ * The second half begins with 1 page for read request descriptors,
+ * followed by a big area for supplying read data.
+ */
+/* Start of the dirty-bitmap area: the very base of the shared region. */
+static inline void *bmstart(void *shm)
+{
+    void *bitmap_base = shm;
+
+    return bitmap_base;
+}
+
+/*
+ * One past the end of the dirty-bitmap area, i.e. the midpoint of the
+ * shared region. Uses char * arithmetic because arithmetic on void * is
+ * a GNU extension rather than standard C.
+ */
+static inline void *bmend(void *shm)
+{
+    return (char *)shm + SHMSIZE / 2;
+}
+
+/* The request ring begins exactly where the bitmap area ends. */
+static inline void *sringstart(void *shm)
+{
+    void *ring_base = bmend(shm);
+
+    return ring_base;
+}
+
+/*
+ * Start of the read-data area: SRINGSIZE bytes past the start of the
+ * request ring. Uses char * arithmetic because arithmetic on void * is a
+ * GNU extension rather than standard C.
+ */
+static inline void *sdatastart(void *shm)
+{
+    return (char *)sringstart(shm) + SRINGSIZE;
+}
+
+/*
+ * One past the end of the read-data area, which is also the end of the
+ * whole shared region. Uses char * arithmetic because arithmetic on
+ * void * is a GNU extension rather than standard C.
+ */
+static inline void *sdataend(void *shm)
+{
+    return (char *)shm + SHMSIZE;
+}
+
+/* format for messages between log client and server */
+struct log_ctlmsg {
+ /* command code; the LOGCMD_* strings are exactly 4 characters, so this
+  * field carries no terminating NUL */
+ char msg[4];
+ /* command arguments - NOTE(review): encoding not visible here, confirm */
+ char params[16];
+};
+
+/* extent descriptor: a run of `count` sectors starting at `sector` */
+struct disk_range {
+ uint64_t sector; /* first sector of the extent */
+ uint32_t count; /* number of sectors - presumably; confirm units */
+};
+
+/* dirty write logging space. This is an extent ring at the front,
+ * full of disk_ranges plus a pointer into the data area */
+/* I think I'd rather have the header in front of each data section to
+ * avoid having two separate spaces that can run out, but then I'd either
+ * lose page alignment on the data blocks or spend an entire page on the
+ * header */
+
+/* One ring entry: a disk extent plus where its data lives in the shared
+ * data area. Used for both requests and responses. */
+struct log_extent {
+ uint64_t sector; /* first sector of the extent */
+ uint32_t count; /* length of the extent */
+ uint32_t offset; /* offset from start of data area to start of extent */
+};
+
+/* struct above should be 16 bytes, or 256 extents/page */
+
+/* Requests and responses share the same extent layout. */
+typedef struct log_extent log_request_t;
+typedef struct log_extent log_response_t;
+
+/* Instantiate the ring types for the "log" ring using the macro from
+ * <xen/io/ring.h>. */
+DEFINE_RING_TYPES(log, log_request_t, log_response_t);
+
+/* NOTE(review): not used within this header; presumably the number of
+ * pages reserved for log headers - confirm against users. */
+#define LOG_HEADER_PAGES 4
+
+#endif
--- /dev/null
+/* start - public domain MD5 implementation */
+/*
+ * This code implements the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ */
+
+#include <string.h>
+#include <stdint.h>
+
+/* Running state of one MD5 computation. */
+struct MD5Context {
+ uint32_t buf[4]; /* the four chaining words, seeded by MD5Init */
+ uint32_t bits[2]; /* message length in bits: [0] low word, [1] high word */
+ uint8_t in[64]; /* partial input block awaiting MD5Transform */
+};
+
+static void MD5Init(struct MD5Context *context);
+static void MD5Update(struct MD5Context *context, unsigned char const *buf,
+ unsigned len);
+static void MD5Final(unsigned char digest[16], struct MD5Context *context);
+static void MD5Transform(uint32_t buf[4], uint32_t const in[16]);
+
+
+typedef struct MD5Context MD5_CTX;
+
+
+/**
+ * md5_sum - MD5 hash for a data block
+ * @addr: Pointer to the data area
+ * @len: Length of the data block in bytes
+ * @mac: Buffer for the hash (16 bytes are written)
+ *
+ * MD5Update() takes an unsigned length, so the data is fed in bounded
+ * chunks; passing a size_t straight through would silently truncate
+ * lengths that do not fit in an unsigned.
+ */
+void md5_sum(const uint8_t *addr, const size_t len, uint8_t *mac)
+{
+    /* a multiple of 64, so every chunk but the last ends on a block edge */
+    const size_t max_chunk = (size_t)1 << 30;
+    size_t remaining = len;
+    MD5_CTX ctx;
+
+    MD5Init(&ctx);
+    while (remaining > 0) {
+        const size_t chunk = remaining < max_chunk ? remaining : max_chunk;
+
+        MD5Update(&ctx, addr, (unsigned)chunk);
+        addr += chunk;
+        remaining -= chunk;
+    }
+    MD5Final(mac, &ctx);
+}
+
+
+#ifndef WORDS_BIGENDIAN
+#define byteReverse(buf, len) /* Nothing */
+#else
+/*
+ * Rewrite each group of four bytes in place as the host-order 32-bit
+ * word whose little-endian encoding those bytes were. @longs is the
+ * number of words and must be non-zero (the loop body always runs once).
+ * Note: this code is harmless on little-endian machines.
+ */
+static void byteReverse(unsigned char *buf, unsigned longs)
+{
+    do {
+        const uint32_t low_half = (unsigned) buf[1] << 8 | buf[0];
+        const uint32_t high_half = (unsigned) buf[3] << 8 | buf[2];
+
+        *(uint32_t *) buf = high_half << 16 | low_half;
+        buf += 4;
+        longs--;
+    } while (longs);
+}
+#endif
+
+/*
+ * Begin a new MD5 computation: load the four chaining words with their
+ * fixed starting values and reset the 64-bit bit counter.
+ */
+void MD5Init(struct MD5Context *ctx)
+{
+    static const uint32_t initial_state[4] = {
+        0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
+    };
+
+    memcpy(ctx->buf, initial_state, sizeof(initial_state));
+    ctx->bits[0] = ctx->bits[1] = 0;
+}
+
+/*
+ * Absorb @len more bytes from @buf into the running hash. Complete
+ * 64-byte blocks are transformed immediately; any tail is kept in
+ * ctx->in until more data (or MD5Final) arrives.
+ */
+void MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
+{
+    const uint32_t old_low = ctx->bits[0];
+    /* bytes already waiting in ctx->in from earlier calls */
+    uint32_t buffered = (old_low >> 3) & 0x3f;
+
+    /* advance the 64-bit bit counter, carrying from low to high word */
+    ctx->bits[0] = old_low + ((uint32_t) len << 3);
+    if (ctx->bits[0] < old_low)
+        ctx->bits[1]++;
+    ctx->bits[1] += len >> 29;
+
+    /* top up a partially filled block first */
+    if (buffered != 0) {
+        unsigned char *fill = (unsigned char *) ctx->in + buffered;
+        const uint32_t room = 64 - buffered;
+
+        if (len < room) {
+            memcpy(fill, buf, len);
+            return;
+        }
+        memcpy(fill, buf, room);
+        byteReverse(ctx->in, 16);
+        MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+        buf += room;
+        len -= room;
+    }
+
+    /* consume whole blocks */
+    for (; len >= 64; buf += 64, len -= 64) {
+        memcpy(ctx->in, buf, 64);
+        byteReverse(ctx->in, 16);
+        MD5Transform(ctx->buf, (uint32_t *) ctx->in);
+    }
+
+    /* stash whatever is left for next time */
+    memcpy(ctx->in, buf, len);
+}
+
+/*
+ * Finish the hash: append the 0x80 marker, zero-pad to 56 mod 64, append
+ * the 64-bit message length in bits, run the last transform(s), copy the
+ * 16-byte digest out and wipe the context.
+ */
+void MD5Final(unsigned char digest[16], struct MD5Context *ctx)
+{
+    /* bytes currently buffered (message length mod 64) */
+    const unsigned used = (ctx->bits[0] >> 3) & 0x3F;
+    /* space left in the block once the marker byte is stored */
+    const unsigned room = 64 - 1 - used;
+    unsigned char *tail = ctx->in + used + 1;
+    uint32_t *words = (uint32_t *) ctx->in;
+
+    /* there is always at least one free byte for the marker */
+    ctx->in[used] = 0x80;
+
+    if (room >= 8) {
+        /* length fits in this block: pad up to byte 56 */
+        memset(tail, 0, room - 8);
+    } else {
+        /* no room for the length: flush this block, start a fresh one */
+        memset(tail, 0, room);
+        byteReverse(ctx->in, 16);
+        MD5Transform(ctx->buf, words);
+        memset(ctx->in, 0, 56);
+    }
+    byteReverse(ctx->in, 14);
+
+    /* append length in bits and transform */
+    words[14] = ctx->bits[0];
+    words[15] = ctx->bits[1];
+
+    MD5Transform(ctx->buf, words);
+    byteReverse((unsigned char *) ctx->buf, 4);
+    memcpy(digest, ctx->buf, 16);
+    memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
+}
+
+/* The four core functions - F1 is optimized somewhat.
+ * Macro arguments are not parenthesized, so pass only simple
+ * identifiers (as MD5Transform does). */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm: add f(x, y, z) and the
+ * data word into w, rotate w left by s bits, then add x. w is read and
+ * written several times, so it must be a plain 32-bit lvalue. */
+#define MD5STEP(f, w, x, y, z, data, s) \
+ ( w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x )
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data. MD5Update blocks
+ * the data and converts bytes into longwords for this routine.
+ *
+ * @buf: the four chaining words; updated in place
+ * @in:  one 64-byte block viewed as sixteen 32-bit words
+ *
+ * Four rounds of sixteen steps each, one round per core function F1..F4.
+ */
+static void MD5Transform(uint32_t buf[4], uint32_t const in[16])
+{
+ register uint32_t a, b, c, d;
+
+ /* work on copies; the results are folded back into buf at the end */
+ a = buf[0];
+ b = buf[1];
+ c = buf[2];
+ d = buf[3];
+
+ /* round 1: F1, input words taken in order */
+ MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+ MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+ MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+ MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+ MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+ MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+ MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+ MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+ MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+ MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+ MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+ MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+ MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+ MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+ MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+ MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+ /* round 2: F2 */
+ MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+ MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+ MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+ MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+ MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+ MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+ MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+ MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+ MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+ MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+ MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+ MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+ MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+ MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+ MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+ MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+ /* round 3: F3 */
+ MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+ MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+ MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+ MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+ MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+ MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+ MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+ MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+ MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+ MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+ MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+ MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+ MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+ MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+ MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+ MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+ /* round 4: F4 */
+ MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+ MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+ MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+ MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+ MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+ MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+ MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+ MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+ MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+ MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+ MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+ MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+ MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+ MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+ MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+ MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+ /* fold this block's result into the chaining state */
+ buf[0] += a;
+ buf[1] += b;
+ buf[2] += c;
+ buf[3] += d;
+}
--- /dev/null
+#ifndef MD5_H
+#define MD5_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+/**
+ * md5_sum - MD5 hash for a data block
+ * @addr: Pointer to the data area
+ * @len: Length of the data block in bytes
+ * @mac: Buffer for the hash (must hold 16 bytes)
+ */
+void md5_sum(const uint8_t *addr, const size_t len, uint8_t *mac);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __TAP_PROFILE_H__
+#define __TAP_PROFILE_H__
+
+#ifndef _GNU_SOURCE
+ #define _GNU_SOURCE
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syslog.h>
+#include <sys/time.h>
+#include <time.h>
+#include <fcntl.h>
+#include <inttypes.h>
+
+//#define PROFILING
+//#define LOGGING
+
+#define TAPPROF_IN 1
+#define TAPPROF_OUT 2
+
+/* Per-function timing slot in a profile_info table. */
+struct profile_times {
+ /* heap copy of the function name (strdup'd on first entry); NULL
+  * marks an unused slot */
+ char *fn_name;
+ /* in: timestamp of the pending entry, 0 when none is pending;
+  * out_sum: total of (exit - entry) timestamps; cnt: completed calls */
+ uint64_t in, out_sum, cnt;
+};
+
+/* State of one profiler instance, set up by tp_open(). */
+struct profile_info {
+ FILE *log; /* event log; only opened when LOGGING is defined */
+ int size; /* number of slots in pt */
+ char *name; /* heap copy of the tap name, used as a log prefix */
+ unsigned long long seq; /* next id handed out by tp_get_id() */
+ struct profile_times *pt; /* table of per-function slots */
+};
+
+#ifdef PROFILING
+
+/*
+ * Initialise @prof with room for @size profiled functions.
+ *
+ * @tap_name is copied and used as a prefix in tp_close() output;
+ * @log_name is only opened when LOGGING is defined. If the slot table
+ * cannot be allocated (or @size is not positive) the profiler is left
+ * with zero slots, so no later call indexes a table that does not exist.
+ */
+static inline void
+tp_open(struct profile_info *prof, char *tap_name, char *log_name, int size)
+{
+    memset(prof, 0, sizeof(struct profile_info));
+#ifdef LOGGING
+    prof->log = fopen(log_name, "w");
+#endif
+    prof->name = strdup(tap_name);
+
+    /* calloc zeroes the table and checks the size multiplication */
+    if (size > 0)
+        prof->pt = calloc((size_t)size, sizeof(struct profile_times));
+
+    /* only advertise slots that were really allocated */
+    prof->size = prof->pt ? size : 0;
+}
+
+/*
+ * Report the call count and average cycle time of every used slot via
+ * syslog, then release everything tp_open() allocated. Safe to call
+ * when the slot table allocation failed; the pointers are cleared so a
+ * second call is harmless.
+ */
+static inline void
+tp_close(struct profile_info *prof)
+{
+    int i;
+    struct profile_times *pt;
+
+    for (i = 0; prof->pt && i < prof->size; i++) {
+        pt = &prof->pt[i];
+        if (!pt->fn_name)
+            continue;
+
+        /* the counters are uint64_t: PRIu64 is the matching conversion */
+        syslog(LOG_DEBUG,
+               "%s: %s: cnt: %" PRIu64 ", avg time: %" PRIu64 "\n",
+               prof->name, pt->fn_name, pt->cnt,
+               pt->cnt ? pt->out_sum / pt->cnt : (uint64_t)0);
+        free(pt->fn_name);
+        pt->fn_name = NULL;
+    }
+
+#ifdef LOGGING
+    if (prof->log) {
+        fclose(prof->log);
+        prof->log = NULL;
+    }
+#endif
+    free(prof->name);
+    prof->name = NULL;
+    free(prof->pt);
+    prof->pt = NULL;
+    prof->size = 0;
+}
+
+/* Hand out the next sequence number: returns the current value of
+ * prof->seq and then advances it by one. */
+static inline u64
+tp_get_id(struct profile_info *prof)
+{
+    const unsigned long long id = prof->seq;
+
+    prof->seq = id + 1;
+    return id;
+}
+
+/*
+ * Find the slot for function @name: the first slot that is either still
+ * unused or already carries that name. When every slot belongs to some
+ * other function the last slot (size - 1) is returned.
+ */
+static inline int
+tp_fn_id(struct profile_info *prof, const char *name)
+{
+    int slot = 0;
+
+    while (slot < prof->size) {
+        const struct profile_times *entry = &prof->pt[slot];
+
+        if (entry->fn_name == NULL || strcmp(entry->fn_name, name) == 0)
+            return slot;
+        slot++;
+    }
+
+    return prof->size - 1;
+}
+
+/*
+ * Record entry into function @func: claim or find its slot and store the
+ * current x86 time-stamp counter in it.
+ *
+ * The counter is read as two 32-bit halves. The "=A" constraint only
+ * names the edx:eax pair on 32-bit x86; on x86-64 it would not return
+ * the full 64-bit value.
+ */
+static inline void
+__tp_in(struct profile_info *prof, const char *func)
+{
+    uint32_t lo, hi;
+    int idx;
+    struct profile_times *pt;
+
+    /* nothing to record into if the slot table does not exist */
+    if (!prof->pt)
+        return;
+    idx = tp_fn_id(prof, func);
+    if (idx < 0)
+        return;
+    pt = &prof->pt[idx];
+
+    if (!pt->fn_name)
+        pt->fn_name = strdup(func);
+
+    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
+    pt->in = ((uint64_t)hi << 32) | lo;
+}
+
+#define tp_in(prof) __tp_in(prof, __func__)
+
+/*
+ * Record exit from function @func: add the cycles elapsed since the
+ * matching __tp_in() to the slot's total and bump its call count. Does
+ * nothing when no entry is pending for that slot.
+ *
+ * The counter is read as two 32-bit halves. The "=A" constraint only
+ * names the edx:eax pair on 32-bit x86; on x86-64 it would not return
+ * the full 64-bit value.
+ */
+static inline void
+__tp_out(struct profile_info *prof, const char *func)
+{
+    uint32_t lo, hi;
+    uint64_t now;
+    int idx;
+    struct profile_times *pt;
+
+    /* nothing to record into if the slot table does not exist */
+    if (!prof->pt)
+        return;
+    idx = tp_fn_id(prof, func);
+    if (idx < 0)
+        return;
+    pt = &prof->pt[idx];
+
+    if (!pt->fn_name || !pt->in)
+        return;
+
+    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
+    now = ((uint64_t)hi << 32) | lo;
+
+    pt->cnt++;
+    pt->out_sum += (now - pt->in);
+    pt->in = 0;
+}
+
+#define tp_out(prof) __tp_out(prof, __func__)
+
+/*
+ * Record an entry (TAPPROF_IN) or exit (any other @direction) event for
+ * @func and, when LOGGING is defined, append a line with the request
+ * @id and the time-stamp counter value to the profiler's log.
+ *
+ * The counter is read as two 32-bit halves. The "=A" constraint only
+ * names the edx:eax pair on 32-bit x86; on x86-64 it would not return
+ * the full 64-bit value.
+ */
+static inline void
+__tp_log(struct profile_info *prof, u64 id, const char *func, int direction)
+{
+#ifdef LOGGING
+    uint32_t lo, hi;
+
+    /* sample the counter before the bookkeeping below */
+    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
+#endif
+
+    if (direction == TAPPROF_IN)
+        __tp_in(prof, func);
+    else
+        __tp_out(prof, func);
+
+#ifdef LOGGING
+    if (prof->log)
+        fprintf(prof->log, "%s: %s: %llu, %lld\n", func,
+                ((direction == TAPPROF_IN) ? "in" : "out"),
+                (unsigned long long)id,
+                (long long)(((uint64_t)hi << 32) | lo));
+#else
+    (void)id;
+#endif
+}
+
+#define tp_log(prof, id, direction) __tp_log(prof, id, __func__, direction)
+
+#else
+/* PROFILING is off: every profiling hook compiles to a no-op expression,
+ * so call sites need no #ifdefs of their own. Arguments are not
+ * evaluated. */
+#define tp_open(prof, tname, lname, size) ((void)0)
+#define tp_close(prof) ((void)0)
+#define tp_in(prof) ((void)0)
+#define tp_out(prof) ((void)0)
+#define tp_log(prof, sec, direction) ((void)0)
+#endif
+
+#endif
--- /dev/null
+/* qcow-create.c
+ *
+ * Generates a qcow format disk.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include "tapdisk.h"
+#include "qcow.h"
+
+/*
+ * Debug print helper.  With the "#if 1" branch active it forwards to
+ * fprintf(stderr, ...); flipping the condition compiles every call away.
+ */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* Size of the buffers main() uses for the image and backing file names. */
+#define MAX_NAME_LEN 1000
+
+/*
+ * Print the tool version and the usage line on stderr, then terminate
+ * the process with exit status -1.  This function does not return.
+ */
+void help(void)
+{
+	fputs("Qcow-utils: v1.0.0\n", stderr);
+	fputs("usage: qcow-create [-h help] [-r reserve] <SIZE(MB)> <FILENAME> "
+	      "[<BACKING_FILENAME>]\n", stderr);
+	exit(-1);
+}
+
+/*
+ * qcow-create entry point.
+ *
+ * Usage: qcow-create [-h] [-r] <SIZE(MB)> <FILENAME> [<BACKING_FILENAME>]
+ *
+ *   -h  print usage and exit
+ *   -r  pass sparse = 0 to qcow_create() (default is sparse = 1)
+ *
+ * The size argument is given in megabytes and converted to bytes before
+ * being handed to qcow_create().
+ *
+ * Returns 0 when the image was created and 1 when qcow_create() failed;
+ * usage errors terminate through help().
+ */
+int main(int argc, char *argv[])
+{
+	int ret = -1, c, backed = 0;
+	int sparse = 1;
+	uint64_t size;
+	unsigned long long mbytes;
+	char *end;
+	char filename[MAX_NAME_LEN], bfilename[MAX_NAME_LEN];
+
+	for(;;) {
+		c = getopt(argc, argv, "hr");
+		if (c == -1)
+			break;
+		switch(c) {
+		case 'h':
+			help();	/* does not return */
+			break;
+		case 'r':
+			sparse = 0;
+			break;
+		default:
+			fprintf(stderr, "Unknown option\n");
+			help();
+		}
+	}
+
+	printf("Optind %d, argc %d\n", optind, argc);
+	if ( !(optind == (argc - 2) || optind == (argc - 3)) )
+		help();
+
+	/*
+	 * Parse the size strictly: reject empty strings, trailing junk,
+	 * negative numbers (strtoull would silently wrap them) and values
+	 * that would overflow once converted from MB to bytes.
+	 */
+	errno = 0;
+	mbytes = strtoull(argv[optind], &end, 10);
+	if (errno || end == argv[optind] || *end != '\0' ||
+	    strchr(argv[optind], '-') != NULL ||
+	    mbytes > (~(uint64_t)0 >> 20)) {
+		fprintf(stderr, "Invalid size '%s'\n", argv[optind]);
+		help();
+	}
+	optind++;
+	size = (uint64_t)mbytes << 20;
+
+	if (snprintf(filename, MAX_NAME_LEN, "%s",argv[optind++]) >=
+	    MAX_NAME_LEN) {
+		fprintf(stderr,"Device name too long\n");
+		exit(-1);
+	}
+
+	if (optind != argc) {
+		/*Backing file argument*/
+		backed = 1;
+		if (snprintf(bfilename, MAX_NAME_LEN, "%s",argv[optind++]) >=
+		    MAX_NAME_LEN) {
+			fprintf(stderr,"Device name too long\n");
+			exit(-1);
+		}
+	}
+
+	DFPRINTF("Creating file size %"PRIu64", name %s\n",(uint64_t)size, filename);
+	if (!backed)
+		ret = qcow_create(filename,size,NULL,sparse);
+	else
+		ret = qcow_create(filename,size,bfilename,sparse);
+
+	if (ret < 0) {
+		DPRINTF("Unable to create QCOW file\n");
+		/* report the failure to the caller instead of claiming success */
+		return 1;
+	}
+
+	DPRINTF("QCOW file successfully created\n");
+	return 0;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _QCOW_H_
+#define _QCOW_H_
+
+#include "aes.h"
+/**************************************************************/
+/* QEMU COW block driver with compression and encryption support */
+
+/* Magic values: three ASCII bytes followed by 0xfb, most significant first. */
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+#define XEN_MAGIC (('X' << 24) | ('E' << 16) | ('N' << 8) | 0xfb)
+#define QCOW_VERSION 1
+
+/* Values for the crypt_method fields. */
+#define QCOW_CRYPT_NONE 0x00
+#define QCOW_CRYPT_AES 0x01
+
+/*
+ * Top bit of a 64-bit value.  Built from an unsigned constant because
+ * shifting a signed 1 into the sign bit is undefined behaviour in C.
+ */
+#define QCOW_OFLAG_COMPRESSED (1ULL << 63)
+/* NOTE(review): presumably bit values for QCowHeader_ext.flags - confirm. */
+#define SPARSE_FILE 0x01
+#define EXTHDR_L1_BIG_ENDIAN 0x02
+
+/* Platforms without O_BINARY get a harmless zero flag. */
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif
+
+/*
+ * Round l up to the next multiple of s; the result has type uint64_t.
+ * Both arguments are fully parenthesized so that expression arguments
+ * (e.g. ROUNDUP(len, a + b)) expand with the intended precedence.
+ * Each argument is still evaluated more than once, so avoid side effects.
+ */
+#define ROUNDUP(l, s) \
+({ \
+	(uint64_t)( \
+	(((l) + ((s) - 1)) - (((l) + ((s) - 1)) % (s)))); \
+})
+
+/*
+ * Primary qcow image header.
+ * NOTE(review): presumably mirrors the on-disk header layout, with magic
+ * and version expected to hold QCOW_MAGIC and QCOW_VERSION - confirm
+ * against the reader/writer code before changing any field.
+ */
+typedef struct QCowHeader {
+ uint32_t magic;
+ uint32_t version;
+ /* location and length of the backing file name, if any (TODO confirm) */
+ uint64_t backing_file_offset;
+ uint32_t backing_file_size;
+ uint32_t mtime;
+ uint64_t size; /* in bytes */
+ uint8_t cluster_bits;
+ uint8_t l2_bits;
+ /* one of the QCOW_CRYPT_* values (TODO confirm) */
+ uint32_t crypt_method;
+ uint64_t l1_table_offset;
+} QCowHeader;
+
+/*
+ * Extended header for Xen enhancements.
+ * NOTE(review): xmagic presumably holds XEN_MAGIC, cksum a value from
+ * gen_cksum(), and flags the SPARSE_FILE / EXTHDR_L1_BIG_ENDIAN bits -
+ * confirm against the code that reads and writes this header.
+ */
+typedef struct QCowHeader_ext {
+ uint32_t xmagic;
+ uint32_t cksum;
+ uint32_t min_cluster_alloc;
+ uint32_t flags;
+} QCowHeader_ext;
+
+/*
+ * Helpers implemented outside this header.
+ * NOTE(review): semantics are not visible here; judging by the names,
+ * gen_cksum() checksums len bytes at ptr, get_filesize() reports the size
+ * of filename through *size, and qtruncate() resizes fd to length while
+ * honouring the sparse flag - confirm against the definitions.
+ */
+uint32_t gen_cksum(char *ptr, int len);
+int get_filesize(char *filename, uint64_t *size, struct stat *st);
+int qtruncate(int fd, off_t length, int sparse);
+
+/* Number of slots in tdqcow_state's l2_cache_offsets / l2_cache_counts. */
+#define L2_CACHE_SIZE 16 /*Fixed allocation in Qemu*/
+
+/*
+ * Per-image state kept by the qcow tapdisk driver: file bookkeeping,
+ * cluster geometry, the L1 table and L2 cache, encryption keys, and the
+ * pool of asynchronous I/O requests.
+ */
+struct tdqcow_state {
+ int fd; /*Main Qcow file descriptor */
+ uint64_t fd_end; /*Store a local record of file length */
+ char *name; /*Record of the filename*/
+ uint32_t backing_file_size;
+ uint64_t backing_file_offset;
+ uint8_t extended; /*File contains extended header*/
+ int encrypted; /*File contents are encrypted or plain*/
+ int cluster_bits; /*Determines length of cluster as
+ *indicated by file hdr*/
+ int cluster_size; /*Length of cluster*/
+ int cluster_sectors; /*Number of sectors per cluster*/
+ int cluster_alloc; /*Blktap fix for allocating full
+ *extents*/
+ int min_cluster_alloc; /*Blktap historical extent alloc*/
+ int sparse; /*Indicates whether to preserve sparseness*/
+ int l2_bits; /*Size of L2 table entry*/
+ int l2_size; /*Full table size*/
+ int l1_size; /*L1 table size*/
+ uint64_t cluster_offset_mask;
+ uint64_t l1_table_offset; /*L1 table offset from beginning of
+ *file*/
+ uint64_t *l1_table; /*L1 table entries*/
+ uint64_t *l2_cache; /*We maintain a cache of size
+ *L2_CACHE_SIZE of most read entries*/
+ uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /*L2 cache entries*/
+ uint32_t l2_cache_counts[L2_CACHE_SIZE]; /*Cache access record*/
+ uint8_t *cluster_cache;
+ uint8_t *cluster_data;
+ uint64_t cluster_cache_offset; /**/
+ uint32_t crypt_method; /*current crypt method, 0 if no
+ *key yet */
+ uint32_t crypt_method_header; /**/
+ AES_KEY aes_encrypt_key; /*AES key*/
+ AES_KEY aes_decrypt_key; /*AES key*/
+
+ /* libaio state */
+ int aio_free_count;
+ int max_aio_reqs;
+ struct qcow_request *aio_requests;
+ struct qcow_request **aio_free_list;
+
+};
+
+/*
+ * Create a qcow image.
+ *
+ * As used by qcow-create: filename is the image to create, total_size is
+ * the virtual disk size in bytes, backing_file is NULL when there is no
+ * backing image, and sparse is a 0/1 flag.  A negative return value
+ * indicates failure.
+ */
+int qcow_create(const char *filename, uint64_t total_size,
+ const char *backing_file, int sparse);
+
+#endif //_QCOW_H_
--- /dev/null
+/* qcow2raw.c
+ *
+ * Generates raw image data from an existing qcow image
+ *
+ * (c) 2006 Julian Chesterfield and Andrew Warfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+
+#include "bswap.h"
+#include "aes.h"
+#include "blk.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-disktype.h"
+#include "qcow.h"
+
+/*
+ * Debug/progress print helper: forwards to fprintf(stderr, ...) while the
+ * "#if 1" branch is active, and compiles to nothing otherwise.
+ */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+
+/* *BSD has no O_LARGEFILE */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+/* Bytes copied per request; main() converts it to 512-byte sectors (>>9). */
+#define BLOCK_PROCESSSZ 4096
+/* Identifiers passed to tapdisk_vbd_initialize()/tapdisk_server_get_vbd(). */
+#define QCOW_VBD 0
+#define AIO_VBD 1
+/* NOTE(review): WINDOW is not referenced in the visible code. */
+#define WINDOW 32
+/* Percentage represented by one character of the progress bar. */
+#define PROGRESS_QUANT 2
+
+/* Main-loop control: running keeps the loop alive, complete is set once
+ * every read has been submitted. */
+static int running = 1, complete = 0;
+/* Sector counters updated by the read/write completion callbacks. */
+static int returned_read_events = 0, returned_write_events = 0;
+/* Sectors submitted for reading so far. */
+static int submit_events = 0;
+/* NOTE(review): read_idx is not referenced in the visible code. */
+static uint32_t read_idx = 0;
+/* Drivers and VBDs of the qcow source and the raw (aio) destination. */
+td_driver_t *ddqcow, *ddaio;
+td_vbd_t* qcow_vbd, *aio_vbd;
+/* Progress bar state: prev counts drawn steps, written counts sectors. */
+static uint64_t prev = 0, written = 0;
+static char output[(100/PROGRESS_QUANT) + 5];
+
+/* Server instance defined elsewhere; its aio_queue and scheduler are used. */
+extern tapdisk_server_t server;
+
+/*
+ * Bookkeeping for one block in flight.  The same object is used first for
+ * the read from the qcow image and then for the write to the raw file.
+ */
+struct request_info {
+ /* data buffer; freed by send_write_responses() once the write is done */
+ void* buf;
+ /* first sector of the block */
+ uint64_t logical_sec;
+ /* sectors of the current read or write still outstanding */
+ int pending;
+};
+
+/*
+ * Debug helper: hex-dump length bytes starting at ptr through DFPRINTF.
+ *
+ * Bytes are printed as two hex digits each, separated by a space after
+ * every second byte and by a newline after every sixteenth byte.
+ */
+static void print_bytes(void *ptr, int length)
+{
+	int k;
+	const unsigned char *p = ptr;
+
+	DFPRINTF("Buf dump, length %d:\n",length);
+	for (k = 0; k < length; k++) {
+		/* zero-padded so adjacent bytes cannot be confused */
+		DFPRINTF("%02x", p[k]);
+		/* group on the count of bytes printed, not on the index */
+		if ((k + 1) % 16 == 0)
+			DFPRINTF("\n");
+		else if ((k + 1) % 2 == 0)
+			DFPRINTF(" ");
+	}
+	DFPRINTF("\n");
+}
+
+/*
+ * Advance the textual progress bar held in output[].
+ *
+ * progress: amount of work done so far (same unit as size)
+ * size:     total amount of work
+ *
+ * The bar has 100/PROGRESS_QUANT steps; at most one step is drawn per
+ * call.  The step counter (prev) is never allowed past the bar width, so
+ * the closing ']' and the terminating NUL of output[] stay intact even
+ * when size is not a multiple of the step count.
+ */
+static void debug_output(uint64_t progress, uint64_t size)
+{
+	const uint64_t steps = 100/PROGRESS_QUANT;
+	//Output progress every PROGRESS_QUANT
+	uint64_t blocks = size/steps;
+
+	/* tiny images: avoid dividing by zero below */
+	if (blocks == 0)
+		blocks = 1;
+
+	if (prev < steps && progress/blocks > prev) {
+		memcpy(output+prev+1,"=>",2);
+		prev++;
+		DFPRINTF("\r%s %"PRIu64"%%",
+			 output, (uint64_t)((prev-1)*PROGRESS_QUANT));
+	}
+	return;
+}
+
+/*
+ * Completion callback for writes to the raw destination.
+ *
+ * treq: the completed (part of a) write; treq.cb_data is the
+ *       struct request_info of the block, treq.secs the sectors done.
+ * err:  negative on failure.
+ *
+ * Accounts the finished sectors and, once the whole block has been
+ * written, releases its buffer and bookkeeping and refreshes the progress
+ * bar.
+ *
+ * NOTE(review): on error nothing is accounted or freed, so the wait loop
+ * in main(), which waits for returned_write_events to reach
+ * submit_events, would apparently never terminate - confirm.
+ */
+static void send_write_responses(td_request_t treq, int err)
+{
+	struct request_info *info = (struct request_info*)treq.cb_data;
+
+	if (err < 0) {
+		DFPRINTF("AIO FAILURE: res [%d]!\n",err);
+		return;
+	}
+
+	returned_write_events += treq.secs;
+	written += treq.secs;
+
+	/* the block is finished only when no sector is left pending */
+	info->pending -= treq.secs;
+	if (info->pending == 0) {
+		free(info->buf);
+		free(info);
+
+		debug_output(written, ddaio->info.size);
+	}
+}
+
+/*
+ * Completion callback for reads from the qcow source.
+ *
+ * treq: the completed (part of a) read; treq.cb_data is the
+ *       struct request_info of the block, treq.secs the sectors done,
+ *       treq.private the td_vbd_request_t queued on qcow_vbd.
+ * err:  negative on failure.
+ *
+ * Once every sector of the block has been read, the read request is
+ * completed on the qcow VBD and the same buffer is queued as a write to
+ * the raw destination, with send_write_responses() as its callback.
+ * The process exits if the write request cannot be allocated.
+ */
+static void send_read_responses(td_request_t treq, int err)
+{
+	struct request_info* req;
+	td_vbd_request_t* vreq;
+
+	if (err < 0) {
+		DFPRINTF("AIO FAILURE: res [%d]!\n",err);
+		return;
+	}
+	returned_read_events+=treq.secs;
+
+	req= (struct request_info*)treq.cb_data;
+
+	//do nothing until all fragments complete.
+	req->pending-=treq.secs;
+
+	if(req->pending)
+		return;
+
+	//This read is done.
+	tapdisk_vbd_complete_vbd_request(qcow_vbd, treq.private);
+
+
+	treq.op = TD_OP_WRITE;
+	treq.buf = req->buf;
+	treq.sec = req->logical_sec;
+	treq.secs = BLOCK_PROCESSSZ>>9;
+	treq.image = tapdisk_vbd_first_image(aio_vbd);
+	treq.cb = send_write_responses;
+	treq.id = 0;
+	treq.sidx = 0;
+
+	req->pending = BLOCK_PROCESSSZ>>9;
+	treq.cb_data = req;
+
+	vreq = calloc(1, sizeof(td_vbd_request_t));
+	if (!vreq) {
+		/* same policy as main(): allocation failure is fatal */
+		DFPRINTF("Unable to alloc write request\n");
+		exit(-1);
+	}
+	treq.private = vreq;
+
+	//Put it in the VBD's queue, so we don't lose
+	//track of it.
+	vreq->submitting = 1;
+	INIT_LIST_HEAD(&vreq->next);
+	tapdisk_vbd_move_request(treq.private,
+				 &aio_vbd->pending_requests);
+
+	ddaio->ops->td_queue_write(ddaio,treq);
+	--vreq->submitting;
+
+	tapdisk_submit_all_tiocbs(&server.aio_queue);
+
+	return;
+}
+
+/*
+ * qcow2raw entry point: copy the contents of a qcow image into a raw
+ * file or block device.
+ *
+ * Usage: qcow2raw <destination> <qcow source image>
+ *
+ * Steps:
+ *   1. open the qcow image read-only through a tapdisk VBD;
+ *   2. create the destination (or, after confirmation, reuse an existing
+ *      file or block device) and make sure it is large enough;
+ *   3. open the destination as an aio VBD;
+ *   4. read the source in BLOCK_PROCESSSZ chunks; each completed read is
+ *      turned into a write by send_read_responses(), and the loop waits
+ *      for all outstanding writes before submitting the next chunk.
+ *
+ * Returns 0 on success; setup failures return the tapdisk error code or
+ * terminate with exit(-1).
+ */
+int main(int argc, const char *argv[])
+{
+	int ret = -1, fd;
+	uint64_t size;
+	uint64_t i;
+	char *buf = NULL;
+	struct stat finfo;
+	td_request_t treq;
+	td_vbd_request_t* vreq;
+	struct request_info* req;
+	int err;
+
+	if (argc != 3) {
+		fprintf(stderr, "Qcow-utils: v1.0.0\n");
+		fprintf(stderr, "usage: %s <Dest File descriptor> "
+			"<Qcow SRC IMAGE>\n",
+			argv[0]);
+		exit(-1);
+	}
+
+	/* never hand uninitialized request fields to the drivers */
+	memset(&treq, 0, sizeof(treq));
+
+	err = tapdisk_server_initialize();
+	if( err ) {
+		DPRINTF("qcow2raw Couldn't initialize server instance.\n");
+		return err;
+	}
+
+	err=tapdisk_vbd_initialize(QCOW_VBD);
+	if( err ) {
+		DPRINTF("qcow2raw Couldn't initialize qcow vbd.\n");
+		return err;
+	}
+
+	qcow_vbd = tapdisk_server_get_vbd(QCOW_VBD);
+	if (!qcow_vbd) {
+		err = -ENODEV;
+		DPRINTF("qcow2raw Couldn't create qcow vbd.\n");
+		return err;
+	}
+
+	err = tapdisk_vbd_open_vdi(qcow_vbd, argv[2], DISK_TYPE_QCOW,
+				   TAPDISK_STORAGE_TYPE_DEFAULT,
+				   TD_OPEN_RDONLY);
+	if( err ) {
+		DPRINTF("qcow2raw Couldn't open qcow file.\n");
+		return err;
+	}
+
+	ddqcow=(tapdisk_vbd_first_image(qcow_vbd))->driver;
+
+	/*Setup aio destination file*/
+	ret = stat(argv[1],&finfo);
+	if (ret == -1) {
+		/*Check errno*/
+		switch(errno) {
+		case ENOENT:
+			/*File doesn't exist, create*/
+			fd = open(argv[1],
+				  O_RDWR | O_LARGEFILE | O_CREAT, 0644);
+			if (fd < 0) {
+				DFPRINTF("ERROR creating file [%s] "
+					 "(errno %d)\n",
+					 argv[1], 0 - errno);
+				exit(-1);
+			}
+			/* info.size is in 512-byte sectors, hence the <<9 */
+			if (ftruncate(fd, (off_t)ddqcow->info.size<<9) < 0) {
+				DFPRINTF("Unable to create file "
+					 "[%s] of size %"PRIu64" (errno %d). "
+					 "Exiting...\n",
+					 argv[1],
+					 (uint64_t)ddqcow->info.size<<9,
+					 0 - errno);
+				close(fd);
+				exit(-1);
+			}
+			close(fd);
+			break;
+		case ENXIO:
+			DFPRINTF("ERROR Device [%s] does not exist\n",argv[1]);
+			exit(-1);
+		default:
+			DFPRINTF("An error occurred opening Device [%s] "
+				 "(errno %d)\n",
+				 argv[1], 0 - errno);
+			exit(-1);
+		}
+	} else {
+		fprintf(stderr, "WARNING: All existing data in "
+			"%s will be overwritten.\nDo you wish to continue? "
+			"(y or n) ",
+			argv[1]);
+		if (getchar() != 'y') {
+			DFPRINTF("Exiting...\n");
+			exit(-1);
+		}
+
+		/*TODO - Test the existing file or device for adequate space*/
+		fd = open(argv[1], O_RDWR | O_LARGEFILE);
+		if (fd < 0) {
+			DFPRINTF("ERROR: opening file [%s] (errno %d)\n",
+				 argv[1], 0 - errno);
+			exit(-1);
+		}
+
+		if (S_ISBLK(finfo.st_mode)) {
+			/* block devices cannot be resized: just check capacity */
+			if (blk_getimagesize(fd, &size) != 0) {
+				close(fd);
+				return -1;
+			}
+
+			if (size < ddqcow->info.size<<9) {
+				DFPRINTF("ERROR: Not enough space on device "
+					 "%s (%"PRIu64" bytes available, "
+					 "%"PRIu64" bytes required\n",
+					 argv[1], size,
+					 (uint64_t)ddqcow->info.size<<9);
+				close(fd);
+				exit(-1);
+			}
+		} else {
+			if (ftruncate(fd, (off_t)ddqcow->info.size<<9) < 0) {
+				DFPRINTF("Unable to create file "
+					 "[%s] of size %"PRIu64" (errno %d). "
+					 "Exiting...\n",
+					 argv[1],
+					 (uint64_t)ddqcow->info.size<<9,
+					 0 - errno);
+				close(fd);
+				exit(-1);
+			} else DFPRINTF("File [%s] truncated to length %"PRIu64" "
+					"(%"PRIu64")\n",
+					argv[1],
+					(uint64_t)ddqcow->info.size<<9,
+					(uint64_t)ddqcow->info.size);
+		}
+		close(fd);
+	}
+
+	//Now the output file should be there, reopen it as an aio VBD
+	err=tapdisk_vbd_initialize(AIO_VBD);
+	if( err ) {
+		DPRINTF("qcow2raw Couldn't initialize aio vbd.\n");
+		return err;
+	}
+
+	aio_vbd = tapdisk_server_get_vbd(AIO_VBD);
+	if (!aio_vbd) {
+		err = -ENODEV;
+		DPRINTF("qcow2raw Couldn't create aio vbd.\n");
+		return err;
+	}
+
+	err = tapdisk_vbd_open_vdi(aio_vbd, argv[1], DISK_TYPE_AIO,
+				   TAPDISK_STORAGE_TYPE_DEFAULT,
+				   0);
+	if( err ) {
+		DPRINTF("qcow2raw Couldn't open aio file.\n");
+		return err;
+	}
+
+	ddaio=(tapdisk_vbd_first_image(aio_vbd))->driver;
+
+	/*Initialise the output string*/
+	memset(output,0x20,(100/PROGRESS_QUANT)+5);
+	output[0] = '[';
+	output[(100/PROGRESS_QUANT)+2] = ']';
+	output[(100/PROGRESS_QUANT)+3] = '\0';
+	DFPRINTF("%s",output);
+
+	i = 0;
+	while (running) {
+		if (!complete) {
+			/*Read Pages from qcow image*/
+			if ( (ret = posix_memalign((void **)&buf,
+						   BLOCK_PROCESSSZ,
+						   BLOCK_PROCESSSZ))
+			     != 0) {
+				DFPRINTF("Unable to alloc memory (%d)\n",ret);
+				exit(-1);
+			}
+
+			/* bookkeeping for this block and its VBD request;
+			 * allocation failure is fatal, like for the buffer */
+			req = calloc(1, sizeof(struct request_info));
+			vreq = calloc(1, sizeof(td_vbd_request_t));
+			if (!req || !vreq) {
+				DFPRINTF("Unable to alloc request memory\n");
+				exit(-1);
+			}
+
+			/*Attempt to read 4k sized blocks*/
+			submit_events+=BLOCK_PROCESSSZ>>9;
+
+			//Set up the read request
+			treq.op = TD_OP_READ;
+			treq.buf = buf;
+			treq.sec = i;
+			treq.secs = BLOCK_PROCESSSZ>>9;
+			treq.image = tapdisk_vbd_first_image(qcow_vbd);
+			treq.cb = send_read_responses;
+			treq.id = 0;
+			treq.sidx = 0;
+
+			req->buf = buf;
+			req->logical_sec = i;
+			req->pending = BLOCK_PROCESSSZ>>9;
+			treq.cb_data = req;
+
+			treq.private = vreq;
+
+			//Put it in the VBD's queue, so we don't lose
+			//track of it.
+			vreq->submitting = 1;
+			INIT_LIST_HEAD(&vreq->next);
+			tapdisk_vbd_move_request(treq.private,
+						 &qcow_vbd->pending_requests);
+
+			ddqcow->ops->td_queue_read(ddqcow, treq);
+			--vreq->submitting;
+
+			i += BLOCK_PROCESSSZ>>9;
+
+			if (i >= ddqcow->info.size)
+				complete = 1;
+
+
+			tapdisk_submit_all_tiocbs(&server.aio_queue);
+		}
+
+
+		/* drain: wait until every submitted sector has been written */
+		while(returned_write_events != submit_events) {
+			ret = scheduler_wait_for_events(&server.scheduler);
+			if (ret < 0) {
+				DFPRINTF("server wait returned %d\n", ret);
+				sleep(2);
+			}
+		}
+		if (complete && (returned_write_events == submit_events))
+			running = 0;
+	}
+	memcpy(output+prev+1,"=",1);
+	DFPRINTF("\r%s 100%%\nTRANSFER COMPLETE\n\n", output);
+
+	ddqcow->ops->td_close(ddqcow);
+	ddaio->ops->td_close(ddaio);
+	free(ddqcow->data);
+	free(ddaio->data);
+
+	return 0;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "scheduler.h"
+#include "tapdisk-log.h"
+
+/* Debug logging through the tapdisk log at TLOG_DBG level. */
+#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
+
+/* Upper bound, in seconds, for the timeout handed to select(). */
+#define SCHEDULER_MAX_TIMEOUT 600
+/* Mask of all mode bits that refer to a file descriptor. */
+#define SCHEDULER_POLL_FD (SCHEDULER_POLL_READ_FD | \
+ SCHEDULER_POLL_WRITE_FD | \
+ SCHEDULER_POLL_EXCEPT_FD)
+
+/* NB: both macros evaluate their arguments twice - no side effects. */
+#define MIN(a, b) ((a) <= (b) ? (a) : (b))
+#define MAX(a, b) ((a) >= (b) ? (a) : (b))
+
+/* Walk s->events; safe against removal of the current entry. */
+#define scheduler_for_each_event(s, event, tmp) \
+ list_for_each_entry_safe(event, tmp, &(s)->events, next)
+
+/* One registered event, linked into scheduler_t.events. */
+typedef struct event {
+ /* bitmask of SCHEDULER_POLL_* values */
+ char mode;
+ /* identifier returned by scheduler_register_event() */
+ event_id_t id;
+
+ /* descriptor watched when an fd bit is set in mode */
+ int fd;
+ /* timeout period in seconds */
+ int timeout;
+ /* absolute time (seconds, from gettimeofday) at which it next expires */
+ int deadline;
+
+ /* callback and the opaque pointer handed back to it */
+ event_cb_t cb;
+ void *private;
+
+ struct list_head next;
+} event_t;
+
+/*
+ * Rebuild the fd sets, the highest descriptor number and the select()
+ * timeout from the list of registered events.
+ *
+ * The timeout is the time left until the earliest timeout event (0 if
+ * one is already due), capped by SCHEDULER_MAX_TIMEOUT and by
+ * s->max_timeout.
+ */
+static void
+scheduler_prepare_events(scheduler_t *s)
+{
+	struct timeval now;
+	event_t *ev, *next_ev;
+	int remaining;
+
+	FD_ZERO(&s->read_fds);
+	FD_ZERO(&s->write_fds);
+	FD_ZERO(&s->except_fds);
+
+	s->max_fd = 0;
+	s->timeout = SCHEDULER_MAX_TIMEOUT;
+
+	gettimeofday(&now, NULL);
+
+	scheduler_for_each_event(s, ev, next_ev) {
+		if (ev->mode & SCHEDULER_POLL_READ_FD)
+			FD_SET(ev->fd, &s->read_fds);
+
+		if (ev->mode & SCHEDULER_POLL_WRITE_FD)
+			FD_SET(ev->fd, &s->write_fds);
+
+		if (ev->mode & SCHEDULER_POLL_EXCEPT_FD)
+			FD_SET(ev->fd, &s->except_fds);
+
+		/* any fd-based interest raises the select() upper bound */
+		if (ev->mode & SCHEDULER_POLL_FD)
+			s->max_fd = MAX(ev->fd, s->max_fd);
+
+		if (ev->mode & SCHEDULER_POLL_TIMEOUT) {
+			remaining = ev->deadline - now.tv_sec;
+			if (remaining <= 0)
+				s->timeout = 0;
+			else
+				s->timeout = MIN(s->timeout, remaining);
+		}
+	}
+
+	s->timeout = MIN(s->timeout, s->max_timeout);
+}
+
+/*
+ * Invoke an event's callback with the mode that triggered it.
+ * Events that carry a timeout get their deadline re-armed (now + timeout)
+ * before the callback runs, whatever the triggering mode was.
+ */
+static void
+scheduler_event_callback(event_t *event, char mode)
+{
+	struct timeval tv;
+
+	if ((event->mode & SCHEDULER_POLL_TIMEOUT) != 0) {
+		gettimeofday(&tv, NULL);
+		event->deadline = tv.tv_sec + event->timeout;
+	}
+
+	(*event->cb)(event->id, mode, event->private);
+}
+
+/*
+ * Dispatch callbacks for every event that is ready after select():
+ * descriptors still set in the read/write/except sets, and timeout events
+ * whose deadline is at or before the time sampled on entry.
+ *
+ * At most one callback is run per event per pass (read, then write, then
+ * except, then timeout).  A callback may unregister events; that sets
+ * s->restart, in which case the walk starts again from the list head.
+ */
+static void
+scheduler_run_events(scheduler_t *s)
+{
+ struct timeval now;
+ event_t *event, *tmp;
+
+ gettimeofday(&now, NULL);
+
+ again:
+ s->restart = 0;
+
+ scheduler_for_each_event(s, event, tmp) {
+ if ((event->mode & SCHEDULER_POLL_READ_FD) &&
+ FD_ISSET(event->fd, &s->read_fds)) {
+ /* clear the bit first so the descriptor is not dispatched again
+ * (by this or another event on the same fd) after a restart */
+ FD_CLR(event->fd, &s->read_fds);
+ scheduler_event_callback(event, SCHEDULER_POLL_READ_FD);
+ goto next;
+ }
+
+ if ((event->mode & SCHEDULER_POLL_WRITE_FD) &&
+ FD_ISSET(event->fd, &s->write_fds)) {
+ FD_CLR(event->fd, &s->write_fds);
+ scheduler_event_callback(event, SCHEDULER_POLL_WRITE_FD);
+ goto next;
+ }
+
+ if ((event->mode & SCHEDULER_POLL_EXCEPT_FD) &&
+ FD_ISSET(event->fd, &s->except_fds)) {
+ FD_CLR(event->fd, &s->except_fds);
+ scheduler_event_callback(event, SCHEDULER_POLL_EXCEPT_FD);
+ goto next;
+ }
+
+ /* the callback wrapper re-arms the deadline, so an expired timeout
+ * does not fire again in this pass unless its period is <= 0 */
+ if ((event->mode & SCHEDULER_POLL_TIMEOUT) &&
+ (event->deadline <= now.tv_sec))
+ scheduler_event_callback(event, SCHEDULER_POLL_TIMEOUT);
+
+ next:
+ /* the event list changed under us: do not touch event/tmp again */
+ if (s->restart)
+ goto again;
+ }
+}
+
+/*
+ * Register a new event with the scheduler.
+ *
+ * s:       scheduler to add the event to
+ * mode:    bitmask of SCHEDULER_POLL_* values; at least one bit required
+ * fd:      descriptor to watch; only used when an fd bit is set in mode
+ * timeout: period in seconds; the first deadline is now + timeout
+ * cb:      callback to run when the event fires (must not be NULL)
+ * private: opaque pointer passed back to cb
+ *
+ * Returns the id of the new event, or a negative errno value:
+ *   -EINVAL  cb is NULL, mode selects nothing, or fd cannot be used with
+ *            select() (negative or >= FD_SETSIZE)
+ *   -ENOMEM  allocation failed
+ */
+int
+scheduler_register_event(scheduler_t *s, char mode, int fd,
+			 int timeout, event_cb_t cb, void *private)
+{
+	event_t *event;
+	struct timeval now;
+
+	if (!cb)
+		return -EINVAL;
+
+	if (!(mode & SCHEDULER_POLL_TIMEOUT) && !(mode & SCHEDULER_POLL_FD))
+		return -EINVAL;
+
+	/* FD_SET()/FD_ISSET() are undefined for descriptors outside
+	 * [0, FD_SETSIZE), so refuse them here rather than corrupt the
+	 * fd sets later. */
+	if ((mode & SCHEDULER_POLL_FD) && (fd < 0 || fd >= FD_SETSIZE))
+		return -EINVAL;
+
+	event = calloc(1, sizeof(event_t));
+	if (!event)
+		return -ENOMEM;
+
+	gettimeofday(&now, NULL);
+
+	INIT_LIST_HEAD(&event->next);
+
+	event->mode = mode;
+	event->fd = fd;
+	event->timeout = timeout;
+	event->deadline = now.tv_sec + timeout;
+	event->cb = cb;
+	event->private = private;
+	event->id = s->uuid++;
+
+	/* id 0 is reserved: scheduler_unregister_event() ignores it */
+	if (!s->uuid)
+		s->uuid++;
+
+	list_add_tail(&event->next, &s->events);
+
+	return event->id;
+}
+
+/*
+ * Remove and free the event with the given id, if it is registered.
+ * An id of 0 is ignored.  On removal s->restart is raised so that a
+ * dispatch pass in progress starts over on the modified list.
+ */
+void
+scheduler_unregister_event(scheduler_t *s, event_id_t id)
+{
+	event_t *cur, *following;
+
+	if (id == 0)
+		return;
+
+	scheduler_for_each_event(s, cur, following) {
+		if (cur->id != id)
+			continue;
+
+		list_del(&cur->next);
+		free(cur);
+		s->restart = 1;
+		return;
+	}
+}
+
+/*
+ * Lower the cap on the next select() timeout to the given number of
+ * seconds.  Negative values are ignored and the cap is never raised.
+ */
+void
+scheduler_set_max_timeout(scheduler_t *s, int timeout)
+{
+	if (timeout < 0)
+		return;
+
+	if (timeout < s->max_timeout)
+		s->max_timeout = timeout;
+}
+
+/*
+ * Block in select() until a registered descriptor is ready or the
+ * computed timeout expires, then dispatch the ready events.
+ *
+ * The restart flag and both timeout caps are reset after every select()
+ * call.  Returns the select() result; when that is negative no callbacks
+ * are run.
+ */
+int
+scheduler_wait_for_events(scheduler_t *s)
+{
+	struct timeval wait;
+	int nready;
+
+	scheduler_prepare_events(s);
+
+	wait.tv_usec = 0;
+	wait.tv_sec = s->timeout;
+
+	DBG("timeout: %d, max_timeout: %d\n",
+	    s->timeout, s->max_timeout);
+
+	nready = select(s->max_fd + 1, &s->read_fds,
+			&s->write_fds, &s->except_fds, &wait);
+
+	s->restart = 0;
+	s->timeout = SCHEDULER_MAX_TIMEOUT;
+	s->max_timeout = SCHEDULER_MAX_TIMEOUT;
+
+	if (nready >= 0)
+		scheduler_run_events(s);
+
+	return nready;
+}
+
+/*
+ * Reset a scheduler to its initial state: everything zeroed, empty fd
+ * sets, an empty event list, and event ids starting at 1.
+ *
+ * NOTE(review): max_timeout is left at 0 here, so the first call to
+ * scheduler_wait_for_events() appears to poll without blocking before
+ * the caps are reset to SCHEDULER_MAX_TIMEOUT - confirm this is intended.
+ */
+void
+scheduler_initialize(scheduler_t *s)
+{
+	memset(s, 0, sizeof(*s));
+
+	INIT_LIST_HEAD(&s->events);
+
+	FD_ZERO(&s->read_fds);
+	FD_ZERO(&s->write_fds);
+	FD_ZERO(&s->except_fds);
+
+	s->uuid = 1;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _SCHEDULER_H_
+#define _SCHEDULER_H_
+
+#include <sys/select.h>
+
+#include "list.h"
+
+/* Event mode bits; may be OR-ed together when registering an event. */
+#define SCHEDULER_POLL_READ_FD 0x1
+#define SCHEDULER_POLL_WRITE_FD 0x2
+#define SCHEDULER_POLL_EXCEPT_FD 0x4
+#define SCHEDULER_POLL_TIMEOUT 0x8
+
+/* Handle of a registered event; 0 is never handed out. */
+typedef int event_id_t;
+/* Event callback: receives the event id, the single mode bit that fired,
+ * and the private pointer given at registration. */
+typedef void (*event_cb_t) (event_id_t id, char mode, void *private);
+
+/* State of one select()-based event scheduler. */
+typedef struct scheduler {
+ /* descriptor sets passed to select() and consulted during dispatch */
+ fd_set read_fds;
+ fd_set write_fds;
+ fd_set except_fds;
+
+ /* list of registered events */
+ struct list_head events;
+
+ /* id that the next registered event will receive */
+ int uuid;
+ /* highest descriptor among the registered events */
+ int max_fd;
+ /* timeout, in seconds, for the next select() call */
+ int timeout;
+ /* set when an event is unregistered, to restart a dispatch pass */
+ int restart;
+ /* cap on timeout; see scheduler_set_max_timeout() */
+ int max_timeout;
+} scheduler_t;
+
+/* Reset a scheduler: no events, ids start at 1. */
+void scheduler_initialize(scheduler_t *);
+/* Register an event; returns its id or a negative errno value
+ * (-EINVAL for a NULL callback or empty mode, -ENOMEM). */
+event_id_t scheduler_register_event(scheduler_t *, char mode,
+ int fd, int timeout,
+ event_cb_t cb, void *private);
+/* Remove and free the event with the given id; id 0 is ignored. */
+void scheduler_unregister_event(scheduler_t *, event_id_t);
+/* Lower the cap (seconds) on the next wait; negative values are ignored. */
+void scheduler_set_max_timeout(scheduler_t *, int);
+/* Wait in select() and dispatch ready events; returns select()'s result. */
+int scheduler_wait_for_events(scheduler_t *);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* client harness for tapdisk log */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "log.h"
+
+/* Debug message: printed on stderr with a "log: " prefix and a newline. */
+#define BDPRINTF(_f, _a...) fprintf (stderr, "log: " _f "\n", ## _a)
+
+/* Warning/error message: currently formatted exactly like BDPRINTF. */
+#define BWPRINTF(_f, _a...) fprintf (stderr, "log: " _f "\n", ## _a)
+
+/* Client-side view of the write log shared with the log server. */
+struct writelog {
+ /* name and size of the shared memory object, as reported by the server;
+ * shm is the address it is mapped at */
+ char* shmpath;
+ uint32_t shmsize;
+ void* shm;
+
+ /* next unprocessed item in the writelog */
+ void* cur;
+ /* requests placed on the ring (incremented as requests are enqueued) */
+ unsigned int inflight;
+
+ /* pointer to start and end of free data space for requests */
+ void* dhd;
+ void* dtl;
+
+ /* shared ring inside shm and the front-end state initialised on it */
+ log_sring_t* sring;
+ log_front_ring_t fring;
+};
+
+/* bytes free on the data ring */
+/*
+ * Number of bytes free on the data ring.  One byte is always held back
+ * so that an empty ring (dhd == dtl) can be told apart from a full one.
+ */
+static inline unsigned int dring_avail(struct writelog* wl)
+{
+	/* head behind tail: the free space is the gap between them */
+	if (wl->dhd < wl->dtl)
+		return wl->dtl - wl->dhd - 1;
+
+	/* head ahead of tail: free space wraps around the end of the ring */
+	if (wl->dhd > wl->dtl)
+		return (sdataend(wl->shm) - wl->dhd) + (wl->dtl - sdatastart(wl->shm)) - 1;
+
+	/* head == tail: the whole data area is free */
+	return sdataend(wl->shm) - sdatastart(wl->shm) - 1;
+}
+
+/* advance ring pointer by len bytes */
+/*
+ * Advance a data ring pointer by len bytes, wrapping around the end of
+ * the data area.
+ *
+ * NOTE(review): a result exactly equal to sdataend() is returned
+ * unwrapped (the test is '>' rather than '>='); confirm this matches the
+ * convention used by the other end of the ring.
+ */
+static inline void* dring_advance(struct writelog* wl, void* start, size_t len)
+{
+	int dsz = sdataend(wl->shm) - sdatastart(wl->shm);
+	void* next = start + (len % dsz);
+
+	return (next > sdataend(wl->shm)) ? next - dsz : next;
+}
+
+/* Print the command line synopsis on stderr. */
+static void usage(void)
+{
+	fputs("usage: tapdisk-client <sock>\n", stderr);
+}
+
+/* returns socket file descriptor */
+static int tdctl_open(const char* sockpath)
+{
+ struct sockaddr_un saddr;
+ int fd;
+
+ if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+ BWPRINTF("error creating socket: %s", strerror(errno));
+ return -1;
+ }
+
+ memset(&saddr, 0, sizeof(saddr));
+ saddr.sun_family = AF_UNIX;
+ memcpy(saddr.sun_path, sockpath, strlen(sockpath));
+
+ if (connect(fd, (struct sockaddr *)&saddr, sizeof(saddr)) < 0) {
+ BWPRINTF("error connecting to socket %s: %s", sockpath, strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ return fd;
+}
+
+/*
+ * Send one control message and, if requested, read the fixed-size reply.
+ *
+ * fd:     connected control socket
+ * msg:    request to send (sizeof(*msg) bytes)
+ * rsp:    buffer for the reply; may be NULL when rsplen is 0
+ * rsplen: number of reply bytes expected; 0 (or less) means no reply
+ *
+ * Partial transfers on the stream socket are continued and EINTR is
+ * retried; only a real error or end-of-stream before the full message
+ * has been transferred counts as failure.
+ *
+ * Returns 0 on success, -1 on error (a message has been printed).
+ */
+static int ctl_talk(int fd, struct log_ctlmsg* msg, char* rsp, int rsplen)
+{
+	const char *out = (const char *)msg;
+	size_t done;
+	ssize_t rc;
+
+	for (done = 0; done < sizeof(*msg); done += (size_t)rc) {
+		rc = write(fd, out + done, sizeof(*msg) - done);
+		if (rc < 0) {
+			if (errno == EINTR) {
+				rc = 0;
+				continue;
+			}
+			BWPRINTF("error sending ctl request: %s", strerror(errno));
+			return -1;
+		}
+		if (rc == 0) {
+			BWPRINTF("short ctl write (%zu/%zu bytes)", done, sizeof(*msg));
+			return -1;
+		}
+	}
+
+	if (rsplen <= 0)
+		return 0;
+
+	for (done = 0; done < (size_t)rsplen; done += (size_t)rc) {
+		rc = read(fd, rsp + done, (size_t)rsplen - done);
+		if (rc < 0) {
+			if (errno == EINTR) {
+				rc = 0;
+				continue;
+			}
+			BWPRINTF("error reading ctl response: %s", strerror(errno));
+			return -1;
+		}
+		if (rc == 0) {
+			/* peer closed the connection before the full reply */
+			BWPRINTF("short ctl read (%zu/%d bytes)", done, rsplen);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Ask the server for its shared memory parameters (LOGCMD_SHMP) and
+ * store them in wl.
+ *
+ * The reply starts with the size (copied into wl->shmsize) followed by
+ * the path; wl->shmpath receives a heap copy of the path, owned by wl.
+ *
+ * Returns 0 on success, -1 on error (a message has been printed).
+ */
+static int ctl_get_shmem(int fd, struct writelog* wl)
+{
+	struct log_ctlmsg req;
+	/* one spare, zeroed byte keeps the path NUL-terminated */
+	char rsp[CTLRSPLEN_SHMP + 1];
+
+	memset(&req, 0, sizeof(req));
+	memset(rsp, 0, sizeof(rsp));
+
+	memcpy(req.msg, LOGCMD_SHMP, 4);
+	if (ctl_talk(fd, &req, rsp, CTLRSPLEN_SHMP) < 0) {
+		BWPRINTF("error getting shared memory parameters");
+		return -1;
+	}
+
+	memcpy(&wl->shmsize, rsp, sizeof(wl->shmsize));
+	wl->shmpath = strdup(rsp + sizeof(wl->shmsize));
+	if (!wl->shmpath) {
+		BWPRINTF("error copying shared memory path: %s", strerror(errno));
+		return -1;
+	}
+
+	BDPRINTF("shared memory parameters: size: %u, path: %s",
+		 wl->shmsize, wl->shmpath);
+
+	return 0;
+}
+
+/*
+ * Prepare a control message: clear it and copy the first four bytes of
+ * cmd into its msg field.
+ */
+static void ctlmsg_init(struct log_ctlmsg* msg, const char* cmd)
+{
+	memset(msg, 0, sizeof(struct log_ctlmsg));
+	memcpy(&msg->msg[0], cmd, 4);
+}
+
+/*
+ * Send LOGCMD_GET and read its CTLRSPLEN_GET-byte reply, which is
+ * discarded.  Returns 0 on success, -1 on error.
+ */
+static int ctl_get_writes(int fd)
+{
+	char reply[CTLRSPLEN_GET];
+	struct log_ctlmsg request;
+
+	ctlmsg_init(&request, LOGCMD_GET);
+
+	if (ctl_talk(fd, &request, reply, CTLRSPLEN_GET) >= 0)
+		return 0;
+
+	BWPRINTF("error getting writes");
+	return -1;
+}
+
+/*
+ * Send LOGCMD_PEEK and read its CTLRSPLEN_PEEK-byte reply, which is
+ * discarded.  Returns 0 on success, -1 on error.
+ */
+static int ctl_peek_writes(int fd)
+{
+	char reply[CTLRSPLEN_PEEK];
+	struct log_ctlmsg request;
+
+	ctlmsg_init(&request, LOGCMD_PEEK);
+
+	if (ctl_talk(fd, &request, reply, CTLRSPLEN_PEEK) >= 0)
+		return 0;
+
+	BWPRINTF("error peeking writes");
+	return -1;
+}
+
+/*
+ * Submit pending requests: send LOGCMD_KICK to the server.  No reply is
+ * read.  Returns 0 on success, -1 on error.
+ */
+static int ctl_kick(int fd)
+{
+	struct log_ctlmsg request;
+
+	ctlmsg_init(&request, LOGCMD_KICK);
+
+	if (ctl_talk(fd, &request, NULL, 0) >= 0)
+		return 0;
+
+	BWPRINTF("error kicking ring");
+	return -1;
+}
+
+/*
+ * Send LOGCMD_CLEAR and read its CTLRSPLEN_CLEAR-byte reply, which is
+ * discarded.  Returns 0 on success, -1 on error.
+ */
+static int ctl_clear_writes(int fd)
+{
+	char reply[CTLRSPLEN_CLEAR];
+	struct log_ctlmsg request;
+
+	ctlmsg_init(&request, LOGCMD_CLEAR);
+
+	if (ctl_talk(fd, &request, reply, CTLRSPLEN_CLEAR) >= 0)
+		return 0;
+
+	BWPRINTF("error clearing writes");
+	return -1;
+}
+
+/*
+ * Map the shared-memory region named by wl->shmpath (wl->shmsize bytes) and
+ * initialise the client-side view of it: the bitmap cursor, the data-area
+ * head/tail pointers and the front half of the shared request ring.
+ *
+ * Returns 0 on success. On failure returns -1 with wl->shm set to NULL (so it
+ * is never mistaken for a live mapping) and errno describing the failure.
+ */
+static int writelog_map(struct writelog* wl)
+{
+  int fd;
+  int saved_errno;
+
+  if ((fd = shm_open(wl->shmpath, O_RDWR, 0750)) < 0) {
+    BWPRINTF("could not open shared memory at %s: %s", wl->shmpath,
+             strerror(errno));
+    return -1;
+  }
+
+  wl->shm = mmap(NULL, wl->shmsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+  /* close() may overwrite errno, so capture mmap's value first */
+  saved_errno = errno;
+  close(fd);
+  if (wl->shm == MAP_FAILED) {
+    wl->shm = NULL;
+    BWPRINTF("could not mmap write log shm: %s", strerror(saved_errno));
+    errno = saved_errno;
+    return -1;
+  }
+  wl->cur = wl->shm;
+  wl->inflight = 0;
+  wl->dhd = wl->dtl = sdatastart(wl->shm);
+
+  BDPRINTF("shm cookie: 0x%x, data size: %u", *((uint32_t*)wl->shm),
+           dring_avail(wl));
+
+  wl->sring = sringstart(wl->shm);
+  /* need some thought about what to do on reconnect */
+  FRONT_RING_INIT(&wl->fring, wl->sring, SRINGSIZE);
+
+  return 0;
+}
+
+/*
+ * Log every dirty extent at the start of the shared region, stopping at the
+ * first entry whose count is zero or at bmend(). Always returns 0.
+ */
+static int writelog_dump(struct writelog* wl)
+{
+  struct disk_range* ext = wl->shm;
+
+  while ((void*)ext < bmend(wl->shm) && ext->count) {
+    BDPRINTF("dirty extent: %"PRIu64":%u",
+             ext->sector, ext->count);
+    ext++;
+  }
+
+  return 0;
+}
+
+/* Walk the dirty map from wl->cur and enqueue one read request per extent.
+ *
+ * The walk stops at the first zero-count entry, at the end of the bitmap, or
+ * when the ring has no free slot. wl->cur is left at the first entry that was
+ * not enqueued, so a later call resumes from there.
+ *
+ * returns: 0 when entire bitmap has been enqueued,
+ *          1 when the ring is full
+ *         -1 on error (not currently produced by this function)
+ */
+static int writelog_enqueue_requests(struct writelog* wl)
+{
+  struct disk_range* range;
+  log_request_t* req;
+
+  for (range = wl->cur; (void*)range < bmend(wl->shm); range++) {
+    if (!range->count)
+      break;
+
+    if (RING_FULL(&wl->fring))
+      break;
+
+    /* insert range into request stream */
+    /* 1. get next request slot from ring */
+    /* 2. ensure enough shm space is available */
+
+    BDPRINTF("enqueueing dirty extent: %"PRIu64":%u (ring space: %d/%d)",
+             range->sector, range->count, RING_FREE_REQUESTS(&wl->fring),
+             RING_SIZE(&wl->fring));
+
+    req = RING_GET_REQUEST(&wl->fring, wl->fring.req_prod_pvt);
+
+    req->sector = range->sector;
+    req->count = range->count;
+    /* data-area offset is not assigned yet; every request uses 0 */
+    req->offset = 0;
+
+    wl->fring.req_prod_pvt++;
+    wl->inflight++;
+  }
+
+  wl->cur = range;
+
+  /* The loop may have run off the end of the bitmap; only inspect the entry
+   * when it is still inside it, otherwise we would read unrelated memory. */
+  if ((void*)range < bmend(wl->shm) && range->count)
+    return 1;
+
+  return 0;
+}
+
+/*
+ * Consume every response between our consumer index and the producer index
+ * published in the shared ring, logging each one and decrementing the
+ * in-flight counter. Always returns 0.
+ */
+static int writelog_dequeue_responses(struct writelog* wl)
+{
+  log_response_t rsp;
+  RING_IDX cons, prod;
+
+  cons = wl->fring.rsp_cons;
+  prod = wl->sring->rsp_prod;
+
+  BDPRINTF("ring kicked (start = %u, end = %u)", cons, prod);
+
+  for (; cons != prod; wl->inflight--) {
+    /* copy the slot out of shared memory before looking at it */
+    memcpy(&rsp, RING_GET_RESPONSE(&wl->fring, cons), sizeof(rsp));
+    BDPRINTF("ctl: read response %"PRIu64":%u", rsp.sector, rsp.count);
+    cons++;
+    wl->fring.rsp_cons = cons;
+  }
+
+  return 0;
+}
+
+/*
+ * Release what a writelog owns: the duplicated shm path and, when present,
+ * the shared-memory mapping. Both pointers are reset to NULL. Returns 0.
+ */
+static int writelog_free(struct writelog* wl)
+{
+  /* free(NULL) is a no-op, so no guard is needed for the path */
+  free(wl->shmpath);
+  wl->shmpath = NULL;
+
+  if (wl->shm != NULL) {
+    munmap(wl->shm, wl->shmsize);
+    wl->shm = NULL;
+  }
+
+  return 0;
+}
+
+/*
+ * Fetch the dirty map from the server, with LOGCMD_PEEK when peek is non-zero
+ * and LOGCMD_GET otherwise, then rewind the bitmap cursor to the start of the
+ * shared region. Returns 0 on success or the negative control-call result.
+ */
+int get_writes(struct writelog* wl, int fd, int peek)
+{
+  int rc = peek ? ctl_peek_writes(fd) : ctl_get_writes(fd);
+
+  if (rc >= 0) {
+    wl->cur = wl->shm;
+    rc = 0;
+  }
+
+  return rc;
+}
+
+/*
+ * Block on the control socket until a full control message arrives, check
+ * that it is a LOGCMD_KICK, then drain the response ring.
+ * Returns 0 on success, -1 on read error, EOF, short read, an unexpected
+ * message or a dequeue failure.
+ */
+int await_responses(struct writelog* wl, int fd)
+{
+  struct log_ctlmsg msg;
+  int nread;
+
+  /* sit on socket waiting for kick */
+  nread = read(fd, &msg, sizeof(msg));
+  if (nread < 0) {
+    BWPRINTF("error reading from control socket: %s", strerror(errno));
+    return -1;
+  }
+  if (nread == 0) {
+    BWPRINTF("EOF on control socket");
+    return -1;
+  }
+  if (nread < sizeof(msg)) {
+    BWPRINTF("short reply (%d/%d bytes)", nread, (int) sizeof(msg));
+    return -1;
+  }
+
+  if (strncmp(msg.msg, LOGCMD_KICK, 4) != 0) {
+    BWPRINTF("Unknown message received: %.4s", msg.msg);
+    return -1;
+  }
+
+  return (writelog_dequeue_responses(wl) < 0) ? -1 : 0;
+}
+
+/* read_loop:
+ *  1. peek at the dirty bitmap and dump it
+ *  2. feed as much of it as possible onto the ring
+ *  3. push the new requests (if any were queued) and kick the server
+ *  4. while requests are in flight, wait for a kick and collect responses
+ *  5. repeat from 2 until the enqueue step reports the bitmap is exhausted
+ *
+ * Returns the last enqueue result (0 once everything was queued), or -1 on
+ * any control-channel failure.
+ */
+int read_loop(struct writelog* wl, int fd)
+{
+  int more;
+
+  if (get_writes(wl, fd, 1) < 0)
+    return -1;
+  writelog_dump(wl);
+
+  for (;;) {
+    more = writelog_enqueue_requests(wl);
+
+    if (RING_FREE_REQUESTS(&wl->fring) < RING_SIZE(&wl->fring)) {
+      RING_PUSH_REQUESTS(&wl->fring);
+    }
+    if (ctl_kick(fd) < 0)
+      return -1;
+
+    /* collect responses */
+    if (wl->inflight && await_responses(wl, fd) < 0)
+      return -1;
+
+    if (more <= 0)
+      break;
+  }
+
+  return more;
+}
+
+/*
+ * Entry point: argv[1] is the control socket to connect to, argv[2] is an
+ * optional one-letter command (default 'p'):
+ *   p  peek at the dirty map and dump it
+ *   g  get the dirty map and dump it
+ *   c  clear the recorded writes
+ *   r  run the read loop over the dirty map
+ * Returns 0 on success, 1 on a usage error or any failure.
+ */
+int main(int argc, char* argv[])
+{
+  int fd;
+  int rc = 1;
+  struct writelog wl;
+  char cmd;
+
+  if (argc < 2) {
+    usage();
+    return 1;
+  }
+
+  cmd = (argc < 3) ? 'p' : argv[2][0];
+
+  /* start from a known state so the cleanup below never sees garbage */
+  memset(&wl, 0, sizeof(wl));
+
+  fd = tdctl_open(argv[1]);
+  if (fd < 0) {
+    BWPRINTF("could not open control socket %s", argv[1]);
+    return 1;
+  }
+
+  if (ctl_get_shmem(fd, &wl) < 0)
+    goto out;
+
+  if (writelog_map(&wl) < 0) {
+    BWPRINTF("Error mapping write log: %s", strerror(errno));
+    /* make sure a failed mapping is never passed to munmap() */
+    wl.shm = NULL;
+    goto out;
+  }
+
+  switch (cmd) {
+  case 'p':
+    if (get_writes(&wl, fd, 1) < 0)
+      goto out;
+    writelog_dump(&wl);
+    break;
+  case 'c':
+    if (ctl_clear_writes(fd) < 0)
+      goto out;
+    break;
+  case 'g':
+    if (get_writes(&wl, fd, 0) < 0)
+      goto out;
+    writelog_dump(&wl);
+    break;
+  case 'r':
+    if (read_loop(&wl, fd) < 0)
+      goto out;
+    break;
+  default:
+    usage();
+    goto out;
+  }
+
+  rc = 0;
+
+out:
+  writelog_free(&wl);
+  close(fd);
+
+  return rc;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/un.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+
+#include "list.h"
+#include "tapdisk.h"
+#include "blktap2.h"
+#include "blktaplib.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-server.h"
+#include "tapdisk-message.h"
+#include "tapdisk-disktype.h"
+
+/* State of this process's listening control socket (single instance: td_control). */
+struct tapdisk_control {
+ char *path;	/* heap-allocated socket path; unlinked and freed by tapdisk_control_close() */
+ int socket;	/* listening socket descriptor, -1 when not open */
+ int event_id;	/* id returned when the accept handler was registered, -1 initially */
+};
+
+/* One accepted client connection; allocated in the accept path and freed on close. */
+struct tapdisk_control_connection {
+ int socket;		/* connected descriptor returned by accept() */
+ event_id_t event_id;	/* id of the read event registered for this connection */
+};
+
+/* The one control-socket state for this process. */
+static struct tapdisk_control td_control;
+
+/*
+ * Put the control state into its "nothing open" condition and make the
+ * process ignore SIGPIPE.
+ */
+static void
+tapdisk_control_initialize(void)
+{
+    signal(SIGPIPE, SIG_IGN);
+
+    td_control.event_id = -1;
+    td_control.socket = -1;
+}
+
+/*
+ * Tear down the control socket: remove and free the socket path if one was
+ * recorded, and close the listening descriptor if it is open.
+ */
+void
+tapdisk_control_close(void)
+{
+    char *path = td_control.path;
+    int sock = td_control.socket;
+
+    if (path != NULL) {
+        unlink(path);
+        free(path);
+        td_control.path = NULL;
+    }
+
+    if (sock != -1) {
+        close(sock);
+        td_control.socket = -1;
+    }
+}
+
+/*
+ * Allocate a zeroed connection record for the accepted descriptor fd.
+ * Returns the new record, or NULL (after logging) when allocation fails;
+ * fd is left open in that case.
+ */
+static struct tapdisk_control_connection *
+tapdisk_control_allocate_connection(int fd)
+{
+    struct tapdisk_control_connection *connection;
+
+    connection = calloc(1, sizeof(*connection));
+    if (!connection) {
+        EPRINTF("calloc");
+        return NULL;
+    }
+
+    connection->socket = fd;
+    return connection;
+}
+
+/*
+ * Finish with a client connection: unregister its event, close its socket
+ * and free the record. The pointer must not be used afterwards.
+ */
+static void
+tapdisk_control_close_connection(struct tapdisk_control_connection *connection)
+{
+    int sock = connection->socket;
+
+    tapdisk_server_unregister_event(connection->event_id);
+    close(sock);
+    free(connection);
+}
+
+/*
+ * Read exactly one tapdisk_message_t from fd into *message.
+ *
+ * timeout is in seconds and is passed to select(); 0 means wait forever.
+ * The message is zeroed first. The read may arrive in several pieces, so the
+ * loop keeps reading until the whole structure has been received.
+ *
+ * Returns 0 on success, -EIO if select() fails, times out, or the peer
+ * closes/errs before a full message has arrived.
+ */
+static int
+tapdisk_control_read_message(int fd, tapdisk_message_t *message, int timeout)
+{
+    fd_set readfds;
+    int ret, len, offset;
+    struct timeval tv, *t;
+    /* byte view of the message: "message + offset" would advance by whole
+     * structures and write far outside the buffer after a partial read */
+    char *buf = (char *)message;
+
+    t = NULL;
+    offset = 0;
+    len = sizeof(tapdisk_message_t);
+
+    if (timeout) {
+        tv.tv_sec = timeout;
+        tv.tv_usec = 0;
+        t = &tv;
+    }
+
+    memset(message, 0, sizeof(tapdisk_message_t));
+
+    while (offset < len) {
+        FD_ZERO(&readfds);
+        FD_SET(fd, &readfds);
+
+        ret = select(fd + 1, &readfds, NULL, NULL, t);
+        if (ret == -1)
+            break;
+        else if (FD_ISSET(fd, &readfds)) {
+            ret = read(fd, buf + offset, len - offset);
+            if (ret <= 0)
+                break;
+            offset += ret;
+        } else
+            break; /* timed out */
+    }
+
+    if (offset != len) {
+        EPRINTF("failure reading message (wanted %d but got %d)\n",
+                len, offset);
+        return -EIO;
+    }
+
+    DPRINTF("received '%s' message (uuid = %u)\n",
+            tapdisk_message_name(message->type), message->cookie);
+
+    return 0;
+}
+
+/*
+ * Write exactly one tapdisk_message_t from *message to fd.
+ *
+ * timeout is in seconds and is passed to select(); 0 means wait forever.
+ * The write may complete in several pieces, so the loop keeps writing until
+ * the whole structure has been sent.
+ *
+ * Returns 0 on success, -EIO if select() fails, times out, or write() fails
+ * before the full message has been sent.
+ */
+static int
+tapdisk_control_write_message(int fd, tapdisk_message_t *message, int timeout)
+{
+    fd_set writefds;
+    int ret, len, offset;
+    struct timeval tv, *t;
+    /* byte view of the message: "message + offset" would advance by whole
+     * structures and send memory beyond the message after a partial write */
+    const char *buf = (const char *)message;
+
+    t = NULL;
+    offset = 0;
+    len = sizeof(tapdisk_message_t);
+
+    if (timeout) {
+        tv.tv_sec = timeout;
+        tv.tv_usec = 0;
+        t = &tv;
+    }
+
+    DPRINTF("sending '%s' message (uuid = %u)\n",
+            tapdisk_message_name(message->type), message->cookie);
+
+    while (offset < len) {
+        FD_ZERO(&writefds);
+        FD_SET(fd, &writefds);
+
+        /* we don't bother reinitializing tv. at worst, it will wait a
+         * bit more time than expected. */
+
+        ret = select(fd + 1, NULL, &writefds, NULL, t);
+        if (ret == -1)
+            break;
+        else if (FD_ISSET(fd, &writefds)) {
+            ret = write(fd, buf + offset, len - offset);
+            if (ret <= 0)
+                break;
+            offset += ret;
+        } else
+            break; /* timed out */
+    }
+
+    if (offset != len) {
+        EPRINTF("failure writing message\n");
+        return -EIO;
+    }
+
+    return 0;
+}
+
+/*
+ * Check that the path carried in the request is NUL-terminated within
+ * TAPDISK_MESSAGE_MAX_PATH_LENGTH bytes.
+ * Returns 0 when it is, EINVAL (positive) when it is not.
+ */
+static int
+tapdisk_control_validate_request(tapdisk_message_t *request)
+{
+    size_t used = strnlen(request->u.params.path,
+                          TAPDISK_MESSAGE_MAX_PATH_LENGTH);
+
+    return (used < TAPDISK_MESSAGE_MAX_PATH_LENGTH) ? 0 : EINVAL;
+}
+
+/*
+ * Reply with the minor numbers of all VBDs known to the server, then close
+ * the connection. If the number of minors stored reaches
+ * TAPDISK_MESSAGE_MAX_MINORS the reply is turned into an ERANGE error.
+ */
+static void
+tapdisk_control_list_minors(struct tapdisk_control_connection *connection,
+                            tapdisk_message_t *request)
+{
+    tapdisk_message_t response;
+    struct list_head *vbds;
+    td_vbd_t *cur;
+    int n = 0;
+
+    memset(&response, 0, sizeof(response));
+    response.cookie = request->cookie;
+    response.type = TAPDISK_MESSAGE_LIST_MINORS_RSP;
+
+    vbds = tapdisk_server_get_all_vbds();
+
+    list_for_each_entry(cur, vbds, next) {
+        response.u.minors.list[n] = cur->minor;
+        n++;
+        if (n < TAPDISK_MESSAGE_MAX_MINORS)
+            continue;
+
+        /* list is full: report an error instead of a truncated list */
+        response.type = TAPDISK_MESSAGE_ERROR;
+        response.u.response.error = ERANGE;
+        break;
+    }
+
+    response.u.minors.count = n;
+    tapdisk_control_write_message(connection->socket, &response, 2);
+    tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Stream one LIST_RSP message per VBD to the client, followed by a
+ * terminating message, then close the connection.
+ *
+ * Each per-VBD message carries the number of entries still to come
+ * (counting itself), the VBD's minor and state, and "<type>:<name>" of the
+ * first image in the VBD's image list (empty when there is none). The final
+ * message has count 0, minor -1 and an empty path.
+ */
+static void
+tapdisk_control_list(struct tapdisk_control_connection *connection,
+                     tapdisk_message_t *request)
+{
+    td_vbd_t *vbd;
+    struct list_head *head;
+    tapdisk_message_t response;
+    int count;
+
+    memset(&response, 0, sizeof(response));
+    response.type = TAPDISK_MESSAGE_LIST_RSP;
+    response.cookie = request->cookie;
+
+    head = tapdisk_server_get_all_vbds();
+
+    /* first pass: count the entries so each reply can say how many remain */
+    count = 0;
+    list_for_each_entry(vbd, head, next)
+        count++;
+
+    list_for_each_entry(vbd, head, next) {
+        response.u.list.count = count--;
+        response.u.list.minor = vbd->minor;
+        response.u.list.state = vbd->state;
+        response.u.list.path[0] = 0;
+
+        if (!list_empty(&vbd->images)) {
+            td_image_t *image = list_entry(vbd->images.next,
+                                           td_image_t, next);
+            snprintf(response.u.list.path,
+                     sizeof(response.u.list.path),
+                     "%s:%s",
+                     tapdisk_disk_types[image->type]->name,
+                     image->name);
+        }
+
+        tapdisk_control_write_message(connection->socket, &response, 2);
+    }
+
+    /* terminator */
+    response.u.list.count = count;
+    response.u.list.minor = -1;
+    response.u.list.path[0] = 0;
+
+    tapdisk_control_write_message(connection->socket, &response, 2);
+    tapdisk_control_close_connection(connection);
+}
+
+/* Reply with this process's pid, echoing the request cookie, then close. */
+static void
+tapdisk_control_get_pid(struct tapdisk_control_connection *connection,
+                        tapdisk_message_t *request)
+{
+    tapdisk_message_t reply;
+
+    memset(&reply, 0, sizeof(reply));
+    reply.cookie = request->cookie;
+    reply.type = TAPDISK_MESSAGE_PID_RSP;
+    reply.u.tapdisk_pid = getpid();
+
+    tapdisk_control_write_message(connection->socket, &reply, 2);
+    tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Handle an ATTACH request: create a VBD for the minor given in the request
+ * cookie, attach it to its ring device (BLKTAP2_RING_DEVICE followed by the
+ * minor) and add it to the server.
+ *
+ * The reply is always an ATTACH_RSP whose error field is 0 on success or a
+ * positive errno value: EEXIST if a VBD with that cookie already exists,
+ * EINVAL for a negative minor, ENOMEM on allocation failure, or whatever
+ * tapdisk_vbd_attach() reported. The connection is closed afterwards.
+ */
+static void
+tapdisk_control_attach_vbd(struct tapdisk_control_connection *connection,
+                           tapdisk_message_t *request)
+{
+    tapdisk_message_t response;
+    char *devname;
+    td_vbd_t *vbd;
+    int minor, err;
+
+    /*
+     * TODO: check for max vbds per process
+     */
+
+    vbd = tapdisk_server_get_vbd(request->cookie);
+    if (vbd) {
+        err = -EEXIST;
+        goto out;
+    }
+
+    minor = request->cookie;
+    if (minor < 0) {
+        err = -EINVAL;
+        goto out;
+    }
+
+    vbd = tapdisk_vbd_create(minor);
+    if (!vbd) {
+        err = -ENOMEM;
+        goto out;
+    }
+
+    err = asprintf(&devname, BLKTAP2_RING_DEVICE"%d", minor);
+    if (err == -1) {
+        err = -ENOMEM;
+        goto fail_vbd;
+    }
+
+    err = tapdisk_vbd_attach(vbd, devname, minor);
+    free(devname);
+    if (err)
+        goto fail_vbd;
+
+    tapdisk_server_add_vbd(vbd);
+    goto out;
+
+fail_vbd:
+    tapdisk_vbd_detach(vbd);
+    free(vbd);
+
+out:
+    memset(&response, 0, sizeof(response));
+    response.type = TAPDISK_MESSAGE_ATTACH_RSP;
+    response.cookie = request->cookie;
+    response.u.response.error = -err;
+
+    tapdisk_control_write_message(connection->socket, &response, 2);
+    tapdisk_control_close_connection(connection);
+}
+
+
+/*
+ * Handle a DETACH request: detach the VBD named by the request cookie and,
+ * if it has no images left, remove it from the server and free it.
+ * Replies with DETACH_RSP (error EINVAL when the VBD is unknown, else 0)
+ * and closes the connection.
+ */
+static void
+tapdisk_control_detach_vbd(struct tapdisk_control_connection *connection,
+                           tapdisk_message_t *request)
+{
+    tapdisk_message_t reply;
+    td_vbd_t *target;
+    int status = -EINVAL;
+
+    target = tapdisk_server_get_vbd(request->cookie);
+    if (target) {
+        tapdisk_vbd_detach(target);
+
+        if (list_empty(&target->images)) {
+            tapdisk_server_remove_vbd(target);
+            free(target);
+        }
+
+        status = 0;
+    }
+
+    memset(&reply, 0, sizeof(reply));
+    reply.type = TAPDISK_MESSAGE_DETACH_RSP;
+    reply.cookie = request->cookie;
+    reply.u.response.error = -status;
+
+    tapdisk_control_write_message(connection->socket, &reply, 2);
+    tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Handle an OPEN request: open the image stack named in the request on the
+ * VBD identified by the request cookie and ask the kernel to create the
+ * block device for it.
+ *
+ * vbd->name doubles as the "an image is open" marker, so it is released on
+ * every failure after it has been set; otherwise a failed open would make
+ * all later attempts fail with EALREADY.
+ *
+ * Replies with OPEN_RSP carrying the image geometry on success, or with
+ * TAPDISK_MESSAGE_ERROR carrying a positive errno value on failure, then
+ * closes the connection.
+ */
+static void
+tapdisk_control_open_image(struct tapdisk_control_connection *connection,
+                           tapdisk_message_t *request)
+{
+    int err;
+    image_t image;
+    td_vbd_t *vbd;
+    td_flag_t flags;
+    tapdisk_message_t response;
+    struct blktap2_params params;
+
+    vbd = tapdisk_server_get_vbd(request->cookie);
+    if (!vbd) {
+        err = -EINVAL;
+        goto out;
+    }
+
+    if (vbd->minor == -1) {
+        err = -EINVAL;
+        goto out;
+    }
+
+    if (vbd->name) {
+        err = -EALREADY;
+        goto out;
+    }
+
+    /* translate wire flags into tapdisk open flags */
+    flags = 0;
+    if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_RDONLY)
+        flags |= TD_OPEN_RDONLY;
+    if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_SHARED)
+        flags |= TD_OPEN_SHAREABLE;
+    if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_ADD_CACHE)
+        flags |= TD_OPEN_ADD_CACHE;
+    if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_VHD_INDEX)
+        flags |= TD_OPEN_VHD_INDEX;
+    if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_LOG_DIRTY)
+        flags |= TD_OPEN_LOG_DIRTY;
+
+    vbd->name = strndup(request->u.params.path,
+                        sizeof(request->u.params.path));
+    if (!vbd->name) {
+        err = -ENOMEM;
+        goto out;
+    }
+
+    err = tapdisk_vbd_parse_stack(vbd, request->u.params.path);
+    if (err)
+        goto fail_name;
+
+    err = tapdisk_vbd_open_stack(vbd, request->u.params.storage, flags);
+    if (err)
+        goto fail_name;
+
+    err = tapdisk_vbd_get_image_info(vbd, &image);
+    if (err)
+        goto fail_close;
+
+    memset(&params, 0, sizeof(params));
+    params.capacity = image.size;
+    params.sector_size = image.secsize;
+    /* strncpy does not terminate on truncation; leave room and terminate */
+    strncpy(params.name, vbd->name, BLKTAP2_MAX_MESSAGE_LEN - 1);
+    params.name[BLKTAP2_MAX_MESSAGE_LEN - 1] = '\0';
+
+    /* an already existing device (EEXIST) is not treated as a failure */
+    err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_CREATE_DEVICE, &params);
+    if (err && errno != EEXIST) {
+        err = -errno;
+        EPRINTF("create device failed: %d\n", err);
+        goto fail_close;
+    }
+
+    err = 0;
+    goto out;
+
+fail_close:
+    tapdisk_vbd_close_vdi(vbd);
+fail_name:
+    free(vbd->name);
+    vbd->name = NULL;
+
+out:
+    memset(&response, 0, sizeof(response));
+    response.cookie = request->cookie;
+
+    if (err) {
+        response.type = TAPDISK_MESSAGE_ERROR;
+        response.u.response.error = -err;
+    } else {
+        response.u.image.sectors = image.size;
+        response.u.image.sector_size = image.secsize;
+        response.u.image.info = image.info;
+        response.type = TAPDISK_MESSAGE_OPEN_RSP;
+    }
+
+    tapdisk_control_write_message(connection->socket, &response, 2);
+    tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Handle a CLOSE request for the VBD named by the request cookie.
+ * Fails with EINVAL when the VBD is unknown and EAGAIN while it still has
+ * pending requests. Otherwise closes the VDI, drops the VBD name and, when
+ * the VBD has no minor (-1), removes and frees it.
+ * Replies with CLOSE_RSP and closes the connection.
+ */
+static void
+tapdisk_control_close_image(struct tapdisk_control_connection *connection,
+                            tapdisk_message_t *request)
+{
+    tapdisk_message_t reply;
+    td_vbd_t *target;
+    int status;
+
+    target = tapdisk_server_get_vbd(request->cookie);
+    if (!target) {
+        status = -EINVAL;
+    } else if (!list_empty(&target->pending_requests)) {
+        status = -EAGAIN;
+    } else {
+        tapdisk_vbd_close_vdi(target);
+
+        /* NB. vbd->name free should probably belong into close_vdi,
+           but the current blktap1 reopen-stuff likely depends on a
+           lifetime extended until shutdown. */
+        free(target->name);
+        target->name = NULL;
+
+        if (target->minor == -1) {
+            tapdisk_server_remove_vbd(target);
+            tapdisk_vbd_free(target);
+        }
+
+        status = 0;
+    }
+
+    memset(&reply, 0, sizeof(reply));
+    reply.type = TAPDISK_MESSAGE_CLOSE_RSP;
+    reply.cookie = request->cookie;
+    reply.u.response.error = -status;
+
+    tapdisk_control_write_message(connection->socket, &reply, 2);
+    tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Handle a PAUSE request: keep trying to pause the VBD named by the request
+ * cookie, running one server iteration after every -EAGAIN, until the pause
+ * succeeds or fails with a different error.
+ * Replies with PAUSE_RSP (EINVAL when the VBD is unknown) and closes.
+ */
+static void
+tapdisk_control_pause_vbd(struct tapdisk_control_connection *connection,
+                          tapdisk_message_t *request)
+{
+    int status;
+    td_vbd_t *target;
+    tapdisk_message_t reply;
+
+    memset(&reply, 0, sizeof(reply));
+    reply.type = TAPDISK_MESSAGE_PAUSE_RSP;
+
+    target = tapdisk_server_get_vbd(request->cookie);
+    if (!target) {
+        status = -EINVAL;
+    } else {
+        for (;;) {
+            status = tapdisk_vbd_pause(target);
+            if (status != -EAGAIN)
+                break;
+
+            /* let outstanding work make progress before retrying */
+            tapdisk_server_iterate();
+        }
+    }
+
+    reply.cookie = request->cookie;
+    reply.u.response.error = -status;
+    tapdisk_control_write_message(connection->socket, &reply, 2);
+    tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Handle a RESUME request for the VBD named by the request cookie.
+ * The VBD must exist and be paused. A non-empty path in the request replaces
+ * the VBD's name; with an empty path the existing name is reused and must be
+ * set. The stack is then re-parsed from the name and the VBD resumed.
+ * Replies with RESUME_RSP carrying a positive errno (0 on success), then
+ * closes the connection.
+ */
+static void
+tapdisk_control_resume_vbd(struct tapdisk_control_connection *connection,
+                           tapdisk_message_t *request)
+{
+    int status;
+    td_vbd_t *target;
+    tapdisk_message_t reply;
+    const char *new_path = request->u.params.path;
+
+    memset(&reply, 0, sizeof(reply));
+    reply.type = TAPDISK_MESSAGE_RESUME_RSP;
+
+    target = tapdisk_server_get_vbd(request->cookie);
+    if (!target || !td_flag_test(target->state, TD_VBD_PAUSED)) {
+        status = -EINVAL;
+        goto respond;
+    }
+
+    if (new_path[0] != '\0') {
+        free(target->name);
+        target->name = strndup(new_path,
+                               sizeof(request->u.params.path));
+        if (!target->name) {
+            status = -ENOMEM;
+            goto respond;
+        }
+    } else if (!target->name) {
+        status = -EINVAL;
+        goto respond;
+    }
+
+    status = tapdisk_vbd_parse_stack(target, target->name);
+    if (!status)
+        status = tapdisk_vbd_resume(target, NULL, -1);
+
+respond:
+    reply.cookie = request->cookie;
+    reply.u.response.error = -status;
+    tapdisk_control_write_message(connection->socket, &reply, 2);
+    tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Event callback for a client connection (private is the connection record).
+ *
+ * Reads one message, validates it and dispatches on its type. Every handler
+ * sends its own reply and closes the connection. Invalid or unsupported
+ * messages are answered here with TAPDISK_MESSAGE_ERROR carrying a positive
+ * errno value, after which the connection is closed.
+ */
+static void
+tapdisk_control_handle_request(event_id_t id, char mode, void *private)
+{
+    int err;
+    tapdisk_message_t message, response;
+    struct tapdisk_control_connection *connection = private;
+
+    if (tapdisk_control_read_message(connection->socket, &message, 2)) {
+        EPRINTF("failed to read message from %d\n", connection->socket);
+        tapdisk_control_close_connection(connection);
+        return;
+    }
+
+    err = tapdisk_control_validate_request(&message);
+    if (err) {
+        EPRINTF("received invalid '%s' message\n",
+                tapdisk_message_name(message.type));
+        goto fail;
+    }
+
+    switch (message.type) {
+    case TAPDISK_MESSAGE_PID:
+        tapdisk_control_get_pid(connection, &message);
+        return;
+    case TAPDISK_MESSAGE_LIST_MINORS:
+        tapdisk_control_list_minors(connection, &message);
+        return;
+    case TAPDISK_MESSAGE_LIST:
+        tapdisk_control_list(connection, &message);
+        return;
+    case TAPDISK_MESSAGE_ATTACH:
+        tapdisk_control_attach_vbd(connection, &message);
+        return;
+    case TAPDISK_MESSAGE_DETACH:
+        tapdisk_control_detach_vbd(connection, &message);
+        return;
+    case TAPDISK_MESSAGE_OPEN:
+        tapdisk_control_open_image(connection, &message);
+        return;
+    case TAPDISK_MESSAGE_PAUSE:
+        tapdisk_control_pause_vbd(connection, &message);
+        return;
+    case TAPDISK_MESSAGE_RESUME:
+        tapdisk_control_resume_vbd(connection, &message);
+        return;
+    case TAPDISK_MESSAGE_CLOSE:
+        tapdisk_control_close_image(connection, &message);
+        return;
+    default:
+        EPRINTF("received unsupported message '%s'\n",
+                tapdisk_message_name(message.type));
+        err = -EINVAL;
+        break;
+    }
+
+fail:
+    memset(&response, 0, sizeof(response));
+
+    response.type = TAPDISK_MESSAGE_ERROR;
+    /* err may be positive or negative depending on where it came from;
+     * the wire format used by the other handlers is a positive errno */
+    response.u.response.error = (err < 0) ? -err : err;
+    tapdisk_control_write_message(connection->socket, &response, 2);
+
+    tapdisk_control_close_connection(connection);
+}
+
+/*
+ * Event callback for the listening control socket: accept one client,
+ * allocate its connection record and register a read event that will run
+ * tapdisk_control_handle_request() for it.
+ *
+ * On any failure the new descriptor is closed, the record (if any) is freed
+ * and the function returns without touching them again.
+ */
+static void
+tapdisk_control_accept(event_id_t id, char mode, void *private)
+{
+    int err, fd;
+    struct tapdisk_control_connection *connection;
+
+    fd = accept(td_control.socket, NULL, NULL);
+    if (fd == -1) {
+        EPRINTF("failed to accept new control connection: %d\n", errno);
+        return;
+    }
+
+    connection = tapdisk_control_allocate_connection(fd);
+    if (!connection) {
+        close(fd);
+        EPRINTF("failed to allocate new control connection\n");
+        return; /* nothing to register; connection is NULL */
+    }
+
+    err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                        connection->socket, 0,
+                                        tapdisk_control_handle_request,
+                                        connection);
+    /* any negative value is a failure, as in the listening-socket setup */
+    if (err < 0) {
+        close(fd);
+        free(connection);
+        EPRINTF("failed to register new control event: %d\n", err);
+        return; /* connection has been freed */
+    }
+
+    connection->event_id = err;
+}
+
+/*
+ * Make sure directory dir exists, creating missing components one at a time
+ * with mode 0755 (like "mkdir -p").
+ *
+ * Returns 0 if dir is already readable and writable or every component
+ * exists or was created; a component that already exists (including one
+ * created concurrently by another process) counts as success.
+ * Returns -EINVAL for an empty path, -ENOMEM if the path cannot be copied,
+ * or -errno from the failing mkdir().
+ */
+static int
+tapdisk_control_mkdir(const char *dir)
+{
+    int err;
+    char *ptr, *name, *start;
+
+    if (!dir[0])
+        return -EINVAL;
+
+    err = access(dir, W_OK | R_OK);
+    if (!err)
+        return 0;
+
+    name = strdup(dir);
+    if (!name)
+        return -ENOMEM;
+
+    err = 0;
+    start = name;
+
+    for (;;) {
+        /* skip the separator(s) in front of the next component */
+        while (*start == '/')
+            start++;
+        if (!*start)
+            break; /* trailing slash: nothing left to create */
+
+        /* temporarily cut the path after this component */
+        ptr = strchr(start, '/');
+        if (ptr)
+            *ptr = '\0';
+
+        if (mkdir(name, 0755) && errno != EEXIST) {
+            err = -errno;
+            EPRINTF("failed to create directory %s: %d\n",
+                    name, err);
+            break;
+        }
+
+        if (!ptr)
+            break;
+
+        *ptr = '/';
+        start = ptr + 1;
+    }
+
+    free(name);
+    return err;
+}
+
+/*
+ * Create the listening control socket for this process.
+ *
+ * The socket lives at BLKTAP2_CONTROL_DIR/BLKTAP2_CONTROL_SOCKET<pid>. The
+ * directory is created if needed, a stale socket file is removed, and the
+ * socket is bound, put into listening state and registered with the server
+ * so tapdisk_control_accept() runs for incoming connections.
+ *
+ * On success returns 0 and stores the socket path in *socket_path; the
+ * string stays owned by td_control and is freed by tapdisk_control_close().
+ * On failure everything is torn down and a non-zero value is returned.
+ * NOTE(review): the sign of that value is not uniform: directory creation
+ * and event registration yield negative codes, the other steps positive
+ * errno values. Callers should only test for non-zero.
+ */
+static int
+tapdisk_control_create_socket(char **socket_path)
+{
+    int err;
+    struct sockaddr_un saddr;
+
+    err = tapdisk_control_mkdir(BLKTAP2_CONTROL_DIR);
+    if (err) {
+        EPRINTF("failed to create directory %s: %d\n",
+                BLKTAP2_CONTROL_DIR, err);
+        return err;
+    }
+
+    err = asprintf(&td_control.path, "%s/%s%d",
+                   BLKTAP2_CONTROL_DIR, BLKTAP2_CONTROL_SOCKET, getpid());
+    if (err == -1) {
+        td_control.path = NULL;
+        err = (errno ? errno : ENOMEM);
+        goto fail;
+    }
+
+    /* a path that does not fit would be silently truncated by the copy
+     * below and we would bind to the wrong name */
+    if (strlen(td_control.path) >= sizeof(saddr.sun_path)) {
+        err = ENAMETOOLONG;
+        EPRINTF("control socket path %s is too long\n", td_control.path);
+        goto fail;
+    }
+
+    if (unlink(td_control.path) && errno != ENOENT) {
+        err = errno;
+        EPRINTF("failed to unlink %s: %d\n", td_control.path, errno);
+        goto fail;
+    }
+
+    td_control.socket = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (td_control.socket == -1) {
+        err = errno;
+        EPRINTF("failed to create control socket: %d\n", err);
+        goto fail;
+    }
+
+    memset(&saddr, 0, sizeof(saddr));
+    saddr.sun_family = AF_UNIX;
+    /* length checked above; the memset keeps the terminating NUL */
+    strncpy(saddr.sun_path, td_control.path, sizeof(saddr.sun_path) - 1);
+
+    err = bind(td_control.socket,
+               (const struct sockaddr *)&saddr, sizeof(saddr));
+    if (err == -1) {
+        err = errno;
+        EPRINTF("failed to bind to %s: %d\n", saddr.sun_path, err);
+        goto fail;
+    }
+
+    err = listen(td_control.socket, 10);
+    if (err == -1) {
+        err = errno;
+        EPRINTF("failed to listen: %d\n", err);
+        goto fail;
+    }
+
+    err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                        td_control.socket, 0,
+                                        tapdisk_control_accept, NULL);
+    if (err < 0) {
+        EPRINTF("failed to add watch: %d\n", err);
+        goto fail;
+    }
+
+    td_control.event_id = err;
+    *socket_path = td_control.path;
+
+    return 0;
+
+fail:
+    tapdisk_control_close();
+    return err;
+}
+
+/*
+ * Public entry point: reset the control state and create the listening
+ * control socket. On success returns 0 and stores the socket path in *path;
+ * otherwise returns the non-zero result of the socket setup.
+ */
+int
+tapdisk_control_open(char **path)
+{
+    tapdisk_control_initialize();
+
+    return tapdisk_control_create_socket(path);
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Public interface of the tapdisk control socket. */
+#ifndef __TAPDISK_CONTROL_H__
+#define __TAPDISK_CONTROL_H__
+
+/*
+ * Create and start listening on this process's control socket.
+ * Returns 0 on success and stores the socket's filesystem path in *path
+ * (the string remains owned by the control module); non-zero on failure.
+ */
+int tapdisk_control_open(char **path);
+
+/* Remove the socket file, free its path and close the listening socket. */
+void tapdisk_control_close(void);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <libgen.h> /* for basename(3) */
+#include <unistd.h>
+
+#include "list.h"
+#include "scheduler.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-server.h"
+#include "tapdisk-disktype.h"
+#include "tapdisk-utils.h"
+#include "libvhd.h"
+
+/* Indices into tapdisk_stream_poll.pipe[]: read end and write end. */
+#define POLL_READ 0
+#define POLL_WRITE 1
+
+/* Shift between VHD blocks and sectors — presumably log2(sectors per block); TODO confirm. */
+#define SPB_SHIFT (VHD_BLOCK_SHIFT - SECTOR_SHIFT)
+
+/* Self-pipe used to signal a stream; "set" records that a token is pending. */
+struct tapdisk_stream_poll {
+ int pipe[2];	/* indexed by POLL_READ / POLL_WRITE; -1 when closed */
+ int set;	/* 1 after a token was written, 0 after it was read back */
+};
+
+/* One request slot of a stream; lives in tapdisk_stream.requests[]. */
+struct tapdisk_stream_request {
+ uint64_t sec;
+ uint32_t secs;	/* length in sectors (compared as secs << SECTOR_SHIFT bytes) */
+ uint64_t seqno;	/* ordering key: completed requests are kept sorted by it */
+ blkif_request_t blkif_req;
+ struct list_head next;	/* links the slot into the free/pending/completed list */
+};
+
+/* State of one image stream being read for comparison. */
+struct tapdisk_stream {
+ td_vbd_t *vbd;
+
+ unsigned int id;
+
+ int err;	/* non-zero once the stream has failed */
+
+ uint64_t cur;	/* current position; the stream is done when cur == end */
+ uint64_t start;
+ uint64_t end;
+
+ uint64_t started;
+ uint64_t completed;	/* incremented for each request compared */
+
+ struct tapdisk_stream_poll poll;
+ event_id_t enqueue_event_id;
+
+ struct list_head free_list;	/* unused request slots */
+ struct list_head pending_list;
+ struct list_head completed_list;	/* finished requests, ordered by seqno */
+
+ struct tapdisk_stream_request requests[MAX_REQUESTS];
+};
+
+static unsigned int tapdisk_stream_count;
+
+static void tapdisk_stream_close_image(struct tapdisk_stream *);
+
+/* Program name shown in the usage message. */
+static char *program;
+/* The two streams whose contents are compared against each other. */
+static struct tapdisk_stream stream1, stream2;
+static vhd_context_t vhd1;
+
+/* Print the command-line synopsis to the given stream. */
+static void
+usage(FILE *stream)
+{
+    /* honour the caller's stream instead of always writing to stdout */
+    fprintf(stream,
+            "usage: %s <-n type:/path/to/image> <-m type:/path/to/image>\n",
+            program);
+}
+
+/*
+ * Open the VHD at path read-only and load its block allocation table.
+ * Returns 0 on success; otherwise prints a message and returns the error
+ * from the failing libvhd call (the VHD is closed again if the BAT read
+ * fails).
+ */
+static int
+open_vhd(const char *path, vhd_context_t *vhd)
+{
+    int rc;
+
+    rc = vhd_open(vhd, path, VHD_OPEN_RDONLY);
+    if (rc != 0) {
+        printf("error opening %s: %d\n", path, rc);
+        return rc;
+    }
+
+    rc = vhd_get_bat(vhd);
+    if (rc == 0)
+        return 0;
+
+    printf("error reading BAT for %s: %d\n", path, rc);
+    vhd_close(vhd);
+    return rc;
+}
+
+/* Mark both pipe ends as closed (-1) and clear the pending flag. */
+static inline void
+tapdisk_stream_poll_initialize(struct tapdisk_stream_poll *p)
+{
+    p->pipe[POLL_WRITE] = -1;
+    p->pipe[POLL_READ] = -1;
+    p->set = 0;
+}
+
+/*
+ * Create the signalling pipe for p and switch both ends to non-blocking.
+ * Returns 0 on success or -errno of the failing call; on failure both ends
+ * are closed and p is reset to its initial state.
+ */
+static int
+tapdisk_stream_poll_open(struct tapdisk_stream_poll *p)
+{
+    int err;
+
+    tapdisk_stream_poll_initialize(p);
+
+    if (pipe(p->pipe))
+        return -errno;
+
+    if (fcntl(p->pipe[POLL_READ], F_SETFL, O_NONBLOCK) ||
+        fcntl(p->pipe[POLL_WRITE], F_SETFL, O_NONBLOCK)) {
+        /* capture errno now: close() below may overwrite it */
+        err = -errno;
+        close(p->pipe[POLL_READ]);
+        close(p->pipe[POLL_WRITE]);
+        tapdisk_stream_poll_initialize(p);
+        return err;
+    }
+
+    return 0;
+}
+
+/* Close whichever pipe ends are open and reset p to its initial state. */
+static void
+tapdisk_stream_poll_close(struct tapdisk_stream_poll *p)
+{
+    int rfd = p->pipe[POLL_READ];
+    int wfd = p->pipe[POLL_WRITE];
+
+    if (rfd != -1)
+        close(rfd);
+    if (wfd != -1)
+        close(wfd);
+
+    tapdisk_stream_poll_initialize(p);
+}
+
+/* Read the token back from the pipe and clear the pending flag. */
+static inline void
+tapdisk_stream_poll_clear(struct tapdisk_stream_poll *p)
+{
+    int token;
+
+    /* the value is irrelevant and the result of the read is not checked */
+    read_exact(p->pipe[POLL_READ], &token, sizeof(token));
+    p->set = 0;
+}
+
+/* Write one token into the pipe unless one is already pending. */
+static inline void
+tapdisk_stream_poll_set(struct tapdisk_stream_poll *p)
+{
+    int token = 0;
+
+    if (p->set)
+        return;
+
+    write_exact(p->pipe[POLL_WRITE], &token, sizeof(token));
+    p->set = 1;
+}
+
+/*
+ * Report whether the stream is finished: it has reached its end or failed,
+ * and it has neither pending nor completed requests left.
+ * Returns 1 when finished, 0 otherwise.
+ */
+static inline int
+tapdisk_stream_stop(struct tapdisk_stream *s)
+{
+    if (s->cur != s->end && !s->err)
+        return 0;
+
+    if (!list_empty(&s->pending_list))
+        return 0;
+
+    return list_empty(&s->completed_list) != 0;
+}
+
+/* Zero a request slot and make its list node an empty, self-linked list. */
+static inline void
+tapdisk_stream_initialize_request(struct tapdisk_stream_request *req)
+{
+    memset(req, 0, sizeof(struct tapdisk_stream_request));
+    INIT_LIST_HEAD(&req->next);
+}
+
+/* Return the index of req within s->requests[]. */
+static inline int
+tapdisk_stream_request_idx(struct tapdisk_stream *s,
+                           struct tapdisk_stream_request *req)
+{
+    return (req - &s->requests[0]);
+}
+
+/*
+ * Take the first slot off the stream's free list and return it freshly
+ * initialised, or NULL when no slot is free.
+ */
+static inline struct tapdisk_stream_request *
+tapdisk_stream_get_request(struct tapdisk_stream *s)
+{
+    struct tapdisk_stream_request *slot = NULL;
+
+    if (!list_empty(&s->free_list)) {
+        slot = list_entry(s->free_list.next,
+                          struct tapdisk_stream_request, next);
+
+        list_del_init(&slot->next);
+        tapdisk_stream_initialize_request(slot);
+    }
+
+    return slot;
+}
+
+/*
+ * Insert sreq into the stream's completed list, keeping the list ordered by
+ * ascending seqno: it goes in front of the first entry with a larger seqno,
+ * or at the tail when there is none.
+ */
+static inline void
+tapdisk_stream_queue_completed(struct tapdisk_stream *s,
+                               struct tapdisk_stream_request *sreq)
+{
+    struct tapdisk_stream_request *cur;
+    /* list_add_tail() inserts before this node; default is the list head */
+    struct list_head *before = &s->completed_list;
+
+    list_for_each_entry(cur, &s->completed_list, next) {
+        if (sreq->seqno < cur->seqno) {
+            before = &cur->next;
+            break;
+        }
+    }
+
+    list_add_tail(&sreq->next, before);
+}
+
+/*
+ * Compare the data buffers of two completed requests, one from stream1 and
+ * one from stream2. Both must carry the same seqno and sector count
+ * (asserted). Returns the memcmp() result: 0 when the data is identical.
+ */
+static int
+tapdisk_result_compare(struct tapdisk_stream_request *sreq1,
+                       struct tapdisk_stream_request *sreq2)
+{
+    unsigned long slot1, slot2;
+    char *data1, *data2;
+
+    assert(sreq1->seqno == sreq2->seqno);
+    assert(sreq1->secs == sreq2->secs);
+
+    /* a request's data lives at the ring address selected by its slot index */
+    slot1 = (unsigned long)tapdisk_stream_request_idx(&stream1,
+                                                      sreq1);
+    slot2 = (unsigned long)tapdisk_stream_request_idx(&stream2,
+                                                      sreq2);
+    data1 = (char *)MMAP_VADDR(stream1.vbd->ring.vstart, slot1, 0);
+    data2 = (char *)MMAP_VADDR(stream2.vbd->ring.vstart, slot2, 0);
+
+    return memcmp(data1, data2, sreq1->secs << SECTOR_SHIFT);
+}
+
+/*
+ * Walk the completed lists of stream1 and stream2 side by side and compare
+ * every pair of requests that carry the same seqno.
+ *
+ * - seqno on stream1 lower than on stream2: step stream1 only.
+ * - seqno on stream1 higher than on stream2: step stream2 only.
+ * - equal: compare the buffers, bump both 'completed' counters, move both
+ *   requests to their free lists and step both streams.
+ *
+ * The walk ends when either list is exhausted or a comparison returns
+ * non-zero.  Returns the result of the last comparison (0 if none
+ * differed or none was made).
+ */
+static int
+tapdisk_stream_process_data(void)
+{
+ struct tapdisk_stream_request *sreq1, *sreq2, *tmp1, *tmp2;
+ int advance_both;
+ int result = 0;
+
+ /* sreqN is the current entry, tmpN its successor; the successor is
+ * fetched up front because the current entry may be unlinked below */
+ sreq1 = list_entry(stream1.completed_list.next,
+ struct tapdisk_stream_request, next);
+ sreq2 = list_entry(stream2.completed_list.next,
+ struct tapdisk_stream_request, next);
+ tmp1 = list_entry(sreq1->next.next,
+ struct tapdisk_stream_request, next);
+ tmp2 = list_entry(sreq2->next.next,
+ struct tapdisk_stream_request, next);
+ while (result == 0 &&
+ &sreq1->next != &stream1.completed_list &&
+ &sreq2->next != &stream2.completed_list) {
+ //printf("checking: %llu|%llu\n", sreq1->seqno, sreq2->seqno);
+ advance_both = 1;
+ if (sreq1->seqno < sreq2->seqno) {
+ /* stream1 is behind: skip ahead on stream1 only */
+ advance_both = 0;
+ goto advance1;
+ }
+ if (sreq1->seqno > sreq2->seqno)
+ /* stream2 is behind: skip ahead on stream2 only */
+ goto advance2;
+
+ result = tapdisk_result_compare(sreq1, sreq2);
+
+ stream1.completed++;
+ stream2.completed++;
+
+ /* both requests are done with: recycle their slots */
+ list_del_init(&sreq1->next);
+ list_add_tail(&sreq1->next, &stream1.free_list);
+ list_del_init(&sreq2->next);
+ list_add_tail(&sreq2->next, &stream2.free_list);
+
+advance1:
+ sreq1 = tmp1;
+ tmp1 = list_entry(tmp1->next.next,
+ struct tapdisk_stream_request, next);
+ if (!advance_both)
+ continue;
+advance2:
+ sreq2 = tmp2;
+ tmp2 = list_entry(tmp2->next.next,
+ struct tapdisk_stream_request, next);
+ }
+
+ return result;
+}
+
+/*
+ * VBD completion callback (installed through tapdisk_vbd_set_callback()).
+ *
+ * 'arg' is the owning stream and rsp->id is the index of the finished
+ * request in s->requests.  A successful request is queued on the completed
+ * list; a failed one sets s->err to EIO and goes back to the free list.
+ * Afterwards the completed data of both streams is compared, and both
+ * streams' polls are set.
+ */
+static void
+tapdisk_stream_dequeue(void *arg, blkif_response_t *rsp)
+{
+	struct tapdisk_stream *s = arg;
+	struct tapdisk_stream_request *sreq = &s->requests[rsp->id];
+
+	/* detach from whichever list currently holds the request */
+	list_del_init(&sreq->next);
+
+	if (rsp->status != BLKIF_RSP_OKAY) {
+		s->err = EIO;
+		list_add_tail(&sreq->next, &s->free_list);
+		fprintf(stderr, "error reading sector 0x%"PRIx64"\n", sreq->sec);
+	} else {
+		tapdisk_stream_queue_completed(s, sreq);
+	}
+
+	if (tapdisk_stream_process_data()) {
+		/* the sector printed is that of the request that just completed */
+		fprintf(stderr, "mismatch at sector 0x%"PRIx64"\n",
+			sreq->sec);
+		stream1.err = EINVAL;
+		stream2.err = EINVAL;
+	}
+
+	tapdisk_stream_poll_set(&stream1.poll);
+	tapdisk_stream_poll_set(&stream2.poll);
+}
+
+/*
+ * Queue on stream 's' a read request that mirrors request 'r': same
+ * sector, sector count, seqno and segment layout.  The new request is
+ * handed to stream2's VBD and put on s->pending_list; s->cur advances by
+ * the request's sector count.
+ *
+ * Returns 0 on success, 1 when 's' has no free request slot.
+ */
+static inline int
+tapdisk_stream_enqueue_copy(struct tapdisk_stream *s,
+			    struct tapdisk_stream_request *r)
+{
+	td_vbd_t *vbd = stream2.vbd;
+	struct tapdisk_stream_request *sreq;
+	td_vbd_request_t *vreq;
+	blkif_request_t *breq;
+	int idx;
+
+	sreq = tapdisk_stream_get_request(s);
+	if (sreq == NULL)
+		return 1;
+
+	idx = tapdisk_stream_request_idx(s, sreq);
+
+	sreq->sec   = r->sec;
+	sreq->secs  = r->secs;
+	sreq->seqno = r->seqno;
+
+	breq = &sreq->blkif_req;
+	breq->id            = idx;
+	breq->nr_segments   = r->blkif_req.nr_segments;
+	breq->sector_number = r->blkif_req.sector_number;
+	breq->operation     = BLKIF_OP_READ;
+
+	/* replicate the sector range of every segment */
+	for (int i = 0; i < r->blkif_req.nr_segments; i++) {
+		breq->seg[i].first_sect = r->blkif_req.seg[i].first_sect;
+		breq->seg[i].last_sect  = r->blkif_req.seg[i].last_sect;
+	}
+	s->cur += sreq->secs;
+
+	/* the VBD request slot shares its index with the stream slot */
+	vreq = &vbd->request_list[idx];
+	assert(list_empty(&vreq->next));
+	assert(vreq->secs_pending == 0);
+
+	memcpy(&vreq->req, breq, sizeof(*breq));
+	vbd->received++;
+	vreq->vbd = vbd;
+
+	tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+	list_add_tail(&sreq->next, &s->pending_list);
+
+	return 0;
+}
+
+/*
+ * Build and submit read requests for stream1 until the stream reaches its
+ * end, an error is recorded, or no free request slot is left.
+ *
+ * Blocks that vhd1's BAT marks as DD_BLK_UNUSED are skipped.  Each request
+ * holds up to BLKIF_MAX_SEGMENTS_PER_REQUEST segments; a segment covers at
+ * most one page worth of sectors and never extends past the end of the
+ * stream or past the end of the block in 'blk'.  Every request is given
+ * the next sequence number (s->started), copied into the VBD request slot
+ * with the same index, and put on s->pending_list.
+ */
+static void
+tapdisk_stream_enqueue1(void)
+{
+ td_vbd_t *vbd;
+ int i, idx, psize, blk;
+ struct tapdisk_stream *s = &stream1;
+
+ vbd = s->vbd;
+ psize = getpagesize();
+
+ while (s->cur < s->end && !s->err) {
+ blkif_request_t *breq;
+ td_vbd_request_t *vreq;
+ struct tapdisk_stream_request *sreq;
+
+ /* skip any blocks that are not present in this image */
+ /* NOTE(review): blk is an int, so blk << SPB_SHIFT is presumably
+ * evaluated in int -- confirm that image sizes keep it in range */
+ blk = s->cur >> SPB_SHIFT;
+ while (s->cur < s->end && vhd1.bat.bat[blk] == DD_BLK_UNUSED) {
+ //printf("skipping block %d\n", blk);
+ blk++;
+ s->cur = blk << SPB_SHIFT;
+ }
+
+ if (s->cur >= s->end)
+ break;
+
+ /* out of request slots: stop until completions free some */
+ sreq = tapdisk_stream_get_request(s);
+ if (!sreq)
+ break;
+
+ idx = tapdisk_stream_request_idx(s, sreq);
+
+ sreq->sec = s->cur;
+ sreq->secs = 0;
+ sreq->seqno = s->started++;
+
+ breq = &sreq->blkif_req;
+ breq->id = idx;
+ breq->nr_segments = 0;
+ breq->sector_number = sreq->sec;
+ breq->operation = BLKIF_OP_READ;
+
+ for (i = 0; i < BLKIF_MAX_SEGMENTS_PER_REQUEST; i++) {
+ uint32_t secs;
+ struct blkif_request_segment *seg = breq->seg + i;
+
+ /* cap at one page, at the stream end, and at the end of block blk */
+ secs = MIN(s->end - s->cur, psize >> SECTOR_SHIFT);
+ secs = MIN(((blk + 1) << SPB_SHIFT) - s->cur, secs);
+ if (!secs)
+ break;
+
+ sreq->secs += secs;
+ s->cur += secs;
+
+ seg->first_sect = 0;
+ seg->last_sect = secs - 1;
+ breq->nr_segments++;
+ }
+
+ /* the VBD request slot shares its index with the stream slot */
+ vreq = vbd->request_list + idx;
+
+ assert(list_empty(&vreq->next));
+ assert(vreq->secs_pending == 0);
+
+ memcpy(&vreq->req, breq, sizeof(*breq));
+ vbd->received++;
+ vreq->vbd = vbd;
+
+ tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+ list_add_tail(&sreq->next, &s->pending_list);
+ }
+
+ tapdisk_vbd_issue_requests(vbd);
+}
+
+/*
+ * Issue on stream2 the same reads that were issued on stream1.
+ *
+ * stream1's completed list and then its pending list are scanned; every
+ * request whose start sector is not below stream2's current position is
+ * mirrored with tapdisk_stream_enqueue_copy().  If stream2 runs out of
+ * free request slots the scan stops early.  If everything was mirrored,
+ * stream2's position is moved up to stream1's.  Whatever was queued is
+ * then issued on stream2's VBD.
+ */
+static void
+tapdisk_stream_enqueue2(void)
+{
+	td_vbd_t *vbd;
+	struct tapdisk_stream_request *itr;
+	struct tapdisk_stream *s = &stream2;
+
+	vbd = s->vbd;
+
+	/* issue the same requests that we issued on stream1 */
+	list_for_each_entry(itr, &stream1.completed_list, next) {
+		if (itr->sec < s->cur)
+			continue;
+		if (tapdisk_stream_enqueue_copy(s, itr))
+			goto done;
+	}
+
+	list_for_each_entry(itr, &stream1.pending_list, next) {
+		if (itr->sec < s->cur)
+			continue;
+		if (tapdisk_stream_enqueue_copy(s, itr))
+			goto done;
+	}
+
+	/* nothing left to mirror: catch up with stream1, skipped blocks included */
+	stream2.cur = stream1.cur;
+
+done:
+	tapdisk_vbd_issue_requests(vbd);
+}
+
+/*
+ * Return 1 once both streams report (via tapdisk_stream_stop()) that they
+ * have nothing left to do, 0 otherwise.
+ */
+static inline int
+tapdisk_diff_done(void)
+{
+	if (!tapdisk_stream_stop(&stream1))
+		return 0;
+
+	return tapdisk_stream_stop(&stream2) ? 1 : 0;
+}
+
+/*
+ * Close the images of both streams.
+ */
+static void
+tapdisk_diff_stop(void)
+{
+ tapdisk_stream_close_image(&stream1);
+ tapdisk_stream_close_image(&stream2);
+}
+
+/*
+ * Event handler registered on a stream's poll pipe; 'arg' is the stream.
+ * 'id' and 'mode' are not used.
+ *
+ * Clears the stream's poll, then either shuts the comparison down (when
+ * both streams are finished) or queues more requests on the stream that
+ * was signalled.  Completion is checked again afterwards.
+ */
+static void
+tapdisk_stream_enqueue(event_id_t id, char mode, void *arg)
+{
+	struct tapdisk_stream *stream = arg;
+
+	tapdisk_stream_poll_clear(&stream->poll);
+
+	if (!tapdisk_diff_done()) {
+		if (stream == &stream1)
+			tapdisk_stream_enqueue1();
+		else if (stream == &stream2)
+			tapdisk_stream_enqueue2();
+		else
+			assert(0);
+
+		// we have to check again for the case when stream1 had no
+		// blocks at all
+		if (!tapdisk_diff_done())
+			return;
+	}
+
+	tapdisk_diff_stop();
+}
+
+/*
+ * Create a VBD for stream 's', open image 'path' of disk type 'type' on it
+ * read-only, and set the stream range to [0, image size).
+ *
+ * The stream takes the next id from tapdisk_stream_count, and
+ * tapdisk_stream_dequeue() is installed as the VBD completion callback.
+ *
+ * Returns 0 on success, or a non-zero error after printing a message to
+ * stderr.
+ * NOTE(review): the missing-VBD case returns a positive ENODEV; confirm
+ * whether callers expect negative codes like those of the helpers here.
+ */
+static int
+tapdisk_stream_open_image(struct tapdisk_stream *s, const char *path, int type)
+{
+	image_t image;
+	int err;
+
+	s->id = tapdisk_stream_count++;
+
+	err = tapdisk_vbd_initialize(s->id);
+	if (err)
+		goto fail;
+
+	s->vbd = tapdisk_server_get_vbd(s->id);
+	if (s->vbd == NULL) {
+		err = ENODEV;
+		goto fail;
+	}
+
+	tapdisk_vbd_set_callback(s->vbd, tapdisk_stream_dequeue, s);
+
+	err = tapdisk_vbd_open_vdi(s->vbd, path, type,
+				   TAPDISK_STORAGE_TYPE_DEFAULT,
+				   TD_OPEN_RDONLY);
+	if (err)
+		goto fail;
+
+	s->vbd->reopened = 1;
+
+	err = tapdisk_vbd_get_image_info(s->vbd, &image);
+	if (err) {
+		/* has its own message, so it bypasses the generic one below */
+		fprintf(stderr, "failed getting image size: %d\n", err);
+		return err;
+	}
+
+	s->start = 0;
+	s->cur   = s->start;
+	s->end   = image.size;
+
+	return 0;
+
+fail:
+	fprintf(stderr, "failed to open image %s: %d\n", path, err);
+	return err;
+}
+
+/*
+ * Tear down the VBD belonging to stream 's', if the server still has one
+ * registered under s->id: close its VDI, remove it from the server, and
+ * free its ring buffer, its name and the VBD itself.  s->vbd is cleared.
+ * Does nothing when no VBD is found.
+ */
+static void
+tapdisk_stream_close_image(struct tapdisk_stream *s)
+{
+	td_vbd_t *vbd = tapdisk_server_get_vbd(s->id);
+
+	if (vbd == NULL)
+		return;
+
+	tapdisk_vbd_close_vdi(vbd);
+	tapdisk_server_remove_vbd(vbd);
+
+	/* ring.vstart is the buffer tapdisk_stream_initialize_requests() set up */
+	free((void *)vbd->ring.vstart);
+	free(vbd->name);
+	free(vbd);
+
+	s->vbd = NULL;
+}
+
+/*
+ * Allocate the page-aligned data buffer for the stream's VBD ring and put
+ * all MAX_REQUESTS request slots on the stream's free list.
+ *
+ * Returns 0 on success, or the posix_memalign() error code after printing
+ * a message to stderr (ring->vstart is then 0).
+ */
+static int
+tapdisk_stream_initialize_requests(struct tapdisk_stream *s)
+{
+	td_ring_t *ring = &s->vbd->ring;
+	struct tapdisk_stream_request *req;
+	size_t size;
+	int err, psize;
+
+	psize = getpagesize();
+	size  = psize * BLKTAP_MMAP_REGION_SIZE;
+
+	/* sneaky -- set up ring->vstart so tapdisk_vbd will use our buffers */
+	err = posix_memalign((void **)&ring->vstart, psize, size);
+	if (err) {
+		fprintf(stderr, "failed to allocate buffers: %d\n", err);
+		ring->vstart = 0;
+		return err;
+	}
+
+	for (req = s->requests; req != s->requests + MAX_REQUESTS; req++) {
+		tapdisk_stream_initialize_request(req);
+		list_add_tail(&req->next, &s->free_list);
+	}
+
+	return 0;
+}
+
+/*
+ * Open the stream's poll pipe and register tapdisk_stream_enqueue() with
+ * the server as a read-fd event on the pipe's read end.  The event id
+ * returned by the server is saved in s->enqueue_event_id.
+ *
+ * Returns 0 on success; otherwise prints a message to stderr and returns
+ * the failing call's error code.
+ */
+static int
+tapdisk_stream_register_enqueue_event(struct tapdisk_stream *s)
+{
+	struct tapdisk_stream_poll *p = &s->poll;
+	int rc;
+
+	rc = tapdisk_stream_poll_open(p);
+	if (rc)
+		goto fail;
+
+	/* a negative return is an error, anything else is the event id */
+	rc = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+					   p->pipe[POLL_READ], 0,
+					   tapdisk_stream_enqueue, s);
+	if (rc < 0)
+		goto fail;
+
+	s->enqueue_event_id = rc;
+	return 0;
+
+fail:
+	fprintf(stderr, "failed to register event: %d\n", rc);
+	return rc;
+}
+
+/*
+ * Unregister the stream's enqueue event (when its id is non-zero) and
+ * close the poll pipe.
+ */
+static void
+tapdisk_stream_unregister_enqueue_event(struct tapdisk_stream *s)
+{
+	/* an id of 0 is treated as "nothing registered" */
+	if (s->enqueue_event_id != 0) {
+		tapdisk_server_unregister_event(s->enqueue_event_id);
+		s->enqueue_event_id = 0;
+	}
+
+	tapdisk_stream_poll_close(&s->poll);
+}
+
+/*
+ * Reset a stream to all-zeroes and set up its three request lists (free,
+ * pending, completed) as empty lists.
+ */
+static inline void
+tapdisk_stream_initialize(struct tapdisk_stream *s)
+{
+	memset(s, 0, sizeof *s);
+
+	INIT_LIST_HEAD(&s->free_list);
+	INIT_LIST_HEAD(&s->pending_list);
+	INIT_LIST_HEAD(&s->completed_list);
+}
+
+/*
+ * Fully set up stream 's' from a "type:path" argument: parse the disk
+ * type, reset the stream, open the image, set up the request slots,
+ * register the enqueue event and run one initial enqueue pass.
+ *
+ * Returns 0 on success, the (negative) parse result when the argument is
+ * invalid, or the error of the first setup step that fails.
+ */
+static int
+tapdisk_stream_open(struct tapdisk_stream *s, const char *arg)
+{
+	const char *path;
+	int type, err;
+
+	type = tapdisk_disktype_parse_params(arg, &path);
+	if (type < 0)
+		return type;
+
+	tapdisk_stream_initialize(s);
+
+	/* each step only runs if all earlier ones succeeded */
+	err = tapdisk_stream_open_image(s, path, type);
+	if (!err)
+		err = tapdisk_stream_initialize_requests(s);
+	if (!err)
+		err = tapdisk_stream_register_enqueue_event(s);
+	if (err)
+		return err;
+
+	tapdisk_stream_enqueue(s->enqueue_event_id,
+			       SCHEDULER_POLL_READ_FD, s);
+
+	return 0;
+}
+
+/*
+ * Release a stream: close its image, then unregister its enqueue event
+ * (which also closes the poll pipe).
+ */
+static void
+tapdisk_stream_release(struct tapdisk_stream *s)
+{
+ tapdisk_stream_close_image(s);
+ tapdisk_stream_unregister_enqueue_event(s);
+}
+
+/*
+ * Run one enqueue pass on stream 's', then run the tapdisk server loop.
+ * Returns the stream's error code once tapdisk_server_run() returns.
+ */
+static int
+tapdisk_stream_run(struct tapdisk_stream *s)
+{
+ tapdisk_stream_enqueue(s->enqueue_event_id, SCHEDULER_POLL_READ_FD, s);
+ tapdisk_server_run();
+ return s->err;
+}
+
+/*
+ * Entry point.  Options:
+ *   -n <type:path>   first image; must be of type VHD
+ *   -m <type:path>   second image
+ *   -h               print usage and exit
+ *
+ * The first image's VHD is opened into 'vhd1', both images are opened as
+ * streams, their sizes are checked for equality and the server loop is
+ * run to perform the comparison.
+ *
+ * Returns 0 on success, 1 on a usage error, otherwise the first setup
+ * error or stream1's recorded error.
+ */
+int
+main(int argc, char *argv[])
+{
+	int c, err, type1;
+	const char *arg1 = NULL, *arg2 = NULL;
+	const char *path1;
+
+	err = 0;
+
+	program = basename(argv[0]);
+
+	while ((c = getopt(argc, argv, "n:m:h")) != -1) {
+		switch (c) {
+		case 'n':
+			arg1 = optarg;
+			break;
+		case 'm':
+			arg2 = optarg;
+			break;
+		case 'h':
+			usage(stdout);
+			return 0;
+		default:
+			goto fail_usage;
+		}
+	}
+
+	/* both images are mandatory */
+	if (!arg1 || !arg2)
+		goto fail_usage;
+
+	type1 = tapdisk_disktype_parse_params(arg1, &path1);
+	if (type1 < 0)
+		return type1;
+
+	if (type1 != DISK_TYPE_VHD) {
+		/* diagnostics belong on stderr, like every other error here */
+		fprintf(stderr, "error: first VDI is not VHD\n");
+		return EINVAL;
+	}
+
+	err = open_vhd(path1, &vhd1);
+	if (err)
+		return err;
+
+	tapdisk_start_logging("tapdisk-diff");
+
+	err = tapdisk_server_initialize();
+	if (err)
+		goto out;
+
+	err = tapdisk_stream_open(&stream1, arg1);
+	if (err) {
+		fprintf(stderr, "Failed to open %s: %s\n",
+			arg1, strerror(-err));
+		goto out;
+	}
+
+	err = tapdisk_stream_open(&stream2, arg2);
+	if (err) {
+		fprintf(stderr, "Failed to open %s: %s\n",
+			arg2, strerror(-err));
+		goto out1;
+	}
+
+	if (stream1.end != stream2.end) {
+		fprintf(stderr, "Image sizes differ: %"PRIu64" != %"PRIu64"\n",
+			stream1.end, stream2.end);
+		err = EINVAL;
+		goto out2;
+	}
+
+	tapdisk_server_run();
+
+out2:
+	tapdisk_stream_release(&stream2);
+out1:
+	tapdisk_stream_release(&stream1);
+out:
+	vhd_close(&vhd1);
+	tapdisk_stop_logging();
+
+	/* a setup error wins over the error recorded by the comparison */
+	return err ? err : stream1.err;
+
+fail_usage:
+	usage(stderr);
+	return 1;
+}
--- /dev/null
+/*
+ * Copyright (c) 2007, 2010, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <errno.h>
+
+#include "tapdisk-disktype.h"
+#include "tapdisk-message.h"
+
+/* Number of elements in a true array (not valid for pointers). */
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+/*
+ * One descriptor per disk type: the name used in "type:path" parameters,
+ * a human-readable description, and the type's flags.
+ */
+static const disk_info_t aio_disk = {
+	.name  = "aio",
+	.desc  = "raw image (aio)",
+	.flags = 0,
+};
+
+static const disk_info_t sync_disk = {
+	.name  = "sync",
+	.desc  = "raw image (sync)",
+	.flags = 0,
+};
+
+static const disk_info_t vmdk_disk = {
+	.name  = "vmdk",
+	.desc  = "vmware image (vmdk)",
+	.flags = DISK_TYPE_SINGLE_CONTROLLER,
+};
+
+static const disk_info_t vhdsync_disk = {
+	.name  = "vhdsync",
+	.desc  = "virtual server image (vhd) - synchronous",
+	.flags = DISK_TYPE_SINGLE_CONTROLLER,
+};
+
+static const disk_info_t vhd_disk = {
+	.name  = "vhd",
+	.desc  = "virtual server image (vhd)",
+	.flags = 0,
+};
+
+static const disk_info_t ram_disk = {
+	.name  = "ram",
+	.desc  = "ramdisk image (ram)",
+	.flags = DISK_TYPE_SINGLE_CONTROLLER,
+};
+
+static const disk_info_t qcow_disk = {
+	.name  = "qcow",
+	.desc  = "qcow disk (qcow)",
+	.flags = 0,
+};
+
+static const disk_info_t block_cache_disk = {
+	.name  = "bc",
+	.desc  = "block cache image (bc)",
+	.flags = DISK_TYPE_SINGLE_CONTROLLER,
+};
+
+static const disk_info_t vhd_index_disk = {
+	.name  = "vhdi",
+	.desc  = "vhd index image (vhdi)",
+	.flags = DISK_TYPE_SINGLE_CONTROLLER,
+};
+
+static const disk_info_t log_disk = {
+	.name  = "log",
+	.desc  = "write logger (log)",
+	.flags = 0,
+};
+
+static const disk_info_t remus_disk = {
+	.name  = "remus",
+	.desc  = "remus disk replicator (remus)",
+	.flags = 0,
+};
+
+/*
+ * Descriptor table indexed by DISK_TYPE_* value.  The array length follows
+ * from the largest index used in the initializer; an index without an
+ * initializer would hold NULL.
+ */
+const disk_info_t *tapdisk_disk_types[] = {
+ [DISK_TYPE_AIO] = &aio_disk,
+ [DISK_TYPE_SYNC] = &sync_disk,
+ [DISK_TYPE_VMDK] = &vmdk_disk,
+ [DISK_TYPE_VHDSYNC] = &vhdsync_disk,
+ [DISK_TYPE_VHD] = &vhd_disk,
+ [DISK_TYPE_RAM] = &ram_disk,
+ [DISK_TYPE_QCOW] = &qcow_disk,
+ [DISK_TYPE_BLOCK_CACHE] = &block_cache_disk,
+ [DISK_TYPE_LOG] = &log_disk,
+ [DISK_TYPE_VINDEX] = &vhd_index_disk,
+ [DISK_TYPE_REMUS] = &remus_disk,
+};
+
+/* Driver implementations, defined in other translation units. */
+extern struct tap_disk tapdisk_aio;
+extern struct tap_disk tapdisk_vhdsync;
+extern struct tap_disk tapdisk_vhd;
+extern struct tap_disk tapdisk_ram;
+extern struct tap_disk tapdisk_qcow;
+extern struct tap_disk tapdisk_block_cache;
+extern struct tap_disk tapdisk_log;
+extern struct tap_disk tapdisk_remus;
+
+/*
+ * Driver table indexed by DISK_TYPE_* value, sized to match
+ * tapdisk_disk_types.  Types with no entry here (sync, vmdk, vhdsync and
+ * vhdi) are left NULL, which tapdisk_disktype_find() reports as -ENOSYS.
+ */
+const struct tap_disk *tapdisk_disk_drivers[ARRAY_SIZE(tapdisk_disk_types)] = {
+ [DISK_TYPE_AIO] = &tapdisk_aio,
+ [DISK_TYPE_VHD] = &tapdisk_vhd,
+ [DISK_TYPE_RAM] = &tapdisk_ram,
+ [DISK_TYPE_QCOW] = &tapdisk_qcow,
+ [DISK_TYPE_BLOCK_CACHE] = &tapdisk_block_cache,
+ [DISK_TYPE_LOG] = &tapdisk_log,
+ [DISK_TYPE_REMUS] = &tapdisk_remus,
+};
+
+/*
+ * Look up a disk type by its short name (e.g. "vhd").
+ *
+ * Returns the DISK_TYPE_* index on success, -ENOSYS when the name is known
+ * but no driver is available for it, and -ENOENT when the name is unknown.
+ */
+int
+tapdisk_disktype_find(const char *name)
+{
+	size_t idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(tapdisk_disk_types); idx++) {
+		const disk_info_t *info = tapdisk_disk_types[idx];
+
+		/* skip unpopulated slots and names that do not match */
+		if (info == NULL || strcmp(name, info->name) != 0)
+			continue;
+
+		return tapdisk_disk_drivers[idx] ? (int)idx : -ENOSYS;
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * Split a "type:path" parameter string at its first ':'.
+ *
+ * On success returns the DISK_TYPE_* index of the type and stores in
+ * *_path a pointer to the character following the ':' (inside 'params').
+ * On failure *_path is left untouched and a negative value is returned:
+ * -EINVAL when there is no ':', -ENAMETOOLONG when the type name does not
+ * fit in DISK_TYPE_NAME_MAX - 1 characters, or the error from
+ * tapdisk_disktype_find().
+ */
+int
+tapdisk_disktype_parse_params(const char *params, const char **_path)
+{
+	char name[DISK_TYPE_NAME_MAX];
+	const char *colon;
+	size_t len;
+	int type;
+
+	colon = strchr(params, ':');
+	if (colon == NULL)
+		return -EINVAL;
+
+	len = colon - params;
+	if (len >= sizeof(name))
+		return -ENAMETOOLONG;
+
+	/* copy the type name and terminate it */
+	memcpy(name, params, len);
+	name[len] = '\0';
+
+	type = tapdisk_disktype_find(name);
+	if (type < 0)
+		return type;
+
+	*_path = colon + 1;
+	return type;
+}
+
+/*
+ * Variant of tapdisk_disktype_parse_params() that reports the type through
+ * an out-parameter.  Returns 0 and fills *_type (and *_path) on success;
+ * returns the negative error and leaves *_type untouched on failure.
+ */
+int
+tapdisk_parse_disk_type(const char *params, const char **_path, int *_type)
+{
+	const int type = tapdisk_disktype_parse_params(params, _path);
+
+	if (type >= 0) {
+		*_type = type;
+		return 0;
+	}
+
+	return type;
+}
--- /dev/null
+/*
+ * Copyright (c) 2007, 2010, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __DISKTYPES_H__
+#define __DISKTYPES_H__
+
+/*
+ * Disk type identifiers.  The values are used as array indices into
+ * tapdisk_disk_types[] and tapdisk_disk_drivers[].
+ */
+#define DISK_TYPE_AIO 0
+#define DISK_TYPE_SYNC 1
+#define DISK_TYPE_VMDK 2
+#define DISK_TYPE_VHDSYNC 3
+#define DISK_TYPE_VHD 4
+#define DISK_TYPE_RAM 5
+#define DISK_TYPE_QCOW 6
+#define DISK_TYPE_BLOCK_CACHE 7
+#define DISK_TYPE_LOG 8
+#define DISK_TYPE_REMUS 9
+#define DISK_TYPE_VINDEX 10
+
+/* size of the buffer used to hold a disk type name, terminator included */
+#define DISK_TYPE_NAME_MAX 32
+
+/* Static description of one disk type. */
+typedef struct disk_info {
+ const char *name; /* driver name, e.g. 'aio' */
+ char *desc; /* e.g. "raw image" */
+ unsigned int flags; /* presumably a mask of DISK_TYPE_* flag bits such as DISK_TYPE_SINGLE_CONTROLLER -- confirm */
+} disk_info_t;
+
+/* tables indexed by DISK_TYPE_* value */
+extern const disk_info_t *tapdisk_disk_types[];
+extern const struct tap_disk *tapdisk_disk_drivers[];
+
+/* one single controller for all instances of disk type */
+#define DISK_TYPE_SINGLE_CONTROLLER (1<<0)
+
+/* look up a type index by name; negative errno value on failure */
+int tapdisk_disktype_find(const char *name);
+/* split "type:path"; returns the type index or a negative errno value */
+int tapdisk_disktype_parse_params(const char *params, const char **_path);
+/* as above, but returns 0 on success and stores the type through the int pointer */
+int tapdisk_parse_disk_type(const char *, const char **, int *);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdlib.h>
+
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-disktype.h"
+
+/*
+ * Allocate a driver handle for disk type 'type'.
+ *
+ * The handle gets its own copy of 'name' (via tapdisk_namedup()) and a
+ * zeroed private data area of ops->private_data_size bytes.  When 'flags'
+ * contains TD_OPEN_RDONLY the handle is marked TD_DRIVER_RDONLY.
+ *
+ * Returns the new handle, or NULL when the type has no driver or an
+ * allocation fails.  'type' is used as an index without a range check.
+ */
+td_driver_t *
+tapdisk_driver_allocate(int type, char *name, td_flag_t flags, int storage)
+{
+	const struct tap_disk *ops = tapdisk_disk_drivers[type];
+	td_driver_t *driver;
+
+	if (ops == NULL)
+		return NULL;
+
+	driver = calloc(1, sizeof(*driver));
+	if (driver == NULL)
+		return NULL;
+
+	if (tapdisk_namedup(&driver->name, name) != 0)
+		goto fail;
+
+	driver->ops     = ops;
+	driver->type    = type;
+	driver->storage = storage;
+
+	driver->data = calloc(1, ops->private_data_size);
+	if (driver->data == NULL)
+		goto fail;
+
+	if (td_flag_test(flags, TD_OPEN_RDONLY))
+		td_flag_set(driver->state, TD_DRIVER_RDONLY);
+
+	return driver;
+
+fail:
+	/* members that were never set are still NULL from calloc() */
+	free(driver->name);
+	free(driver->data);
+	free(driver);
+	return NULL;
+}
+
+/*
+ * Free a driver handle together with its name and private data.
+ *
+ * Does nothing for a NULL handle or while the reference count is
+ * non-zero.  Freeing a handle still flagged TD_DRIVER_OPEN is logged but
+ * carried out regardless.
+ */
+void
+tapdisk_driver_free(td_driver_t *driver)
+{
+	if (driver == NULL || driver->refcnt)
+		return;
+
+	if (td_flag_test(driver->state, TD_DRIVER_OPEN))
+		EPRINTF("freeing open driver %s (state 0x%08x)\n",
+			driver->name, driver->state);
+
+	free(driver->name);
+	free(driver->data);
+	free(driver);
+}
+
+/*
+ * Pass 'tiocb' on to the server's queue.  The 'driver' argument is not
+ * used.
+ */
+void
+tapdisk_driver_queue_tiocb(td_driver_t *driver, struct tiocb *tiocb)
+{
+	tapdisk_server_queue_tiocb(tiocb);
+}
+
+/*
+ * Invoke the driver's td_debug hook, if the driver provides one.
+ */
+void
+tapdisk_driver_debug(td_driver_t *driver)
+{
+	if (driver->ops->td_debug == NULL)
+		return;
+
+	driver->ops->td_debug(driver);
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_DRIVER_H_
+#define _TAPDISK_DRIVER_H_
+
+#include "tapdisk.h"
+#include "scheduler.h"
+#include "tapdisk-queue.h"
+
+/* bits stored in the 'state' member of struct td_driver_handle */
+#define TD_DRIVER_OPEN 0x0001
+#define TD_DRIVER_RDONLY 0x0002
+
+/*
+ * Per-driver state.  Handles are created by tapdisk_driver_allocate() and
+ * destroyed by tapdisk_driver_free().
+ */
+struct td_driver_handle {
+ int type; /* DISK_TYPE_* index the handle was allocated for */
+ char *name; /* copy of the name given at allocation */
+
+ int storage; /* storage value given at allocation */
+
+ int refcnt; /* tapdisk_driver_free() is a no-op while non-zero */
+ td_flag_t state; /* TD_DRIVER_* bits */
+
+ td_disk_info_t info;
+
+ void *data; /* zeroed area of ops->private_data_size bytes */
+ const struct tap_disk *ops; /* driver implementation for 'type' */
+
+ struct list_head next;
+};
+
+/* arguments: disk type index, name, open flags, storage; NULL on failure */
+td_driver_t *tapdisk_driver_allocate(int, char *, td_flag_t, int);
+/* no-op for NULL or for a handle with a non-zero reference count */
+void tapdisk_driver_free(td_driver_t *);
+
+/* hands the tiocb to the server queue */
+void tapdisk_driver_queue_tiocb(td_driver_t *, struct tiocb *);
+
+/* calls the driver's td_debug hook when present */
+void tapdisk_driver_debug(td_driver_t *);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <libaio.h>
+#include <syslog.h>
+#include <sys/time.h>
+
+#include "tapdisk-log.h"
+#include "tapdisk-filter.h"
+
+/* fixed seed used when fault injection is initialized */
+#define RSEED 7
+/* check phases: before submission (PRE) and after completion (POST) */
+#define PRE_CHECK 0
+#define POST_CHECK 1
+
+/* messages logged when a sector checksum does not match the recorded one */
+#define WRITE_INTEGRITY "buffer integrity failure after write"
+#define READ_INTEGRITY "disk integrity failure after read"
+
+/* log at warning level through tlog_write() */
+#define DBG(f, a...) tlog_write(TLOG_WARN, f, ##a)
+
+/*
+ * Simulate an IO error on 'io': stash its byte count and data pointer in
+ * a free fiocb, set the byte count to zero and point io->data at the
+ * fiocb so recover_fault() can undo this after completion.  Does nothing
+ * when no fiocb is free.
+ */
+static inline void
+inject_fault(struct tfilter *filter, struct iocb *io)
+{
+	struct fiocb *saved;
+
+	if (filter->ffree == 0)
+		return;
+
+	/* pop the last free entry */
+	filter->ffree--;
+	saved = filter->flist[filter->ffree];
+
+	saved->bytes = io->u.c.nbytes;
+	saved->data  = io->data;
+
+	io->u.c.nbytes = 0;
+	io->data       = saved;
+}
+
+/*
+ * Tell whether 'io' currently carries an injected fault, i.e. whether its
+ * data pointer lies inside the filter's fiocbs array.  Returns 1 if so,
+ * 0 otherwise.
+ */
+static inline int
+fault_injected(struct tfilter *filter, struct iocb *io)
+{
+	const unsigned long lo   = (unsigned long)filter->fiocbs;
+	const unsigned long hi   = lo + (filter->iocbs * sizeof(struct fiocb));
+	const unsigned long addr = (unsigned long)io->data;
+
+	if (addr < lo)
+		return 0;
+
+	return addr < hi;
+}
+
+/*
+ * Undo inject_fault(): restore the byte count and data pointer saved in
+ * the fiocb that io->data points to, wipe that fiocb and return it to the
+ * free list.
+ */
+static inline void
+recover_fault(struct tfilter *filter, struct iocb *io)
+{
+	struct fiocb *saved = io->data;
+
+	io->u.c.nbytes = saved->bytes;
+	io->data       = saved->data;
+
+	memset(saved, 0, sizeof(*saved));
+
+	/* push back onto the free list */
+	filter->flist[filter->ffree] = saved;
+	filter->ffree++;
+}
+
+/*
+ * Checksum of one 512-byte sector: the sum, modulo 2^64, of the sector
+ * read as 64 consecutive uint64_t words.
+ */
+static inline uint64_t
+chksum(char *buf)
+{
+	uint64_t *word = (uint64_t *)buf;
+	uint64_t *const end = word + (512 >> 3);
+	uint64_t total = 0;
+
+	while (word != end)
+		total += *word++;
+
+	return total;
+}
+
+/*
+ * Compare the checksum of 'buf' against the one recorded for sector 'sec'
+ * and log a message prefixed with 'type' when they differ.  Sectors with
+ * no recorded checksum (timestamp seconds equal to zero) are skipped.
+ *
+ * The caller must make sure 'sec' is within the dhash table.
+ */
+static inline void
+check_hash(struct tfilter *filter, uint64_t sec, char *buf, char *type)
+{
+	uint64_t sum;
+	struct dhash *hash;
+
+	hash = filter->dhash + sec;
+	if (!hash->time.tv_sec)
+		return;
+
+	/* compute once; the value is both compared and logged */
+	sum = chksum(buf);
+	if (hash->hash != sum) {
+		struct timeval now;
+		gettimeofday(&now, NULL);
+		DBG("%s: hash table: 0x%020" PRIx64 " at %012lu.%06llu, "
+		    "from disk: 0x%020" PRIx64 " at %012lu.%06llu\n",
+		    type, hash->hash, (unsigned long)hash->time.tv_sec,
+		    (unsigned long long)hash->time.tv_usec, sum,
+		    (unsigned long)now.tv_sec, (unsigned long long)now.tv_usec);
+	}
+}
+
+/*
+ * Record the checksum of 'buf' and the current time for sector 'sec'.
+ * The caller must make sure 'sec' is within the dhash table.
+ */
+static inline void
+insert_hash(struct tfilter *filter, uint64_t sec, char *buf)
+{
+	struct dhash *entry = &filter->dhash[sec];
+
+	entry->hash = chksum(buf);
+	gettimeofday(&entry->time, NULL);
+}
+
+/*
+ * Run the integrity bookkeeping for one sector.
+ *
+ *   write, PRE_CHECK  : record the checksum of the data about to be written
+ *   write, POST_CHECK : verify the buffer still has the recorded checksum
+ *   read,  POST_CHECK : verify against the recorded checksum, then record
+ *                       the checksum of what was read
+ *   read,  PRE_CHECK  : nothing
+ *
+ * 'rw' is non-zero for writes.  Sectors at or beyond filter->secs are
+ * ignored.
+ */
+static void
+check_sector(struct tfilter *filter, int type, int rw, uint64_t sec, char *buf)
+{
+	/* outside the tracked range */
+	if (sec >= filter->secs)
+		return;
+
+	if (rw) {
+		if (type == PRE_CHECK)
+			insert_hash(filter, sec, buf);
+		else
+			check_hash(filter, sec, buf, WRITE_INTEGRITY);
+	} else if (type == POST_CHECK) {
+		check_hash(filter, sec, buf, READ_INTEGRITY);
+		insert_hash(filter, sec, buf);
+	}
+}
+
+/*
+ * Apply check_sector() to every 512-byte sector covered by 'io'.  'type'
+ * is PRE_CHECK or POST_CHECK; the request counts as a write when its
+ * opcode is IO_CMD_PWRITE.
+ */
+static void
+check_data(struct tfilter *filter, int type, struct iocb *io)
+{
+	int rw;
+	uint64_t i;
+	/* byte-addressable view of the buffer (no arithmetic on void *) */
+	char *base = (char *)io->u.c.buf;
+
+	rw = (io->aio_lio_opcode == IO_CMD_PWRITE);
+
+	for (i = 0; i < io->u.c.nbytes; i += 512) {
+		char *buf = base + i;
+		uint64_t sec = (io->u.c.offset + i) >> 9;
+		check_sector(filter, type, rw, sec, buf);
+	}
+}
+
+/*
+ * Create a filter for the given mode bits (TD_INJECT_FAULTS and/or
+ * TD_CHECK_INTEGRITY).
+ *
+ * 'iocbs' is the number of fault-injection slots to allocate and 'secs'
+ * the number of sectors to keep checksums for.  If the memory for a mode
+ * cannot be allocated, that mode bit is cleared and the filter is still
+ * returned.
+ *
+ * Returns NULL when 'mode' is zero or the filter itself cannot be
+ * allocated.  Release the result with tapdisk_free_tfilter().
+ */
+struct tfilter *
+tapdisk_init_tfilter(int mode, int iocbs, uint64_t secs)
+{
+	int i;
+	struct tfilter *filter = NULL;
+
+	if (!mode)
+		return NULL;
+
+	filter = calloc(1, sizeof(struct tfilter));
+	if (!filter)
+		goto fail;
+
+	filter->mode = mode;
+	filter->secs = secs;
+	filter->iocbs = iocbs;
+
+	if (filter->mode & TD_INJECT_FAULTS) {
+		filter->fiocbs = calloc(iocbs, sizeof(struct fiocb));
+		filter->flist = calloc(iocbs, sizeof(struct fiocb *));
+		if (!filter->fiocbs || !filter->flist)
+			filter->mode &= ~TD_INJECT_FAULTS;
+		else {
+			/* the fault decision draws from random(), which is
+			 * seeded by srandom(), not srand() */
+			srandom(RSEED);
+			filter->ffree = iocbs;
+			for (i = 0; i < iocbs; i++)
+				filter->flist[i] = filter->fiocbs + i;
+		}
+	}
+
+	if (filter->mode & TD_CHECK_INTEGRITY) {
+		filter->dhash = calloc(secs, sizeof(struct dhash));
+		if (!filter->dhash)
+			filter->mode &= ~TD_CHECK_INTEGRITY;
+	}
+
+	syslog(LOG_WARNING, "WARNING: "
+	       "FILTERING IN MODE 0x%04x\n", filter->mode);
+
+	return filter;
+
+ fail:
+	tapdisk_free_tfilter(filter);
+	return NULL;
+}
+
+/*
+ * Free a filter and the tables it owns.  A NULL filter is accepted.
+ */
+void
+tapdisk_free_tfilter(struct tfilter *filter)
+{
+	if (filter != NULL) {
+		free(filter->dhash);
+		free(filter->flist);
+		free(filter->fiocbs);
+		free(filter);
+	}
+}
+
+/*
+ * Filter 'num' iocbs before they are submitted.
+ *
+ * With TD_INJECT_FAULTS set, a request is picked for fault injection when
+ * random() % 100 is at most TD_FAULT_RATE; picked requests skip the
+ * integrity check.  With TD_CHECK_INTEGRITY set, the remaining requests
+ * get their PRE_CHECK.  A NULL filter makes this a no-op.
+ *
+ * NOTE(review): the "<=" test selects TD_FAULT_RATE + 1 values out of 100
+ * -- confirm that is the intended rate.
+ */
+void
+tapdisk_filter_iocbs(struct tfilter *filter, struct iocb **iocbs, int num)
+{
+	int n;
+
+	if (filter == NULL)
+		return;
+
+	for (n = 0; n < num; n++) {
+		struct iocb *io = iocbs[n];
+
+		if ((filter->mode & TD_INJECT_FAULTS) &&
+		    (random() % 100) <= TD_FAULT_RATE) {
+			inject_fault(filter, io);
+			continue;
+		}
+
+		if (filter->mode & TD_CHECK_INTEGRITY)
+			check_data(filter, PRE_CHECK, io);
+	}
+}
+
+/*
+ * Filter 'num' completion events.
+ *
+ * With TD_INJECT_FAULTS set, requests that carry an injected fault are
+ * restored to their original state and skip the integrity check.  With
+ * TD_CHECK_INTEGRITY set, the remaining requests get their POST_CHECK.
+ * A NULL filter makes this a no-op.
+ */
+void
+tapdisk_filter_events(struct tfilter *filter, struct io_event *events, int num)
+{
+	int n;
+
+	if (filter == NULL)
+		return;
+
+	for (n = 0; n < num; n++) {
+		struct iocb *io = events[n].obj;
+
+		if ((filter->mode & TD_INJECT_FAULTS) &&
+		    fault_injected(filter, io)) {
+			recover_fault(filter, io);
+			continue;
+		}
+
+		if (filter->mode & TD_CHECK_INTEGRITY)
+			check_data(filter, POST_CHECK, io);
+	}
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef TAPDISK_FILTER_H
+#define TAPDISK_FILTER_H
+
+#include <libaio.h>
+#include <inttypes.h>
+#include <time.h>
+
+#define TD_INJECT_FAULTS 0x00001 /* simulate random IO failures */
+#define TD_CHECK_INTEGRITY 0x00002 /* check data integrity */
+
+#define TD_FAULT_RATE 5
+
+/*
+ * Integrity-check record: a 64-bit hash paired with a timestamp.
+ * tapdisk_init_tfilter() allocates an array of "secs" of these --
+ * presumably one per sector; confirm against check_data().
+ */
+struct dhash {
+ uint64_t hash;
+ struct timeval time;
+};
+
+/*
+ * A byte count plus an opaque data pointer.
+ * NOTE(review): presumably holds the saved state of an iocb that had a
+ * fault injected -- confirm against inject_fault()/recover_fault().
+ */
+struct fiocb {
+ size_t bytes;
+ void *data;
+};
+
+/*
+ * Filter state.  dhash, fiocbs and flist are heap arrays released by
+ * tapdisk_free_tfilter().
+ */
+struct tfilter {
+ int mode; /* bitmask of TD_INJECT_FAULTS / TD_CHECK_INTEGRITY */
+ uint64_t secs; /* NOTE(review): presumably the "secs" passed to tapdisk_init_tfilter() */
+ int iocbs; /* NOTE(review): presumably the "iocbs" passed to tapdisk_init_tfilter() */
+
+ struct dhash *dhash; /* allocated with "secs" elements by tapdisk_init_tfilter() */
+
+ int ffree; /* NOTE(review): presumably the number of free entries in flist -- confirm */
+ struct fiocb *fiocbs;
+ struct fiocb **flist;
+};
+
+/* Create a filter; returns NULL on failure. */
+struct tfilter *tapdisk_init_tfilter(int mode, int iocbs, uint64_t secs);
+/* Free a filter and its arrays; NULL is ignored. */
+void tapdisk_free_tfilter(struct tfilter *);
+/* Filter a batch of iocbs (fault injection / PRE_CHECK); NULL filter is a no-op. */
+void tapdisk_filter_iocbs(struct tfilter *, struct iocb **, int);
+/* Filter a batch of completion events (fault recovery / POST_CHECK); NULL filter is a no-op. */
+void tapdisk_filter_events(struct tfilter *, struct io_event *, int);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#ifdef MEMSHR
+#include <memshr.h>
+#endif
+
+#include "tapdisk-image.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/*
+ * Build a zero-initialised image handle for "file".
+ *
+ * The name is duplicated with tapdisk_namedup(); type, storage, flags
+ * and private are stored as given and the list node is initialised.
+ * Returns NULL when the allocation or the name duplication fails.
+ */
+td_image_t *
+tapdisk_image_allocate(const char *file, int type, int storage,
+		       td_flag_t flags, void *private)
+{
+	td_image_t *img = calloc(1, sizeof(*img));
+
+	if (img == NULL)
+		return NULL;
+
+	if (tapdisk_namedup(&img->name, file) != 0) {
+		free(img);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&img->next);
+	img->private = private;
+	img->storage = storage;
+	img->flags   = flags;
+	img->type    = type;
+#ifdef MEMSHR
+	img->memshr_id = memshr_vbd_image_get(file);
+#endif
+
+	return img;
+}
+
+/*
+ * Unlink an image handle from its list and release it, its name and
+ * its driver (via tapdisk_driver_free()).  NULL is ignored.
+ */
+void
+tapdisk_image_free(td_image_t *image)
+{
+	if (image != NULL) {
+		list_del(&image->next);
+#ifdef MEMSHR
+		memshr_vbd_image_put(image->memshr_id);
+#endif
+		free(image->name);
+		tapdisk_driver_free(image->driver);
+		free(image);
+	}
+}
+
+/*
+ * Validate a td request against an image.
+ *
+ * Returns -ENODEV when the image has no driver, 0 when the request is
+ * acceptable, and -EINVAL (after logging) when the operation is not a
+ * read or write, is a write to a read-only image, has a non-positive
+ * sector count, or ends past the driver's reported size.
+ */
+int
+tapdisk_image_check_td_request(td_image_t *image, td_request_t treq)
+{
+	td_driver_t *drv = image->driver;
+	td_disk_info_t *di;
+	int ro, bad;
+
+	if (drv == NULL)
+		return -ENODEV;
+
+	di = &drv->info;
+	ro = td_flag_test(image->flags, TD_OPEN_RDONLY);
+
+	bad = (treq.op != TD_OP_READ && treq.op != TD_OP_WRITE) ||
+	      (treq.op == TD_OP_WRITE && ro) ||
+	      (treq.secs <= 0) ||
+	      (treq.sec + treq.secs > di->size);
+	if (!bad)
+		return 0;
+
+	ERR(-EINVAL, "bad td request on %s (%s, %"PRIu64"): %d at %"PRIu64,
+	    image->name, (ro ? "ro" : "rw"), di->size, treq.op,
+	    treq.sec + treq.secs);
+	return -EINVAL;
+}
+
+/*
+ * Validate a blkif ring request against an image.
+ *
+ * Returns -ENODEV when the image has no driver, 0 when the request is
+ * acceptable and -EINVAL (after logging) otherwise.  A request is
+ * rejected when:
+ *   - the operation is neither BLKIF_OP_READ nor BLKIF_OP_WRITE,
+ *   - it is a write to an image opened read-only,
+ *   - nr_segments is zero or exceeds MAX_SEGMENTS_PER_REQ,
+ *   - a segment has first_sect > last_sect, or last_sect lies beyond
+ *     the last 512-byte sector of a page,
+ *   - the summed sector count of all segments, starting at
+ *     sector_number, runs past the driver's reported size.
+ */
+int
+tapdisk_image_check_ring_request(td_image_t *image, blkif_request_t *req)
+{
+	td_driver_t *driver;
+	td_disk_info_t *info;
+	int i, psize, rdonly;
+	uint64_t nsects, total;
+
+	driver = image->driver;
+	if (!driver)
+		return -ENODEV;
+
+	total  = 0;
+	info   = &driver->info;
+	rdonly = td_flag_test(image->flags, TD_OPEN_RDONLY);
+
+	if (req->operation != BLKIF_OP_READ &&
+	    req->operation != BLKIF_OP_WRITE)
+		goto fail;
+
+	if (req->operation == BLKIF_OP_WRITE && rdonly)
+		goto fail;
+
+	if (!req->nr_segments || req->nr_segments > MAX_SEGMENTS_PER_REQ)
+		goto fail;
+
+	psize = getpagesize();
+
+	for (i = 0; i < req->nr_segments; i++) {
+		/*
+		 * Compare the bounds before subtracting: an inverted
+		 * segment would otherwise turn into a huge unsigned
+		 * sector count that a "<= 0" test cannot catch.
+		 */
+		if (req->seg[i].first_sect > req->seg[i].last_sect ||
+		    req->seg[i].last_sect >= (psize >> 9))
+			goto fail;
+
+		nsects = req->seg[i].last_sect - req->seg[i].first_sect + 1;
+		total += nsects;
+	}
+
+	/*
+	 * Bound the whole request, not just its last segment, and phrase
+	 * the test so that sector_number + total cannot wrap.
+	 */
+	if (total > info->size || req->sector_number > info->size - total)
+		goto fail;
+
+	return 0;
+
+fail:
+	ERR(-EINVAL, "bad request on %s (%s, %"PRIu64"): id: %"PRIu64": %d at %"PRIu64,
+	    image->name, (rdonly ? "ro" : "rw"), info->size, req->id,
+	    req->operation, req->sector_number + total);
+	return -EINVAL;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_IMAGE_H_
+#define _TAPDISK_IMAGE_H_
+
+#include "tapdisk.h"
+#include <xen/io/blkif.h>
+
+/*
+ * Per-image handle.  Created by tapdisk_image_allocate() and released
+ * by tapdisk_image_free().
+ */
+struct td_image_handle {
+ int type; /* as passed to tapdisk_image_allocate() */
+ char *name; /* heap copy of the file name; freed with the handle */
+ uint16_t memshr_id; /* only assigned when built with MEMSHR */
+
+ td_flag_t flags; /* open flags, e.g. TD_OPEN_RDONLY */
+ int storage; /* as passed to tapdisk_image_allocate() */
+
+ td_driver_t *driver; /* attached by td_load()/__td_open(); NULL until then */
+ td_disk_info_t info; /* copy of driver->info taken when the driver is attached */
+
+ void *private; /* caller-supplied pointer, stored untouched */
+
+ struct list_head next; /* list linkage; unlinked in tapdisk_image_free() */
+};
+
+/* Allocate a handle for (file, type, storage, flags, private); NULL on failure. */
+td_image_t *tapdisk_image_allocate(const char *, int, int, td_flag_t, void *);
+/* Unlink and free a handle, its name and its driver; NULL is ignored. */
+void tapdisk_image_free(td_image_t *);
+
+/* Request validators: 0 if acceptable, -ENODEV without a driver, -EINVAL otherwise. */
+int tapdisk_image_check_td_request(td_image_t *, td_request_t);
+int tapdisk_image_check_ring_request(td_image_t *, blkif_request_t *);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+
+#include "tapdisk.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-image.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+
+/*
+ * Attach "image" to the driver of an already-registered shared image.
+ *
+ * Returns -ENODEV when tapdisk_server_get_shared_image() finds nothing
+ * and -EBADF when the shared image has no driver.  On success the
+ * driver's reference count is bumped, the image takes a copy of the
+ * driver's disk info, and 0 is returned.
+ */
+int
+td_load(td_image_t *image)
+{
+	td_image_t *peer = tapdisk_server_get_shared_image(image);
+	td_driver_t *drv;
+
+	if (peer == NULL)
+		return -ENODEV;
+
+	drv = peer->driver;
+	if (drv == NULL)
+		return -EBADF;
+
+	image->info   = drv->info;
+	image->driver = drv;
+	drv->refcnt  += 1;
+
+	DPRINTF("loaded shared image %s (%d users, state: 0x%08x, type: %d)\n",
+		drv->name, drv->refcnt, drv->state, drv->type);
+	return 0;
+}
+
+/*
+ * Open the driver behind "image", allocating one if the image has none.
+ *
+ * info: optional disk info copied into a freshly allocated driver
+ *       before its td_open hook runs; ignored when the image already
+ *       has a driver.
+ *
+ * Returns 0 on success, -ENOMEM if no driver could be allocated, or the
+ * error returned by the driver's td_open hook.  On success the driver's
+ * reference count is incremented and image->info is refreshed from the
+ * driver.
+ */
+int
+__td_open(td_image_t *image, td_disk_info_t *info)
+{
+ int err;
+ td_driver_t *driver;
+
+ driver = image->driver;
+ if (!driver) {
+ driver = tapdisk_driver_allocate(image->type,
+ image->name,
+ image->flags,
+ image->storage);
+ if (!driver)
+ return -ENOMEM;
+
+ if (info) /* pre-seed driver->info for virtual drivers */
+ driver->info = *info;
+ }
+
+ /* the open hook only runs while the driver is not yet marked open */
+ if (!td_flag_test(driver->state, TD_DRIVER_OPEN)) {
+ err = driver->ops->td_open(driver, image->name, image->flags);
+ if (err) {
+ /* only free a driver allocated above; image->driver is still unset then */
+ if (!image->driver)
+ tapdisk_driver_free(driver);
+ return err;
+ }
+
+ td_flag_set(driver->state, TD_DRIVER_OPEN);
+ /* refcnt is bumped below, hence the "+ 1" in the reported user count */
+ DPRINTF("opened image %s (%d users, state: 0x%08x, type: %d)\n",
+ driver->name, driver->refcnt + 1,
+ driver->state, driver->type);
+ }
+
+ image->driver = driver;
+ image->info = driver->info;
+ driver->refcnt++;
+ return 0;
+}
+
+/* Open "image" without pre-seeded disk info; see __td_open() for results. */
+int
+td_open(td_image_t *image)
+{
+	td_disk_info_t *no_info = NULL;
+
+	return __td_open(image, no_info);
+}
+
+/*
+ * Drop one reference on the image's driver.
+ *
+ * When the count reaches zero and the driver is marked open, its
+ * td_close hook is invoked and the TD_DRIVER_OPEN flag is cleared.
+ * Returns -ENODEV if the image has no driver, 0 otherwise.
+ */
+int
+td_close(td_image_t *image)
+{
+	td_driver_t *drv = image->driver;
+
+	if (drv == NULL)
+		return -ENODEV;
+
+	drv->refcnt -= 1;
+	if (drv->refcnt == 0) {
+		if (td_flag_test(drv->state, TD_DRIVER_OPEN)) {
+			drv->ops->td_close(drv);
+			td_flag_clear(drv->state, TD_DRIVER_OPEN);
+		}
+	}
+
+	DPRINTF("closed image %s (%d users, state: 0x%08x, type: %d)\n",
+		drv->name, drv->refcnt, drv->state, drv->type);
+
+	return 0;
+}
+
+/*
+ * Ask the image's driver for its parent id.
+ *
+ * Returns -ENODEV if the image has no driver, -EBADF if the driver is
+ * not open, otherwise whatever the driver's td_get_parent_id hook
+ * returns.
+ */
+int
+td_get_parent_id(td_image_t *image, td_disk_id_t *id)
+{
+	td_driver_t *drv = image->driver;
+
+	if (drv == NULL)
+		return -ENODEV;
+
+	if (td_flag_test(drv->state, TD_DRIVER_OPEN))
+		return drv->ops->td_get_parent_id(drv, id);
+
+	return -EBADF;
+}
+
+/*
+ * Check that "image" and "parent" both have open drivers.
+ *
+ * Returns -ENODEV if either driver is missing, -EBADF if either is not
+ * open, and 0 otherwise.
+ *
+ * NOTE(review): the unconditional "return 0" makes the call to the
+ * driver's td_validate_parent hook unreachable, so no driver-level
+ * validation is performed.  This looks like a deliberate short-circuit;
+ * confirm before removing either line.
+ */
+int
+td_validate_parent(td_image_t *image, td_image_t *parent)
+{
+ td_driver_t *driver, *pdriver;
+
+ driver = image->driver;
+ pdriver = parent->driver;
+ if (!driver || !pdriver)
+ return -ENODEV;
+
+ if (!td_flag_test(driver->state, TD_DRIVER_OPEN) ||
+ !td_flag_test(pdriver->state, TD_DRIVER_OPEN))
+ return -EBADF;
+
+ return 0;
+ /* unreachable: see NOTE(review) above */
+ return driver->ops->td_validate_parent(driver, pdriver, 0);
+}
+
+/*
+ * Hand a write request to the image's driver.
+ *
+ * The request is completed immediately with an error instead when the
+ * image has no driver (-ENODEV), the driver is not open (-EBADF), or
+ * tapdisk_image_check_td_request() rejects it.
+ */
+void
+td_queue_write(td_image_t *image, td_request_t treq)
+{
+	td_driver_t *drv = image->driver;
+	int rc;
+
+	if (drv == NULL)
+		rc = -ENODEV;
+	else if (!td_flag_test(drv->state, TD_DRIVER_OPEN))
+		rc = -EBADF;
+	else
+		rc = tapdisk_image_check_td_request(image, treq);
+
+	if (rc != 0) {
+		td_complete_request(treq, rc);
+		return;
+	}
+
+	drv->ops->td_queue_write(drv, treq);
+}
+
+/*
+ * Hand a read request to the image's driver.
+ *
+ * The request is completed immediately with an error instead when the
+ * image has no driver (-ENODEV), the driver is not open (-EBADF), or
+ * tapdisk_image_check_td_request() rejects it.
+ */
+void
+td_queue_read(td_image_t *image, td_request_t treq)
+{
+	td_driver_t *drv = image->driver;
+	int rc;
+
+	if (drv == NULL)
+		rc = -ENODEV;
+	else if (!td_flag_test(drv->state, TD_DRIVER_OPEN))
+		rc = -EBADF;
+	else
+		rc = tapdisk_image_check_td_request(image, treq);
+
+	if (rc != 0) {
+		td_complete_request(treq, rc);
+		return;
+	}
+
+	drv->ops->td_queue_read(drv, treq);
+}
+
+/* Pass a request on unchanged to tapdisk_vbd_forward_request(). */
+void td_forward_request(td_request_t treq)
+{
+	tapdisk_vbd_forward_request(treq);
+}
+
+/* Invoke the request's completion callback with result "res". */
+void
+td_complete_request(td_request_t treq, int res)
+{
+	td_callback_t done = (td_callback_t)treq.cb;
+
+	done(treq, res);
+}
+
+/* Queue a prepared tiocb on "driver" via tapdisk_driver_queue_tiocb(). */
+void td_queue_tiocb(td_driver_t *driver, struct tiocb *tiocb)
+{
+	tapdisk_driver_queue_tiocb(driver, tiocb);
+}
+
+/*
+ * Prepare "tiocb" for a read of "bytes" bytes at "offset" on "fd" into
+ * "buf"; "cb" and "arg" are passed through to tapdisk_prep_tiocb().
+ */
+void
+td_prep_read(struct tiocb *tiocb, int fd, char *buf, size_t bytes,
+	     long long offset, td_queue_callback_t cb, void *arg)
+{
+	/* third argument: 0 here, 1 in td_prep_write() */
+	const int write_op = 0;
+
+	tapdisk_prep_tiocb(tiocb, fd, write_op, buf, bytes, offset, cb, arg);
+}
+
+/*
+ * Prepare "tiocb" for a write of "bytes" bytes from "buf" at "offset"
+ * on "fd"; "cb" and "arg" are passed through to tapdisk_prep_tiocb().
+ */
+void
+td_prep_write(struct tiocb *tiocb, int fd, char *buf, size_t bytes,
+	      long long offset, td_queue_callback_t cb, void *arg)
+{
+	/* third argument: 1 here, 0 in td_prep_read() */
+	const int write_op = 1;
+
+	tapdisk_prep_tiocb(tiocb, fd, write_op, buf, bytes, offset, cb, arg);
+}
+
+/*
+ * Call tapdisk_driver_debug() on the image's driver; does nothing
+ * unless the image has a driver that is marked open.
+ */
+void
+td_debug(td_image_t *image)
+{
+	td_driver_t *drv = image->driver;
+
+	if (drv == NULL)
+		return;
+
+	if (td_flag_test(drv->state, TD_DRIVER_OPEN))
+		tapdisk_driver_debug(drv);
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_INTERFACE_H_
+#define _TAPDISK_INTERFACE_H_
+
+#include "tapdisk.h"
+#include "tapdisk-queue.h"
+
+/* Image lifecycle: each returns 0 on success or a negative errno value. */
+int td_open(td_image_t *);
+int __td_open(td_image_t *, td_disk_info_t *);
+int td_load(td_image_t *);
+int td_close(td_image_t *);
+int td_get_parent_id(td_image_t *, td_disk_id_t *);
+int td_validate_parent(td_image_t *, td_image_t *);
+
+/* Request path: failures are reported through the request's callback. */
+void td_queue_write(td_image_t *, td_request_t);
+void td_queue_read(td_image_t *, td_request_t);
+void td_forward_request(td_request_t);
+void td_complete_request(td_request_t, int);
+
+void td_debug(td_image_t *);
+
+/* tiocb helpers: arguments are (tiocb, fd, buf, bytes, offset, cb, arg). */
+void td_queue_tiocb(td_driver_t *, struct tiocb *);
+void td_prep_read(struct tiocb *, int, char *, size_t,
+ long long, td_queue_callback_t, void *);
+void td_prep_write(struct tiocb *, int, char *, size_t,
+ long long, td_queue_callback_t, void *);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <syslog.h>
+#include <inttypes.h>
+#include <sys/time.h>
+
+#include "tapdisk-log.h"
+#include "tapdisk-utils.h"
+
+#define MAX_ENTRY_LEN 512
+#define MAX_ERROR_MESSAGES 16
+
+/* One recorded error; repeats of the same (err, func) pair only bump cnt. */
+struct error {
+ int cnt; /* number of times this error was reported */
+ int err; /* errno value, stored as a positive number */
+ char *func; /* reporting function's name pointer (compared by address) */
+ char msg[MAX_ENTRY_LEN]; /* formatted text of the first occurrence */
+};
+
+/* Fixed-size table of distinct errors seen so far. */
+struct ehandle {
+ int cnt; /* number of slots of errors[] in use */
+ int dropped; /* reports discarded because the table was full */
+ struct error errors[MAX_ERROR_MESSAGES];
+};
+
+/* State of the in-memory log buffer. */
+struct tlog {
+ char *p; /* write cursor inside buf */
+ int size; /* buffer size in bytes, rounded up to a multiple of 512 */
+ uint64_t cnt; /* running entry counter, printed at the start of each entry */
+ char *buf; /* 512-byte-aligned buffer; NULL while logging is closed */
+ int level; /* entries with a level above this are discarded */
+ char *file; /* output path: "<file>.<pid>" */
+ int append; /* non-zero: flush when the buffer fills and on close, appending to the file */
+};
+
+static struct ehandle tapdisk_err;
+static struct tlog tapdisk_log;
+
+/*
+ * Set up the in-memory log.
+ *
+ * file:   base path of the log file; the pid is appended ("<file>.<pid>").
+ * bytes:  requested buffer size, rounded up to a multiple of 512.
+ * level:  highest level that will be recorded.
+ * append: non-zero to flush the buffer to the file whenever it fills
+ *         and on close, appending instead of truncating.
+ *
+ * On any failure the log is left disabled (tapdisk_log.buf == NULL)
+ * and nothing is reported to the caller.  A request that rounds to a
+ * zero-byte buffer is treated as a failure, because every log entry
+ * needs MAX_ENTRY_LEN bytes of room.
+ */
+void
+open_tlog(char *file, size_t bytes, int level, int append)
+{
+	size_t size = (bytes + 511) & ~(size_t)511;
+
+	/* size is a multiple of 512, so non-zero implies >= MAX_ENTRY_LEN */
+	if (!size)
+		return;
+
+	if (asprintf(&tapdisk_log.file, "%s.%d", file, getpid()) == -1) {
+		/* asprintf leaves the pointer undefined on failure */
+		tapdisk_log.file = NULL;
+		return;
+	}
+
+	if (posix_memalign((void **)&tapdisk_log.buf, 512, size)) {
+		free(tapdisk_log.file);
+		tapdisk_log.file = NULL;
+		tapdisk_log.buf  = NULL;
+		return;
+	}
+
+	memset(tapdisk_log.buf, 0, size);
+
+	tapdisk_log.size   = size;
+	tapdisk_log.p      = tapdisk_log.buf;
+	tapdisk_log.level  = level;
+	tapdisk_log.append = append;
+}
+
+/*
+ * Shut the log down: flush it first when in append mode, then free the
+ * buffer and file name and zero the log state.  Does nothing if the
+ * log is not open.
+ */
+void
+close_tlog(void)
+{
+	if (tapdisk_log.buf == NULL)
+		return;
+
+	if (tapdisk_log.append != 0)
+		tlog_flush();
+
+	free(tapdisk_log.file);
+	free(tapdisk_log.buf);
+
+	memset(&tapdisk_log, 0, sizeof(tapdisk_log));
+}
+
+/*
+ * Append one formatted entry to the in-memory log.
+ *
+ * level: entry is discarded if it exceeds the configured log level.
+ * func:  name of the calling function, printed in the entry prefix.
+ * fmt:   printf-style format for the message body.
+ *
+ * Each entry is prefixed with the entry counter and a timestamp and is
+ * truncated to at most MAX_ENTRY_LEN - 1 characters.  Does nothing
+ * while the log is closed.
+ */
+void
+__tlog_write(int level, const char *func, const char *fmt, ...)
+{
+ char *buf;
+ va_list ap;
+ struct timeval t;
+ int ret, len, avail;
+
+ if (!tapdisk_log.buf)
+ return;
+
+ if (level > tapdisk_log.level)
+ return;
+
+ /*
+ * not enough room for a full entry: flush first (append mode only),
+ * then wrap around and overwrite from the start of the buffer
+ */
+ avail = tapdisk_log.size - (tapdisk_log.p - tapdisk_log.buf);
+ if (avail < MAX_ENTRY_LEN) {
+ if (tapdisk_log.append)
+ tlog_flush();
+ tapdisk_log.p = tapdisk_log.buf;
+ }
+
+ buf = tapdisk_log.p;
+ gettimeofday(&t, NULL);
+ len = snprintf(buf, MAX_ENTRY_LEN - 1, "%08"PRIu64":%010ld.%06lld:"
+ "%s ", tapdisk_log.cnt,
+ t.tv_sec, (unsigned long long)t.tv_usec, func);
+
+ va_start(ap, fmt);
+ ret = vsnprintf(buf + len, MAX_ENTRY_LEN - (len + 1), fmt, ap);
+ va_end(ap);
+
+ /* vsnprintf reports the untruncated length; clamp to what was stored */
+ len = (ret < MAX_ENTRY_LEN - (len + 1) ?
+ len + ret : MAX_ENTRY_LEN - 1);
+ buf[len] = '\0';
+
+ tapdisk_log.cnt++;
+ /* the cursor stops on the terminator, so the next entry overwrites it */
+ tapdisk_log.p += len;
+}
+
+/*
+ * Record an error in the fixed-size error table.
+ *
+ * err:  errno value; its sign is ignored.
+ * func: name of the reporting function.
+ * fmt:  printf-style format for the message.
+ *
+ * A repeat of an already recorded (err, func) pair only increments that
+ * entry's counter; the message of the first occurrence is kept.  Once
+ * MAX_ERROR_MESSAGES distinct errors are stored, further new errors are
+ * only counted as dropped.
+ */
+void
+__tlog_error(int err, const char *func, const char *fmt, ...)
+{
+ va_list ap;
+ int i, len, ret;
+ struct error *e;
+ struct timeval t;
+
+ err = (err > 0 ? err : -err);
+
+ for (i = 0; i < tapdisk_err.cnt; i++) {
+ e = &tapdisk_err.errors[i];
+ /* func is compared by address, not by string contents */
+ if (e->err == err && e->func == func) {
+ e->cnt++;
+ return;
+ }
+ }
+
+ if (tapdisk_err.cnt >= MAX_ERROR_MESSAGES) {
+ tapdisk_err.dropped++;
+ return;
+ }
+
+ gettimeofday(&t, NULL);
+ e = &tapdisk_err.errors[tapdisk_err.cnt];
+
+ len = snprintf(e->msg, MAX_ENTRY_LEN - 1, "%010ld.%06lld:%s ",
+ t.tv_sec, (unsigned long long)t.tv_usec, func);
+
+ va_start(ap, fmt);
+ ret = vsnprintf(e->msg + len, MAX_ENTRY_LEN - (len + 1), fmt, ap);
+ va_end(ap);
+
+ /* vsnprintf reports the untruncated length; clamp to what was stored */
+ len = (ret < MAX_ENTRY_LEN - (len + 1) ?
+ len + ret : MAX_ENTRY_LEN - 1);
+ e->msg[len] = '\0';
+
+ e->cnt++;
+ e->err = err;
+ /* only the pointer is kept; the name string itself is not copied */
+ e->func = (char *)func;
+ tapdisk_err.cnt++;
+}
+
+/*
+ * Send every recorded error, followed by the count of dropped ones
+ * (when non-zero), to syslog at LOG_INFO.
+ */
+void
+tlog_print_errors(void)
+{
+	int idx = 0;
+
+	while (idx < tapdisk_err.cnt) {
+		struct error *cur = &tapdisk_err.errors[idx++];
+
+		syslog(LOG_INFO, "TAPDISK ERROR: errno %d at %s (cnt = %d): "
+		       "%s\n", cur->err, cur->func, cur->cnt, cur->msg);
+	}
+
+	if (tapdisk_err.dropped != 0)
+		syslog(LOG_INFO, "TAPDISK ERROR: %d other error messages "
+		       "dropped\n", tapdisk_err.dropped);
+}
+
+/*
+ * Copy every recorded error, followed by the count of dropped ones
+ * (when non-zero), into the in-memory log at TLOG_WARN.
+ */
+void
+tlog_flush_errors(void)
+{
+	int idx = 0;
+
+	while (idx < tapdisk_err.cnt) {
+		struct error *cur = &tapdisk_err.errors[idx++];
+
+		tlog_write(TLOG_WARN, "TAPDISK ERROR: errno %d at %s "
+			   "(cnt = %d): %s\n", cur->err, cur->func, cur->cnt,
+			   cur->msg);
+	}
+
+	if (tapdisk_err.dropped != 0)
+		tlog_write(TLOG_WARN, "TAPDISK ERROR: %d other error messages "
+			   "dropped\n", tapdisk_err.dropped);
+}
+
+/*
+ * Write the in-memory log to its file and rewind the buffer.
+ *
+ * The file is opened with O_DIRECT, so the write length is rounded up
+ * to a multiple of 512 and the tail is padded with newlines.  In append
+ * mode the data goes to the end of the file; otherwise the file is
+ * truncated first.  Recorded errors are copied into the buffer just
+ * before it is written.  Failures to open or seek are silent.
+ *
+ * NOTE(review): tlog_flush_errors() logs through __tlog_write(), which
+ * itself calls tlog_flush() in append mode when fewer than
+ * MAX_ENTRY_LEN bytes are free.  Since the cursor is only rewound at
+ * the end of this function, a flush triggered by a full buffer can
+ * re-enter here while errors are recorded -- confirm whether that
+ * nesting is intended.
+ * NOTE(review): the result of write_exact() is ignored.
+ */
+void
+tlog_flush(void)
+{
+ int fd, flags;
+ size_t size, wsize;
+
+ if (!tapdisk_log.buf)
+ return;
+
+ flags = O_CREAT | O_WRONLY | O_DIRECT | O_NONBLOCK;
+ if (!tapdisk_log.append)
+ flags |= O_TRUNC;
+
+ fd = open(tapdisk_log.file, flags, 0644);
+ if (fd == -1)
+ return;
+
+ if (tapdisk_log.append)
+ if (lseek(fd, 0, SEEK_END) == (off_t)-1)
+ goto out;
+
+ tlog_flush_errors();
+
+ /* round the payload up to whole 512-byte blocks and pad with newlines */
+ size = tapdisk_log.p - tapdisk_log.buf;
+ wsize = ((size + 511) & (~511));
+
+ memset(tapdisk_log.buf + size, '\n', wsize - size);
+ write_exact(fd, tapdisk_log.buf, wsize);
+
+ tapdisk_log.p = tapdisk_log.buf;
+
+out:
+ close(fd);
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_LOG_H_
+#define _TAPDISK_LOG_H_
+
+/*
+ * Log levels.  An entry is recorded only if its level does not exceed
+ * the level given to open_tlog(), so larger values are more verbose.
+ */
+#define TLOG_WARN 0
+#define TLOG_INFO 1
+#define TLOG_DBG 2
+
+/* Set up the log buffer; output goes to "<file>.<pid>". */
+void open_tlog(char *file, size_t bytes, int level, int append);
+/* Flush (append mode only) and release the log. */
+void close_tlog(void);
+/* Write the buffered log to its file and rewind the buffer. */
+void tlog_flush(void);
+/* Report all recorded errors through syslog. */
+void tlog_print_errors(void);
+
+/* Back ends of the macros below; call them through tlog_write/tlog_error. */
+void __tlog_write(int level, const char *func, const char *fmt, ...)
+ __attribute__((format(printf, 3, 4)));
+void __tlog_error(int err, const char *func, const char *fmt, ...)
+ __attribute__((format(printf, 3, 4)));
+
+/* Log a printf-style message at _level, tagged with the calling function. */
+#define tlog_write(_level, _f, _a...) \
+ __tlog_write(_level, __func__, _f, ##_a)
+
+/* Record error _err with a printf-style message, tagged with the calling function. */
+#define tlog_error(_err, _f, _a...) \
+ __tlog_error(_err, __func__, _f, ##_a)
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libaio.h>
+#ifdef __linux__
+#include <linux/version.h>
+#endif
+
+#include "tapdisk.h"
+#include "tapdisk-log.h"
+#include "tapdisk-queue.h"
+#include "tapdisk-filter.h"
+#include "tapdisk-server.h"
+#include "tapdisk-utils.h"
+
+#include "libaio-compat.h"
+#include "atomicio.h"
+
+#define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a)
+#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/*
+ * We used a kernel patch to return an fd associated with the AIO context
+ * so that we can concurrently poll on synchronous and async descriptors.
+ * This is signalled by passing 1 as the io context to io_setup.
+ */
+#define REQUEST_ASYNC_FD ((io_context_t)1)
+
+/*
+ * Append a tiocb to the submission array.  When the array already holds
+ * entries, the previous last tiocb's "next" pointer is linked to the
+ * new one, so queued tiocbs also form a list.
+ */
+static inline void
+queue_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+	if (queue->queued != 0) {
+		struct iocb *last = queue->iocbs[queue->queued - 1];
+		struct tiocb *tail = (struct tiocb *)last->data;
+
+		tail->next = tiocb;
+	}
+
+	queue->iocbs[queue->queued] = &tiocb->iocb;
+	queue->queued++;
+}
+
+/* Non-zero when at least one tiocb is waiting on the deferred list. */
+static inline int
+deferred_tiocbs(struct tqueue *queue)
+{
+	return queue->deferred.head ? 1 : 0;
+}
+
+/*
+ * Put a tiocb at the tail of the deferred list and update the deferral
+ * counters.
+ */
+static inline void
+defer_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+	struct tlist *pending = &queue->deferred;
+
+	if (pending->head)
+		pending->tail->next = tiocb;
+	else
+		pending->head = tiocb;
+	pending->tail = tiocb;
+
+	queue->tiocbs_deferred += 1;
+	queue->deferrals += 1;
+}
+
+/*
+ * Move the first deferred tiocb, if any, onto the submission array.
+ */
+static inline void
+queue_deferred_tiocb(struct tqueue *queue)
+{
+	struct tlist *pending = &queue->deferred;
+	struct tiocb *first = pending->head;
+
+	if (first == NULL)
+		return;
+
+	pending->head = first->next;
+	if (pending->head == NULL)
+		pending->tail = NULL;
+
+	queue_tiocb(queue, first);
+	queue->tiocbs_deferred -= 1;
+}
+
+/*
+ * Move deferred tiocbs onto the submission array until the queue is
+ * full or nothing is left to move.
+ */
+static inline void
+queue_deferred_tiocbs(struct tqueue *queue)
+{
+	for (;;) {
+		if (tapdisk_queue_full(queue))
+			break;
+		if (!deferred_tiocbs(queue))
+			break;
+		queue_deferred_tiocb(queue);
+	}
+}
+
+/*
+ * td_complete may queue more tiocbs
+ */
+/*
+ * Translate a raw completion result into an error code and run the
+ * tiocb's callback with it: 0 when the full byte count was transferred,
+ * the result itself when it is negative as an int, -EIO for any other
+ * (short) transfer.
+ */
+static void
+complete_tiocb(struct tqueue *queue, struct tiocb *tiocb, unsigned long res)
+{
+	int status = -EIO;
+
+	if (res == tiocb->iocb.u.c.nbytes)
+		status = 0;
+	else if ((int)res < 0)
+		status = (int)res;
+
+	tiocb->cb(tiocb->arg, tiocb, status);
+}
+
+/*
+ * Complete every currently queued tiocb with "err" and empty the
+ * submission array.  Returns the number of entries that were queued.
+ *
+ * NOTE(review): the walk reads tiocb->next after the tiocb's callback
+ * has run and stops at the first NULL "next"; this assumes callbacks
+ * leave the chain intact and that the last queued tiocb has a NULL
+ * "next" -- confirm against tapdisk_prep_tiocb().
+ */
+static int
+cancel_tiocbs(struct tqueue *queue, int err)
+{
+ int queued;
+ struct tiocb *tiocb;
+
+ if (!queue->queued)
+ return 0;
+
+ /*
+ * td_complete may queue more tiocbs, which
+ * will overwrite the contents of queue->iocbs.
+ * use a private linked list to keep track
+ * of the tiocbs we're cancelling.
+ */
+ tiocb = queue->iocbs[0]->data;
+ queued = queue->queued;
+ /* reset before the callbacks run so that re-queued tiocbs start at slot 0 */
+ queue->queued = 0;
+
+ for (; tiocb != NULL; tiocb = tiocb->next)
+ complete_tiocb(queue, tiocb, err);
+
+ return queued;
+}
+
+/*
+ * Handle a partially failed submission: record the error, expand the
+ * merged iocbs that were not submitted back into individual ones, and
+ * complete each of them with "err".  Returns cancel_tiocbs()'s count.
+ */
+static int
+fail_tiocbs(struct tqueue *queue, int succeeded, int total, int err)
+{
+	int failed = total - succeeded;
+
+	ERR(err, "io_submit error: %d of %d failed", failed, total);
+
+	queue->queued = io_expand_iocbs(&queue->opioctx, queue->iocbs,
+					succeeded, total);
+
+	return cancel_tiocbs(queue, err);
+}
+
+/*
+ * rwio
+ */
+
+/* Private state of the "rwio" backend, reached through queue->tio_data. */
+struct rwio {
+ struct io_event *aio_events; /* result array filled in by tapdisk_rwio_submit() */
+};
+
+/* Free the rwio event array and clear the pointer. */
+static void
+tapdisk_rwio_destroy(struct tqueue *queue)
+{
+	struct rwio *state = queue->tio_data;
+
+	/* free(NULL) is a no-op, so no guard is needed */
+	free(state->aio_events);
+	state->aio_events = NULL;
+}
+
+/*
+ * Allocate a zeroed array of "size" io_events for the rwio backend.
+ * Returns 0 on success or -errno if the allocation fails.
+ */
+static int
+tapdisk_rwio_setup(struct tqueue *queue, int size)
+{
+	struct rwio *state = queue->tio_data;
+
+	state->aio_events = calloc(size, sizeof(*state->aio_events));
+
+	return state->aio_events ? 0 : -errno;
+}
+
+/*
+ * Carry out one iocb synchronously: seek to its offset, then transfer
+ * its whole byte count with atomicio(), writing for IO_CMD_PWRITE and
+ * reading for anything else.  Returns the byte count on success or
+ * -errno if the seek fails or the transfer comes up short.
+ */
+static inline ssize_t
+tapdisk_rwio_rw(const struct iocb *iocb)
+{
+	ssize_t (*xfer)(int, void *, size_t);
+	size_t want = iocb->u.c.nbytes;
+	long long where = iocb->u.c.offset;
+	char *mem = iocb->u.c.buf;
+	int fd = iocb->aio_fildes;
+
+	if (iocb->aio_lio_opcode == IO_CMD_PWRITE)
+		xfer = vwrite;
+	else
+		xfer = read;
+
+	if (lseek(fd, where, SEEK_SET) == (off_t)-1)
+		return -errno;
+
+	if (atomicio(xfer, fd, mem, want) != want)
+		return -errno;
+
+	return want;
+}
+
+/*
+ * Submit everything on the queue synchronously.
+ *
+ * The queued iocbs are filtered and merged, each merged iocb is carried
+ * out with tapdisk_rwio_rw(), the results are split back into per-
+ * request events, filtered again, and every tiocb is completed.
+ * Deferred tiocbs are then moved onto the queue.  Returns the number of
+ * events completed, or 0 if nothing was queued.
+ */
+static int
+tapdisk_rwio_submit(struct tqueue *queue)
+{
+	struct rwio *rwio = queue->tio_data;
+	struct io_event *ev;
+	int nr_merged, nr_split, k;
+
+	if (!queue->queued)
+		return 0;
+
+	tapdisk_filter_iocbs(queue->filter, queue->iocbs, queue->queued);
+	nr_merged = io_merge(&queue->opioctx, queue->iocbs, queue->queued);
+
+	queue->queued = 0;
+
+	for (k = 0; k < nr_merged; k++) {
+		struct iocb *io = queue->iocbs[k];
+
+		ev = &rwio->aio_events[k];
+		ev->obj = io;
+		ev->res = tapdisk_rwio_rw(io);
+	}
+
+	nr_split = io_split(&queue->opioctx, rwio->aio_events, nr_merged);
+	tapdisk_filter_events(queue->filter, rwio->aio_events, nr_split);
+
+	for (k = 0; k < nr_split; k++) {
+		struct iocb *io;
+
+		ev = &rwio->aio_events[k];
+		io = ev->obj;
+		complete_tiocb(queue, (struct tiocb *)io->data, ev->res);
+	}
+
+	queue_deferred_tiocbs(queue);
+
+	return nr_split;
+}
+
+/*
+ * Backend table for synchronous read/write I/O.
+ *
+ * tapdisk_rwio_submit() stores its results in the aio_events array of
+ * a struct rwio reached through queue->tio_data, so the table has to
+ * request that much private data and register the setup/destroy hooks
+ * that allocate and free the array.
+ */
+static const struct tio td_tio_rwio = {
+	.name        = "rwio",
+	.data_size   = sizeof(struct rwio),
+	.tio_setup   = tapdisk_rwio_setup,
+	.tio_destroy = tapdisk_rwio_destroy,
+	.tio_submit  = tapdisk_rwio_submit
+};
+
+/*
+ * libaio
+ */
+
+/* Private state of the libaio backend, reached through queue->tio_data. */
+struct lio {
+ io_context_t aio_ctx; /* AIO context; 0 when none is set up */
+ struct io_event *aio_events;
+
+ int event_fd; /* descriptor signalling completions; -1 when unset */
+ int event_id;
+
+ int flags; /* LIO_FLAG_* bits */
+};
+
+/* set in lio.flags when event_fd was obtained through tapdisk_sys_eventfd() */
+#define LIO_FLAG_EVENTFD (1<<0)
+
+/*
+ * Report whether the eventfd-based completion path should be tried.
+ * On Linux this requires a kernel of at least 2.6.22; on any other
+ * platform the answer is always yes.
+ */
+static int
+tapdisk_lio_check_resfd(void)
+{
+	int usable = 1;
+
+#if defined(__linux__)
+	usable = (tapdisk_linux_version() >= KERNEL_VERSION(2, 6, 22));
+#endif
+
+	return usable;
+}
+
+/*
+ * Close the completion descriptor and tear down the AIO context, if
+ * present, resetting both fields to their "not open" values.
+ */
+static void
+tapdisk_lio_destroy_aio(struct tqueue *queue)
+{
+	struct lio *state = queue->tio_data;
+	int fd = state->event_fd;
+
+	if (fd >= 0) {
+		close(fd);
+		state->event_fd = -1;
+	}
+
+	if (state->aio_ctx) {
+		io_destroy(state->aio_ctx);
+		state->aio_ctx = 0;
+	}
+}
+
+/*
+ * Set up an AIO context using the out-of-tree "aio-poll" interface, in
+ * which io_setup() is asked (via REQUEST_ASYNC_FD) to return a pollable
+ * file descriptor.
+ *
+ * Returns 0 on success and stores the descriptor in lio->event_fd.
+ * On failure returns the negative error code produced by io_setup().
+ * The libaio wrapper reports failures as a negated errno in its return
+ * value and does not set errno, so the return value is used directly;
+ * this also matches __lio_setup_aio_eventfd().
+ */
+static int
+__lio_setup_aio_poll(struct tqueue *queue, int qlen)
+{
+	struct lio *lio = queue->tio_data;
+	int fd;
+
+	lio->aio_ctx = REQUEST_ASYNC_FD;
+
+	fd = io_setup(qlen, &lio->aio_ctx);
+	if (fd < 0) {
+		lio->aio_ctx = 0;
+
+		if (fd == -EINVAL) {
+			DPRINTF("Couldn't get fd for AIO poll support. This is probably "
+				"because your kernel does not have the aio-poll patch "
+				"applied.\n");
+		}
+
+		return fd;
+	}
+
+	lio->event_fd = fd;
+
+	return 0;
+}
+
+/*
+ * Set up an AIO context whose completions are signalled through an
+ * eventfd created by tapdisk_sys_eventfd().
+ *
+ * Returns 0 on success and sets LIO_FLAG_EVENTFD. On failure returns a
+ * negative error code and leaves no AIO context open: the caller falls
+ * back to __lio_setup_aio_poll(), which overwrites lio->aio_ctx, so a
+ * context left behind here would leak.
+ */
+static int
+__lio_setup_aio_eventfd(struct tqueue *queue, int qlen)
+{
+	struct lio *lio = queue->tio_data;
+	int err;
+
+	err = io_setup(qlen, &lio->aio_ctx);
+	if (err < 0) {
+		lio->aio_ctx = 0;
+		return err;
+	}
+
+	lio->event_fd = tapdisk_sys_eventfd(0);
+	if (lio->event_fd < 0) {
+		/* save errno before io_destroy can disturb it */
+		err = -errno;
+
+		io_destroy(lio->aio_ctx);
+		lio->aio_ctx = 0;
+		lio->event_fd = -1;
+
+		return err;
+	}
+
+	lio->flags |= LIO_FLAG_EVENTFD;
+
+	return 0;
+}
+
+/*
+ * Create the AIO context and completion descriptor for the lio driver.
+ *
+ * The mainline eventfd(2) api is preferred when the kernel supports it;
+ * if it is unavailable or its setup fails, the poll fd patch is tried.
+ * Returns 0 on success or the error from the last attempt, logging a
+ * hint about the system-wide aio limit when that error is -EAGAIN.
+ */
+static int
+tapdisk_lio_setup_aio(struct tqueue *queue, int qlen)
+{
+	struct lio *state = queue->tio_data;
+	int rc;
+
+	state->aio_ctx = 0;
+	state->event_fd = -1;
+
+	/* non-zero rc means "eventfd path not usable, try poll" */
+	rc = tapdisk_lio_check_resfd() ?
+		__lio_setup_aio_eventfd(queue, qlen) : 1;
+
+	if (rc)
+		rc = __lio_setup_aio_poll(queue, qlen);
+
+	if (rc == -EAGAIN) {
+		DPRINTF("Couldn't setup AIO context. If you are trying to "
+			"concurrently use a large number of blktap-based disks, you may "
+			"need to increase the system-wide aio request limit. "
+			"(e.g. 'echo 1048576 > /proc/sys/fs/aio-max-nr')\n");
+	}
+
+	return rc;
+}
+
+
+/*
+ * Undo tapdisk_lio_setup(): unregister the completion event, tear down
+ * the AIO context and descriptor, and free the event buffer. Each field
+ * is reset as it is released. Does nothing if no driver data exists.
+ */
+static void
+tapdisk_lio_destroy(struct tqueue *queue)
+{
+	struct lio *state = queue->tio_data;
+
+	if (state == NULL)
+		return;
+
+	if (state->event_id >= 0) {
+		tapdisk_server_unregister_event(state->event_id);
+		state->event_id = -1;
+	}
+
+	tapdisk_lio_destroy_aio(queue);
+
+	if (state->aio_events != NULL) {
+		free(state->aio_events);
+		state->aio_events = NULL;
+	}
+}
+
+/*
+ * When the eventfd path is in use, tag each of the first @n iocbs with
+ * the queue's eventfd. Otherwise leave the iocbs untouched.
+ */
+static void
+tapdisk_lio_set_eventfd(struct tqueue *queue, int n, struct iocb **iocbs)
+{
+	struct lio *state = queue->tio_data;
+	int idx;
+
+	if (!(state->flags & LIO_FLAG_EVENTFD))
+		return;
+
+	for (idx = 0; idx < n; idx++)
+		__io_set_eventfd(iocbs[idx], state->event_fd);
+}
+
+/*
+ * Drain the eventfd counter so the descriptor stops polling readable.
+ * Only applies to the eventfd path; the result of the read is not
+ * inspected.
+ */
+static void
+tapdisk_lio_ack_event(struct tqueue *queue)
+{
+	struct lio *state = queue->tio_data;
+	uint64_t counter;
+
+	if (!(state->flags & LIO_FLAG_EVENTFD))
+		return;
+
+	read_exact(state->event_fd, &counter, sizeof(counter));
+}
+
+/*
+ * Scheduler callback invoked when the completion descriptor is readable.
+ *
+ * Collects finished events from the AIO context, splits merged requests
+ * back into their original tiocbs, updates the pending counters and
+ * completes each tiocb. @private is the owning tqueue; @id and @mode
+ * are unused.
+ *
+ * io_getevents() reports failure with a negative value. That is treated
+ * as "no events": subtracting a negative count would inflate
+ * iocbs_pending.
+ */
+static void
+tapdisk_lio_event(event_id_t id, char mode, void *private)
+{
+	struct tqueue *queue = private;
+	struct lio *lio;
+	int i, ret, split;
+	struct iocb *iocb;
+	struct tiocb *tiocb;
+	struct io_event *ep;
+
+	tapdisk_lio_ack_event(queue);
+
+	lio = queue->tio_data;
+	ret = io_getevents(lio->aio_ctx, 0,
+			   queue->size, lio->aio_events, NULL);
+	if (ret < 0) {
+		DPRINTF("io_getevents failed: %d\n", ret);
+		ret = 0;
+	}
+
+	split = io_split(&queue->opioctx, lio->aio_events, ret);
+	tapdisk_filter_events(queue->filter, lio->aio_events, split);
+
+	DBG("events: %d, tiocbs: %d\n", ret, split);
+
+	queue->iocbs_pending -= ret;
+	queue->tiocbs_pending -= split;
+
+	for (i = split, ep = lio->aio_events; i-- > 0; ep++) {
+		iocb = ep->obj;
+		tiocb = iocb->data;
+		complete_tiocb(queue, tiocb, ep->res);
+	}
+
+	queue_deferred_tiocbs(queue);
+}
+
+/*
+ * Initialise the lio driver for a queue of @qlen entries: create the
+ * AIO context, register the completion descriptor with the server
+ * scheduler and allocate the event buffer.
+ *
+ * Returns 0 on success. On any failure everything set up so far is
+ * released through tapdisk_lio_destroy() and a negative error is
+ * returned.
+ */
+static int
+tapdisk_lio_setup(struct tqueue *queue, int qlen)
+{
+	struct lio *state = queue->tio_data;
+	int rc;
+
+	state->event_id = -1;
+
+	rc = tapdisk_lio_setup_aio(queue, qlen);
+	if (rc)
+		goto undo;
+
+	state->event_id =
+		tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+					      state->event_fd, 0,
+					      tapdisk_lio_event,
+					      queue);
+	rc = state->event_id;
+	if (rc < 0)
+		goto undo;
+
+	state->aio_events = calloc(qlen, sizeof(struct io_event));
+	if (state->aio_events == NULL) {
+		rc = -errno;
+		goto undo;
+	}
+
+	return 0;
+
+undo:
+	tapdisk_lio_destroy(queue);
+	return rc;
+}
+
+/*
+ * Submit all queued iocbs to the AIO context.
+ *
+ * Returns the number of (merged) iocbs accepted by io_submit, or 0 if
+ * nothing was queued or submission failed outright.
+ */
+static int
+tapdisk_lio_submit(struct tqueue *queue)
+{
+ struct lio *lio = queue->tio_data;
+ int merged, submitted, err = 0;
+
+ if (!queue->queued)
+ return 0;
+
+ tapdisk_filter_iocbs(queue->filter, queue->iocbs, queue->queued);
+ merged = io_merge(&queue->opioctx, queue->iocbs, queue->queued);
+ tapdisk_lio_set_eventfd(queue, merged, queue->iocbs);
+ submitted = io_submit(lio->aio_ctx, merged, queue->iocbs);
+
+ DBG("queued: %d, merged: %d, submitted: %d\n",
+ queue->queued, merged, submitted);
+
+ /* a negative result is an error code; a short count is reported as -EIO */
+ if (submitted < 0) {
+ err = submitted;
+ submitted = 0;
+ } else if (submitted < merged)
+ err = -EIO;
+
+ /* iocbs are counted after merging, tiocbs before merging */
+ queue->iocbs_pending += submitted;
+ queue->tiocbs_pending += queue->queued;
+ queue->queued = 0;
+
+ /* fail the iocbs in [submitted, merged) and take the count returned
+  * by fail_tiocbs back out of tiocbs_pending */
+ if (err)
+ queue->tiocbs_pending -=
+ fail_tiocbs(queue, submitted, merged, err);
+
+ return submitted;
+}
+
+/*
+ * Driver table for the libaio backend, selected by TIO_DRV_LIO in
+ * tapdisk_queue_init_io.
+ */
+static const struct tio td_tio_lio = {
+ .name = "lio",
+ .data_size = sizeof(struct lio),
+ .tio_setup = tapdisk_lio_setup,
+ .tio_destroy = tapdisk_lio_destroy,
+ .tio_submit = tapdisk_lio_submit,
+};
+
+/*
+ * Detach the I/O driver from the queue: run its destroy hook (if it has
+ * one), then release the driver's private data. Both pointers are
+ * cleared, so the function may be called repeatedly.
+ */
+static void
+tapdisk_queue_free_io(struct tqueue *queue)
+{
+	const struct tio *drv = queue->tio;
+
+	if (drv != NULL) {
+		if (drv->tio_destroy != NULL)
+			drv->tio_destroy(queue);
+		queue->tio = NULL;
+	}
+
+	if (queue->tio_data != NULL) {
+		free(queue->tio_data);
+		queue->tio_data = NULL;
+	}
+}
+
+/*
+ * Select the I/O driver identified by @drv (TIO_DRV_LIO or
+ * TIO_DRV_RWIO), allocate its zeroed private data and run its setup
+ * hook with the queue size.
+ *
+ * Returns 0 on success, -EINVAL for an unknown driver, or a negative
+ * error from allocation or driver setup. On failure any partially
+ * initialised driver state is released.
+ */
+static int
+tapdisk_queue_init_io(struct tqueue *queue, int drv)
+{
+	const struct tio *tio;
+	int err;
+
+	switch (drv) {
+	case TIO_DRV_LIO:
+		tio = &td_tio_lio;
+		break;
+	case TIO_DRV_RWIO:
+		tio = &td_tio_rwio;
+		break;
+	default:
+		err = -EINVAL;
+		goto fail;
+	}
+
+	queue->tio_data = calloc(1, tio->data_size);
+	if (!queue->tio_data) {
+		/*
+		 * Capture the error before logging, which may overwrite
+		 * errno, and never report success from this path.
+		 */
+		err = errno ? -errno : -ENOMEM;
+		PERROR("malloc(%zu)", tio->data_size);
+		goto fail;
+	}
+
+	queue->tio = tio;
+
+	if (tio->tio_setup) {
+		err = tio->tio_setup(queue, queue->size);
+		if (err)
+			goto fail;
+	}
+
+	DPRINTF("I/O queue driver: %s\n", tio->name);
+
+	return 0;
+
+fail:
+	tapdisk_queue_free_io(queue);
+	return err;
+}
+
+/*
+ * Initialise @queue for @size entries using I/O driver @drv and the
+ * optional @filter. The queue is zeroed first. A size of 0 yields a
+ * queue with no driver and no buffers, and succeeds.
+ *
+ * Returns 0 on success or a negative error, in which case the queue has
+ * been released again with tapdisk_free_queue().
+ */
+int
+tapdisk_init_queue(struct tqueue *queue, int size,
+		   int drv, struct tfilter *filter)
+{
+	int rc;
+
+	memset(queue, 0, sizeof(*queue));
+
+	queue->size = size;
+	queue->filter = filter;
+
+	if (size == 0)
+		return 0;
+
+	rc = tapdisk_queue_init_io(queue, drv);
+	if (rc)
+		goto release;
+
+	queue->iocbs = calloc(size, sizeof(struct iocb *));
+	if (queue->iocbs == NULL) {
+		rc = -errno;
+		goto release;
+	}
+
+	rc = opio_init(&queue->opioctx, size);
+	if (rc == 0)
+		return 0;
+
+release:
+	tapdisk_free_queue(queue);
+	return rc;
+}
+
+/*
+ * Release everything owned by @queue: the I/O driver and its data, the
+ * iocb pointer array, and the request-merging context.
+ */
+void
+tapdisk_free_queue(struct tqueue *queue)
+{
+	/* driver first: its destroy hook still sees the queue intact */
+	tapdisk_queue_free_io(queue);
+
+	free(queue->iocbs);
+	queue->iocbs = NULL;
+
+	opio_free(&queue->opioctx);
+}
+
+/*
+ * Log the queue counters and every deferred request at WARN level.
+ *
+ * A queue initialised with size 0, or one that has already been freed,
+ * has no driver attached; "(none)" is printed in that case instead of
+ * dereferencing a NULL tio pointer.
+ */
+void
+tapdisk_debug_queue(struct tqueue *queue)
+{
+	struct tiocb *tiocb = queue->deferred.head;
+	const char *drv_name = queue->tio ? queue->tio->name : "(none)";
+
+	WARN("TAPDISK QUEUE:\n");
+	WARN("size: %d, tio: %s, queued: %d, iocbs_pending: %d, "
+	     "tiocbs_pending: %d, tiocbs_deferred: %d, deferrals: %"PRIx64"\n",
+	     queue->size, drv_name, queue->queued, queue->iocbs_pending,
+	     queue->tiocbs_pending, queue->tiocbs_deferred, queue->deferrals);
+
+	if (tiocb) {
+		WARN("deferred:\n");
+		for (; tiocb != NULL; tiocb = tiocb->next) {
+			struct iocb *io = &tiocb->iocb;
+			WARN("%s of %lu bytes at %lld\n",
+			     (io->aio_lio_opcode == IO_CMD_PWRITE ?
+			      "write" : "read"),
+			     io->u.c.nbytes, io->u.c.offset);
+		}
+	}
+}
+
+/*
+ * Fill in @tiocb for a transfer of @size bytes at @offset on @fd using
+ * @buf. A non-zero @rw prepares a write, zero prepares a read. @cb and
+ * @arg are stored for completion, the embedded iocb is pointed back at
+ * the tiocb, and the list link is cleared.
+ */
+void
+tapdisk_prep_tiocb(struct tiocb *tiocb, int fd, int rw, char *buf, size_t size,
+		   long long offset, td_queue_callback_t cb, void *arg)
+{
+	struct iocb *io = &tiocb->iocb;
+
+	if (!rw)
+		io_prep_pread(io, fd, buf, size, offset);
+	else
+		io_prep_pwrite(io, fd, buf, size, offset);
+
+	io->data = tiocb;
+
+	tiocb->next = NULL;
+	tiocb->arg = arg;
+	tiocb->cb = cb;
+}
+
+/*
+ * Add @tiocb to the queue, or put it on the deferred list when the
+ * queue is already full.
+ */
+void
+tapdisk_queue_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+	if (tapdisk_queue_full(queue)) {
+		defer_tiocb(queue, tiocb);
+		return;
+	}
+
+	queue_tiocb(queue, tiocb);
+}
+
+
+/*
+ * Run one submission pass through the queue's I/O driver and return the
+ * driver's result.
+ * fail_tiocbs may queue more tiocbs
+ */
+int
+tapdisk_submit_tiocbs(struct tqueue *queue)
+{
+	const struct tio *drv = queue->tio;
+
+	return drv->tio_submit(queue);
+}
+
+/*
+ * Keep submitting until nothing is left queued. At least one submission
+ * pass always runs. Returns the sum of the per-pass results.
+ */
+int
+tapdisk_submit_all_tiocbs(struct tqueue *queue)
+{
+	int total = tapdisk_submit_tiocbs(queue);
+
+	while (!tapdisk_queue_empty(queue))
+		total += tapdisk_submit_tiocbs(queue);
+
+	return total;
+}
+
+/*
+ * Cancel the queued tiocbs with -EIO and return the count reported by
+ * cancel_tiocbs.
+ * cancel_tiocbs may queue more tiocbs
+ */
+int
+tapdisk_cancel_tiocbs(struct tqueue *queue)
+{
+	int cancelled = cancel_tiocbs(queue, -EIO);
+
+	return cancelled;
+}
+
+/*
+ * Keep cancelling until nothing is left queued. At least one
+ * cancellation pass always runs. Returns the sum of the per-pass counts.
+ */
+int
+tapdisk_cancel_all_tiocbs(struct tqueue *queue)
+{
+	int total = tapdisk_cancel_tiocbs(queue);
+
+	while (!tapdisk_queue_empty(queue))
+		total += tapdisk_cancel_tiocbs(queue);
+
+	return total;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef TAPDISK_QUEUE_H
+#define TAPDISK_QUEUE_H
+
+#include <libaio.h>
+
+#include "io-optimize.h"
+#include "scheduler.h"
+
+struct tiocb;
+struct tfilter;
+
+/* completion callback stored in a tiocb: receives the tiocb's arg, the
+ * tiocb itself and an int result */
+typedef void (*td_queue_callback_t)(void *arg, struct tiocb *, int err);
+
+
+/*
+ * One tapdisk I/O request: a libaio iocb plus the completion callback.
+ * Filled in by tapdisk_prep_tiocb.
+ */
+struct tiocb {
+ /* completion callback and its opaque argument */
+ td_queue_callback_t cb;
+ void *arg;
+
+ /* embedded libaio control block; its data field points back at this tiocb */
+ struct iocb iocb;
+ /* singly-linked list link; NULL after tapdisk_prep_tiocb */
+ struct tiocb *next;
+};
+
+/* singly-linked list of tiocbs, chained through tiocb.next */
+struct tlist {
+ struct tiocb *head;
+ struct tiocb *tail;
+};
+
+/*
+ * A tapdisk I/O queue, initialised by tapdisk_init_queue and released
+ * by tapdisk_free_queue.
+ */
+struct tqueue {
+ /* capacity passed to tapdisk_init_queue */
+ int size;
+
+ /* I/O driver table and its private data (tio->data_size bytes) */
+ const struct tio *tio;
+ void *tio_data;
+
+ /* request merge/split context (see io-optimize.h) */
+ struct opioctx opioctx;
+
+ /* number of iocbs queued but not yet submitted, and the array of
+  * size pointers holding them */
+ int queued;
+ struct iocb **iocbs;
+
+ /* number of iocbs pending in the aio layer */
+ int iocbs_pending;
+
+ /* number of tiocbs pending in the queue --
+ * this is likely to be larger than iocbs_pending
+ * due to request coalescing */
+ int tiocbs_pending;
+
+ /* iocbs may be deferred if the aio ring is full.
+ * tapdisk_queue_complete will ensure deferred
+ * iocbs are queued as slots become available. */
+ struct tlist deferred;
+ int tiocbs_deferred;
+
+ /* optional tapdisk filter */
+ struct tfilter *filter;
+
+ /* counter printed in hex by tapdisk_debug_queue */
+ uint64_t deferrals;
+};
+
+/*
+ * I/O driver description: a name, the size of the private data the
+ * queue allocates for the driver, and the driver entry points.
+ */
+struct tio {
+ const char *name;
+ /* bytes of zeroed private data allocated into tqueue.tio_data */
+ size_t data_size;
+
+ /* optional; called with the queue size, returns 0 or a negative error */
+ int (*tio_setup) (struct tqueue *queue, int qlen);
+ /* optional; releases whatever tio_setup acquired */
+ void (*tio_destroy) (struct tqueue *queue);
+ /* required; submits the queued iocbs */
+ int (*tio_submit) (struct tqueue *queue);
+};
+
+/* driver selectors accepted by tapdisk_init_queue */
+enum {
+ TIO_DRV_LIO = 1, /* libaio driver */
+ TIO_DRV_RWIO = 2, /* synchronous read/write driver */
+};
+
+/*
+ * Interface for request producer (i.e., tapdisk)
+ * NB: the following functions may cause additional tiocbs to be queued:
+ * - tapdisk_submit_tiocbs
+ * - tapdisk_cancel_tiocbs
+ * - tapdisk_complete_tiocbs
+ * The *_all_tiocbs variants will handle the first two cases;
+ * be sure to call submit after calling complete in the third case.
+ */
+/* number of iocbs queued and not yet submitted */
+#define tapdisk_queue_count(q) ((q)->queued)
+/* true when nothing is waiting to be submitted */
+#define tapdisk_queue_empty(q) ((q)->queued == 0)
+/* true when pending plus queued requests have reached the queue size */
+#define tapdisk_queue_full(q) \
+ (((q)->tiocbs_pending + (q)->queued) >= (q)->size)
+/* initialise a queue of the given size with driver TIO_DRV_*; 0 on success */
+int tapdisk_init_queue(struct tqueue *, int size, int drv, struct tfilter *);
+void tapdisk_free_queue(struct tqueue *);
+/* log queue counters and deferred requests */
+void tapdisk_debug_queue(struct tqueue *);
+/* queue a prepared tiocb, deferring it if the queue is full */
+void tapdisk_queue_tiocb(struct tqueue *, struct tiocb *);
+int tapdisk_submit_tiocbs(struct tqueue *);
+int tapdisk_submit_all_tiocbs(struct tqueue *);
+int tapdisk_cancel_tiocbs(struct tqueue *);
+int tapdisk_cancel_all_tiocbs(struct tqueue *);
+/* arguments: tiocb, fd, rw (non-zero = write), buffer, size, offset,
+ * completion callback, callback argument */
+void tapdisk_prep_tiocb(struct tiocb *, int, int, char *, size_t,
+ long long, td_queue_callback_t, void *);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/select.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/un.h>
+
+#include "tapdisk-ring.h"
+
+/*
+ * Create the listening UNIX-domain control socket at ring->ctlfd_path,
+ * removing any stale socket file first.
+ *
+ * Returns 0 and stores the descriptor in ring->ctlfd on success.
+ * Returns -ENAMETOOLONG if the path does not fit in sun_path, or
+ * -errno from socket/unlink/bind/listen.
+ */
+static int
+tapdisk_uring_create_ctlfd(td_uring_t *ring)
+{
+	int fd, err;
+	struct sockaddr_un saddr;
+
+	/* the path must fit in sun_path (not sun_family) with its NUL */
+	if (strnlen(ring->ctlfd_path, sizeof(saddr.sun_path)) >=
+	    sizeof(saddr.sun_path))
+		return -ENAMETOOLONG;
+
+	fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fd == -1)
+		return -errno;
+
+	memset(&saddr, 0, sizeof(struct sockaddr_un));
+	saddr.sun_family = AF_UNIX;
+	memcpy(saddr.sun_path, ring->ctlfd_path, strlen(ring->ctlfd_path));
+
+	err = unlink(ring->ctlfd_path);
+	if (err == -1 && errno != ENOENT) {
+		err = -errno;
+		goto fail;
+	}
+
+	err = bind(fd, (struct sockaddr *)&saddr, sizeof(struct sockaddr_un));
+	if (err == -1) {
+		err = -errno;
+		goto fail;
+	}
+
+	err = listen(fd, 1);
+	if (err == -1) {
+		err = -errno;
+		goto fail;
+	}
+
+	ring->ctlfd = fd;
+	return 0;
+
+fail:
+	close(fd);
+	return err;
+}
+
+/*
+ * Creator-side teardown of the control socket: close the descriptor,
+ * remove the socket file and free the path string. A ctlfd of 0 is
+ * treated as "not open".
+ */
+static void
+tapdisk_uring_destroy_ctlfd(td_uring_t *ring)
+{
+	if (ring->ctlfd != 0) {
+		close(ring->ctlfd);
+		ring->ctlfd = 0;
+	}
+
+	if (ring->ctlfd_path == NULL)
+		return;
+
+	unlink(ring->ctlfd_path);
+	free(ring->ctlfd_path);
+	ring->ctlfd_path = NULL;
+}
+
+/*
+ * Connect to the control socket at ring->ctlfd_path.
+ *
+ * Returns 0 and stores the descriptor in ring->ctlfd on success.
+ * Returns -ENAMETOOLONG if the path does not fit in sun_path, or
+ * -errno from socket/connect.
+ */
+static int
+tapdisk_uring_connect_ctlfd(td_uring_t *ring)
+{
+	int fd, err;
+	struct sockaddr_un saddr;
+
+	if (strnlen(ring->ctlfd_path, sizeof(saddr.sun_path)) >=
+	    sizeof(saddr.sun_path))
+		return -ENAMETOOLONG;
+
+	fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fd == -1)
+		return -errno;
+
+	memset(&saddr, 0, sizeof(struct sockaddr_un));
+	saddr.sun_family = AF_UNIX;
+	memcpy(saddr.sun_path, ring->ctlfd_path, strlen(ring->ctlfd_path));
+
+	/* connect() takes a generic sockaddr pointer */
+	err = connect(fd, (struct sockaddr *)&saddr, sizeof(saddr));
+	if (err == -1) {
+		err = -errno;
+		goto fail;
+	}
+
+	ring->ctlfd = fd;
+	return 0;
+
+fail:
+	close(fd);
+	return err;
+}
+
+/*
+ * Client-side teardown of the control socket: close the descriptor and
+ * free the path string. The socket file is left in place. ctlfd is
+ * reset after closing, as tapdisk_uring_destroy_ctlfd does, so a
+ * repeated disconnect cannot close an unrelated descriptor.
+ */
+static void
+tapdisk_uring_disconnect_ctlfd(td_uring_t *ring)
+{
+	if (ring->ctlfd) {
+		close(ring->ctlfd);
+		ring->ctlfd = 0;
+	}
+
+	free(ring->ctlfd_path);
+	ring->ctlfd_path = NULL;
+}
+
+/*
+ * Create the shared-memory object at ring->shmem_path, size it to
+ * ring->shmem_size and map it read/write into ring->shmem.
+ *
+ * Returns 0 on success or -errno. The descriptor is closed in either
+ * case.
+ */
+static int
+tapdisk_uring_create_shmem(td_uring_t *ring)
+{
+	int shm_fd, rc = 0;
+
+	shm_fd = shm_open(ring->shmem_path, O_CREAT | O_RDWR, 0750);
+	if (shm_fd == -1)
+		return -errno;
+
+	if (ftruncate(shm_fd, ring->shmem_size) == -1) {
+		rc = -errno;
+		goto done;
+	}
+
+	ring->shmem = mmap(NULL, ring->shmem_size,
+			   PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
+	if (ring->shmem == MAP_FAILED) {
+		ring->shmem = NULL;
+		rc = -errno;
+	}
+
+done:
+	close(shm_fd);
+	return rc;
+}
+
+/*
+ * Creator-side teardown of the shared memory: unmap the region, unlink
+ * the shared-memory object and free the path string.
+ */
+static void
+tapdisk_uring_destroy_shmem(td_uring_t *ring)
+{
+	if (ring->shmem != NULL) {
+		munmap(ring->shmem, ring->shmem_size);
+		ring->shmem = NULL;
+	}
+
+	if (ring->shmem_path == NULL)
+		return;
+
+	shm_unlink(ring->shmem_path);
+	free(ring->shmem_path);
+	ring->shmem_path = NULL;
+}
+
+/*
+ * Attach to an existing shared-memory ring at ring->shmem_path.
+ *
+ * The header is mapped and copied first so that the cookie and version
+ * can be checked; the sizes recorded in the header are then used to map
+ * the whole region into ring->shmem.
+ *
+ * Returns 0 on success, -EINVAL if the cookie or version does not
+ * match, or -errno from shm_open/mmap. The descriptor is closed in
+ * either case.
+ */
+static int
+tapdisk_uring_connect_shmem(td_uring_t *ring)
+{
+	int fd, err;
+	td_uring_header_t header, *p;
+
+	/* shm_open always takes a mode; it is ignored without O_CREAT */
+	fd = shm_open(ring->shmem_path, O_RDWR, 0);
+	if (fd == -1)
+		return -errno;
+
+	p = mmap(NULL, sizeof(td_uring_header_t),
+		 PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (p == MAP_FAILED) {
+		err = -errno;
+		goto out;
+	}
+
+	memcpy(&header, p, sizeof(td_uring_header_t));
+	munmap(p, sizeof(td_uring_header_t));
+
+	if (memcmp(header.cookie,
+		   TAPDISK_URING_COOKIE, sizeof(header.cookie))) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (header.version != TD_URING_CURRENT_VERSION) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	ring->ring_size = header.ring_size;
+	ring->data_size = header.data_size;
+	ring->shmem_size = header.shmem_size;
+
+	ring->shmem = mmap(NULL, ring->shmem_size,
+			   PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (ring->shmem == MAP_FAILED) {
+		ring->shmem = NULL;
+		err = -errno;
+		goto out;
+	}
+
+	err = 0;
+
+out:
+	close(fd);
+	return err;
+}
+
+/*
+ * Client-side teardown of the shared memory: unmap the region and free
+ * the path string. The shared-memory object itself is not unlinked.
+ * The mapping pointer is cleared, as tapdisk_uring_destroy_shmem does,
+ * so a repeated disconnect does not unmap the same range twice.
+ */
+static void
+tapdisk_uring_disconnect_shmem(td_uring_t *ring)
+{
+	if (ring->shmem) {
+		munmap(ring->shmem, ring->shmem_size);
+		ring->shmem = NULL;
+	}
+
+	free(ring->shmem_path);
+	ring->shmem_path = NULL;
+}
+
+/*
+ * Create a new user-space ring rooted at @location.
+ *
+ * Two objects are derived from @location: "<location>.shm" for the
+ * shared memory and "<location>.cfd" for the control socket. The shared
+ * region holds a td_uring_header_t followed by @ring_size bytes of ring
+ * and @data_size bytes of data.
+ *
+ * Returns 0 on success. On failure returns a negative error and
+ * releases everything through tapdisk_uring_destroy().
+ *
+ * NOTE(review): nothing here writes the cookie/version header that
+ * tapdisk_uring_connect_shmem() validates -- confirm where that is
+ * meant to happen.
+ */
+int
+tapdisk_uring_create(td_uring_t *ring, const char *location,
+		     uint32_t ring_size, uint32_t data_size)
+{
+	int err;
+
+	memset(ring, 0, sizeof(td_uring_t));
+
+	ring->ring_size = ring_size;
+	ring->data_size = data_size;
+	ring->shmem_size = ring_size + data_size + sizeof(td_uring_header_t);
+
+	/* asprintf leaves the pointer undefined on failure and is not
+	 * guaranteed to set errno, so report -ENOMEM explicitly */
+	err = asprintf(&ring->shmem_path, "%s.shm", location);
+	if (err == -1) {
+		ring->shmem_path = NULL;
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	err = asprintf(&ring->ctlfd_path, "%s.cfd", location);
+	if (err == -1) {
+		ring->ctlfd_path = NULL;
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	err = tapdisk_uring_create_ctlfd(ring);
+	if (err)
+		goto fail;
+
+	err = tapdisk_uring_create_shmem(ring);
+	if (err)
+		goto fail;
+
+	/* ring_area and data_area are pointers: use byte-wise pointer
+	 * arithmetic rather than assigning integers to them */
+	ring->ring_area = (char *)ring->shmem + sizeof(td_uring_header_t);
+	ring->data_area = (char *)ring->ring_area + ring->ring_size;
+
+	return 0;
+
+fail:
+	tapdisk_uring_destroy(ring);
+	return err;
+}
+
+/*
+ * Creator-side teardown: release the shared memory, then the control
+ * socket. Always returns 0.
+ */
+int
+tapdisk_uring_destroy(td_uring_t *ring)
+{
+	tapdisk_uring_destroy_shmem(ring);
+	tapdisk_uring_destroy_ctlfd(ring);
+
+	return 0;
+}
+
+/*
+ * Attach to an existing ring rooted at @location: connect to
+ * "<location>.cfd" and map "<location>.shm".
+ *
+ * Returns 0 on success. On failure returns a negative error and
+ * releases everything through tapdisk_uring_disconnect().
+ */
+int
+tapdisk_uring_connect(td_uring_t *ring, const char *location)
+{
+	int err;
+
+	memset(ring, 0, sizeof(td_uring_t));
+
+	/* asprintf leaves the pointer undefined on failure and is not
+	 * guaranteed to set errno, so report -ENOMEM explicitly */
+	err = asprintf(&ring->shmem_path, "%s.shm", location);
+	if (err == -1) {
+		ring->shmem_path = NULL;
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	err = asprintf(&ring->ctlfd_path, "%s.cfd", location);
+	if (err == -1) {
+		ring->ctlfd_path = NULL;
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	err = tapdisk_uring_connect_ctlfd(ring);
+	if (err)
+		goto fail;
+
+	err = tapdisk_uring_connect_shmem(ring);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	tapdisk_uring_disconnect(ring);
+	return err;
+}
+
+/*
+ * Client-side teardown: unmap the shared memory, then close the control
+ * socket. Always returns 0.
+ */
+int
+tapdisk_uring_disconnect(td_uring_t *ring)
+{
+	tapdisk_uring_disconnect_shmem(ring);
+	tapdisk_uring_disconnect_ctlfd(ring);
+
+	return 0;
+}
+
+/*
+ * Read exactly one td_uring_message_t from @fd.
+ *
+ * @timeout is in seconds; 0 means wait indefinitely. Returns 0 once the
+ * whole message has been read, or -EIO on select failure, timeout, read
+ * error or end of stream.
+ */
+static int
+tapdisk_ring_read_message(int fd, td_uring_message_t *message, int timeout)
+{
+	fd_set readfds;
+	int ret, len, offset;
+	struct timeval tv, *t;
+	/* byte view of the message: offset counts bytes, not messages */
+	char *buf = (char *)message;
+
+	t = NULL;
+	offset = 0;
+	len = sizeof(td_uring_message_t);
+
+	if (timeout) {
+		tv.tv_sec = timeout;
+		tv.tv_usec = 0;
+		t = &tv;
+	}
+
+	while (offset < len) {
+		FD_ZERO(&readfds);
+		FD_SET(fd, &readfds);
+
+		/* we don't bother reinitializing tv. at worst, it will wait a
+		 * bit more time than expected. */
+
+		ret = select(fd + 1, &readfds, NULL, NULL, t);
+		if (ret == -1)
+			break;
+		else if (FD_ISSET(fd, &readfds)) {
+			ret = read(fd, buf + offset, len - offset);
+			if (ret <= 0)
+				break;
+			offset += ret;
+		} else
+			break;
+	}
+
+	if (offset != len)
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * Write exactly one td_uring_message_t to @fd.
+ *
+ * @timeout is in seconds; 0 means wait indefinitely. Returns 0 once the
+ * whole message has been written, or -EIO on select failure, timeout or
+ * write error.
+ */
+static int
+tapdisk_ring_write_message(int fd, td_uring_message_t *message, int timeout)
+{
+	fd_set writefds;
+	int ret, len, offset;
+	struct timeval tv, *t;
+	/* byte view of the message: offset counts bytes, not messages */
+	const char *buf = (const char *)message;
+
+	t = NULL;
+	offset = 0;
+	len = sizeof(td_uring_message_t);
+
+	if (timeout) {
+		tv.tv_sec = timeout;
+		tv.tv_usec = 0;
+		t = &tv;
+	}
+
+	while (offset < len) {
+		FD_ZERO(&writefds);
+		FD_SET(fd, &writefds);
+
+		/* we don't bother reinitializing tv. at worst, it will wait a
+		 * bit more time than expected. */
+
+		ret = select(fd + 1, NULL, &writefds, NULL, t);
+		if (ret == -1)
+			break;
+		else if (FD_ISSET(fd, &writefds)) {
+			ret = write(fd, buf + offset, len - offset);
+			if (ret <= 0)
+				break;
+			offset += ret;
+		} else
+			break;
+	}
+
+	if (offset != len)
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * Wait up to one second for a message on the control socket.
+ *
+ * Returns 0 if a KICK message arrived, -EINVAL for any other message
+ * type, or the error from reading the message.
+ */
+int
+tapdisk_uring_poll(td_uring_t *ring)
+{
+	int err;
+	td_uring_message_t message;
+
+	/* the helper in this file is tapdisk_ring_read_message */
+	err = tapdisk_ring_read_message(ring->ctlfd, &message, 1);
+	if (err)
+		return err;
+
+	if (message.type != TAPDISK_URING_MESSAGE_KICK)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Send a zeroed KICK message over the control socket, waiting up to one
+ * second for it to become writable. Returns 0 on success or the error
+ * from writing the message.
+ */
+int
+tapdisk_uring_kick(td_uring_t *ring)
+{
+	td_uring_message_t message;
+
+	memset(&message, 0, sizeof(td_uring_message_t));
+	message.type = TAPDISK_URING_MESSAGE_KICK;
+
+	/* the helper in this file is tapdisk_ring_write_message */
+	return tapdisk_ring_write_message(ring->ctlfd, &message, 1);
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_RING_H_
+#define _TAPDISK_RING_H_
+
+#include <inttypes.h>
+
+#include <xenctrl.h>
+#include <xen/io/ring.h>
+
+/* short names for the ring structures declared in this header */
+typedef struct td_uring td_uring_t;
+typedef struct td_uring_header td_uring_header_t;
+typedef struct td_uring_request td_uring_request_t;
+typedef struct td_uring_response td_uring_response_t;
+
+/*
+ * Handle for one user-space ring: a control socket plus a shared-memory
+ * region laid out as header, ring area, data area.
+ */
+struct td_uring {
+	/* control socket descriptor; 0 means "not open" */
+	int ctlfd;
+
+	/* heap-allocated "<location>.shm" and "<location>.cfd" paths */
+	char *shmem_path;
+	char *ctlfd_path;
+
+	/* base of the shared mapping, and the ring and data areas in it */
+	void *shmem;
+	void *ring_area;
+	void *data_area;
+
+	/* region sizes in bytes; tapdisk-ring.c reads and writes these,
+	 * so they must be members of the handle */
+	uint32_t ring_size;
+	uint32_t data_size;
+	uint32_t shmem_size;
+};
+
+/*
+ * Header at the start of the shared-memory region; checked by
+ * tapdisk_uring_connect_shmem.
+ *
+ * NOTE(review): the members add up to 8 + 4*4 + 4064 = 4088 bytes, not
+ * 4096 -- confirm whether a page-sized header was intended.
+ */
+struct td_uring_header {
+ /* compared against TAPDISK_URING_COOKIE */
+ char cookie[8];
+ /* compared against TD_URING_CURRENT_VERSION */
+ uint32_t version;
+ /* sizes copied into the td_uring handle on connect */
+ uint32_t shmem_size;
+ uint32_t ring_size;
+ uint32_t data_size;
+ char reserved[4064];
+};
+
+/*
+ * Request entry type for the ring declared by DEFINE_RING_TYPES.
+ * NOTE(review): field meanings are not established in this header;
+ * sec/secs presumably name a starting sector and a sector count --
+ * confirm against the producer.
+ */
+struct td_uring_request {
+ uint8_t op;
+ uint64_t id;
+ uint64_t sec;
+ uint32_t secs;
+ uint32_t offset;
+};
+
+/*
+ * Response entry type for the ring declared by DEFINE_RING_TYPES.
+ * NOTE(review): presumably op and id echo the matching request --
+ * confirm against the consumer.
+ */
+struct td_uring_response {
+ uint8_t op;
+ uint64_t id;
+ uint8_t status;
+};
+
+/* instantiate the ring types from <xen/io/ring.h> for the
+ * request/response pair above, using the td_uring name prefix */
+DEFINE_RING_TYPES(td_uring, td_uring_request_t, td_uring_response_t);
+
+/* creator side: set up "<location>.cfd" and "<location>.shm", and tear
+ * them down again (removing both objects) */
+int tapdisk_uring_create(td_uring_t *, const char *location,
+ uint32_t ring_size, uint32_t data_size);
+int tapdisk_uring_destroy(td_uring_t *);
+
+/* client side: attach to and detach from an existing ring */
+int tapdisk_uring_connect(td_uring_t *, const char *location);
+int tapdisk_uring_disconnect(td_uring_t *);
+
+/* wait for / send a KICK message over the control socket */
+int tapdisk_uring_poll(td_uring_t *);
+int tapdisk_uring_kick(td_uring_t *);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <sys/signal.h>
+
+#include "tapdisk-utils.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/* logging shorthands: DBG takes a tlog level, ERR takes an error number */
+#define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a)
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/* the single server instance operated on by every function in this file */
+ tapdisk_server_t server;
+
+/* iterate over server.vbds; tmp holds the lookahead entry so the
+ * current vbd may be removed inside the loop body */
+#define tapdisk_server_for_each_vbd(vbd, tmp) \
+ list_for_each_entry_safe(vbd, tmp, &server.vbds, next)
+
+/*
+ * Look for an image already opened by any vbd that has the same type
+ * and name as @image. Only images flagged TD_OPEN_SHAREABLE are looked
+ * up. Returns the first match, or NULL.
+ */
+td_image_t *
+tapdisk_server_get_shared_image(td_image_t *image)
+{
+	td_vbd_t *cur_vbd, *safe_vbd;
+	td_image_t *cur_img, *safe_img;
+
+	if (!td_flag_test(image->flags, TD_OPEN_SHAREABLE))
+		return NULL;
+
+	tapdisk_server_for_each_vbd(cur_vbd, safe_vbd) {
+		tapdisk_vbd_for_each_image(cur_vbd, cur_img, safe_img) {
+			if (cur_img->type == image->type &&
+			    strcmp(cur_img->name, image->name) == 0)
+				return cur_img;
+		}
+	}
+
+	return NULL;
+}
+
+/* Return the head of the server's vbd list. */
+struct list_head *
+tapdisk_server_get_all_vbds(void)
+{
+	struct list_head *head = &server.vbds;
+
+	return head;
+}
+
+/* Find the vbd whose uuid equals @uuid; returns NULL if there is none. */
+td_vbd_t *
+tapdisk_server_get_vbd(uint16_t uuid)
+{
+	td_vbd_t *cur, *safe;
+
+	tapdisk_server_for_each_vbd(cur, safe) {
+		if (cur->uuid == uuid)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/* Append @vbd to the end of the server's vbd list. */
+void
+tapdisk_server_add_vbd(td_vbd_t *vbd)
+{
+	struct list_head *all = &server.vbds;
+
+	list_add_tail(&vbd->next, all);
+}
+
+/*
+ * Unlink @vbd from the server list, leave its link node as an empty
+ * list, and let the server stop running if that was the last vbd.
+ */
+void
+tapdisk_server_remove_vbd(td_vbd_t *vbd)
+{
+	struct list_head *link = &vbd->next;
+
+	list_del(link);
+	INIT_LIST_HEAD(link);
+
+	tapdisk_server_check_state();
+}
+
+/* Queue @tiocb on the server-wide AIO queue. */
+void
+tapdisk_server_queue_tiocb(struct tiocb *tiocb)
+{
+	struct tqueue *aio = &server.aio_queue;
+
+	tapdisk_queue_tiocb(aio, tiocb);
+}
+
+/*
+ * Dump the state of the AIO queue and of every vbd to the log, then
+ * flush the log.
+ */
+void
+tapdisk_server_debug(void)
+{
+	td_vbd_t *cur, *safe;
+
+	tapdisk_debug_queue(&server.aio_queue);
+
+	tapdisk_server_for_each_vbd(cur, safe) {
+		tapdisk_vbd_debug(cur);
+	}
+
+	tlog_flush();
+}
+
+/* Stop the main loop once no vbds remain. */
+void
+tapdisk_server_check_state(void)
+{
+	if (!list_empty(&server.vbds))
+		return;
+
+	server.run = 0;
+}
+
+/*
+ * Register an event with the server's scheduler and return the id the
+ * scheduler hands back. All arguments are passed through unchanged.
+ */
+event_id_t
+tapdisk_server_register_event(char mode, int fd,
+			      int timeout, event_cb_t cb, void *data)
+{
+	event_id_t id;
+
+	id = scheduler_register_event(&server.scheduler,
+				      mode, fd, timeout, cb, data);
+
+	return id;
+}
+
+/*
+ * Remove a previously registered event from the server's scheduler.
+ * This is a void function, so the scheduler call is made as a plain
+ * statement rather than as the operand of a return.
+ */
+void
+tapdisk_server_unregister_event(event_id_t event)
+{
+	scheduler_unregister_event(&server.scheduler, event);
+}
+
+/* Pass @seconds to the server scheduler as its maximum timeout. */
+void
+tapdisk_server_set_max_timeout(int seconds)
+{
+	scheduler_set_max_timeout(&server.scheduler, seconds);
+}
+
+/* Intentionally empty; called at the start of every server iteration. */
+static void
+tapdisk_server_assert_locks(void)
+{
+
+}
+
+/*
+ * If any vbd needs a retry, cap the scheduler timeout at
+ * TD_VBD_RETRY_INTERVAL. Stops at the first such vbd.
+ */
+static void
+tapdisk_server_set_retry_timeout(void)
+{
+	td_vbd_t *cur, *safe;
+
+	tapdisk_server_for_each_vbd(cur, safe) {
+		if (!tapdisk_vbd_retry_needed(cur))
+			continue;
+
+		tapdisk_server_set_max_timeout(TD_VBD_RETRY_INTERVAL);
+		return;
+	}
+}
+
+/*
+ * Ask every vbd to check its own progress.
+ *
+ * The timestamp that used to be fetched here was never read, so the
+ * gettimeofday() call and its local are gone.
+ */
+static void
+tapdisk_server_check_progress(void)
+{
+	td_vbd_t *vbd, *tmp;
+
+	tapdisk_server_for_each_vbd(vbd, tmp)
+		tapdisk_vbd_check_progress(vbd);
+}
+
+/* Submit everything queued on the server AIO queue; the count is unused. */
+static void
+tapdisk_server_submit_tiocbs(void)
+{
+	struct tqueue *aio = &server.aio_queue;
+
+	tapdisk_submit_all_tiocbs(aio);
+}
+
+/* Call tapdisk_vbd_kick on every vbd. */
+static void
+tapdisk_server_kick_responses(void)
+{
+	td_vbd_t *cur, *safe;
+
+	tapdisk_server_for_each_vbd(cur, safe) {
+		tapdisk_vbd_kick(cur);
+	}
+}
+
+/* Call tapdisk_vbd_check_state on every vbd. */
+static void
+tapdisk_server_check_vbds(void)
+{
+	td_vbd_t *cur, *safe;
+
+	tapdisk_server_for_each_vbd(cur, safe) {
+		tapdisk_vbd_check_state(cur);
+	}
+}
+
+/* Call tapdisk_vbd_kill_queue on every vbd. */
+static void
+tapdisk_server_stop_vbds(void)
+{
+	td_vbd_t *cur, *safe;
+
+	tapdisk_server_for_each_vbd(cur, safe) {
+		tapdisk_vbd_kill_queue(cur);
+	}
+}
+
+/*
+ * Initialise the server AIO queue with TAPDISK_TIOCBS entries, the
+ * libaio driver and no filter. Returns the result of
+ * tapdisk_init_queue.
+ */
+static int
+tapdisk_server_init_aio(void)
+{
+	struct tqueue *aio = &server.aio_queue;
+
+	return tapdisk_init_queue(aio, TAPDISK_TIOCBS, TIO_DRV_LIO, NULL);
+}
+
+/* Release the server AIO queue. */
+static void
+tapdisk_server_close_aio(void)
+{
+	struct tqueue *aio = &server.aio_queue;
+
+	tapdisk_free_queue(aio);
+}
+
+/* Shut the server down; currently this only releases the AIO queue. */
+static void
+tapdisk_server_close(void)
+{
+	tapdisk_server_close_aio();
+}
+
+/*
+ * Run one pass of the server loop: prepare timeouts and progress
+ * checks, block in the scheduler until events fire, then check vbd
+ * state, submit the I/O that was queued and kick the vbds. A negative
+ * scheduler result is only logged.
+ */
+void
+tapdisk_server_iterate(void)
+{
+	int rc;
+
+	/* before waiting */
+	tapdisk_server_assert_locks();
+	tapdisk_server_set_retry_timeout();
+	tapdisk_server_check_progress();
+
+	rc = scheduler_wait_for_events(&server.scheduler);
+	if (rc < 0)
+		DBG(TLOG_WARN, "server wait returned %d\n", rc);
+
+	/* after waiting */
+	tapdisk_server_check_vbds();
+	tapdisk_server_submit_tiocbs();
+	tapdisk_server_kick_responses();
+}
+
+/* Iterate the server loop until server.run is cleared. */
+static void
+__tapdisk_server_run(void)
+{
+	for (;;) {
+		if (!server.run)
+			break;
+
+		tapdisk_server_iterate();
+	}
+}
+
+/*
+ * Signal handler installed by tapdisk_server_run.
+ *
+ *   SIGBUS, SIGINT  close every vbd
+ *   SIGXFSZ         log an EFBIG error and kill every vbd queue
+ *   SIGUSR1         dump debug state
+ *
+ * Any other signal is ignored.
+ *
+ * NOTE(review): this runs logging and vbd teardown directly in signal
+ * context; whether those callees are async-signal-safe cannot be told
+ * from here -- confirm.
+ */
+static void
+tapdisk_server_signal_handler(int signal)
+{
+	static int xfsz_error_sent = 0;
+	td_vbd_t *cur, *safe;
+
+	if (signal == SIGBUS || signal == SIGINT) {
+		tapdisk_server_for_each_vbd(cur, safe) {
+			tapdisk_vbd_close(cur);
+		}
+	} else if (signal == SIGXFSZ) {
+		ERR(EFBIG, "received SIGXFSZ");
+		tapdisk_server_stop_vbds();
+
+		/* remember that SIGXFSZ has been seen at least once */
+		if (!xfsz_error_sent)
+			xfsz_error_sent = 1;
+	} else if (signal == SIGUSR1) {
+		tapdisk_server_debug();
+	}
+}
+
+/*
+ * Reset the server state to all zeroes, start with an empty vbd list
+ * and initialise the scheduler. Always returns 0.
+ */
+int
+tapdisk_server_init(void)
+{
+	memset(&server, 0, sizeof(server));
+
+	INIT_LIST_HEAD(&server.vbds);
+	scheduler_initialize(&server.scheduler);
+
+	return 0;
+}
+
+/*
+ * Finish server start-up: bring up the AIO queue and mark the server as
+ * running. Returns 0 on success; on failure the AIO queue is released
+ * and the error is returned.
+ */
+int
+tapdisk_server_complete(void)
+{
+	int rc = tapdisk_server_init_aio();
+
+	if (rc) {
+		tapdisk_server_close_aio();
+		return rc;
+	}
+
+	server.run = 1;
+
+	return 0;
+}
+
+/*
+ * Full server start-up: tapdisk_server_init followed by
+ * tapdisk_server_complete. Returns 0 on success; on failure the server
+ * is closed and the error is returned.
+ */
+int
+tapdisk_server_initialize(void)
+{
+	int rc;
+
+	tapdisk_server_init();
+
+	rc = tapdisk_server_complete();
+	if (rc) {
+		tapdisk_server_close();
+		return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * Apply resource limits, install the signal handlers, run the main loop
+ * until the server stops, then close the server.
+ *
+ * Returns the error from tapdisk_set_resource_limits if that fails
+ * (nothing else is done in that case), otherwise 0.
+ */
+int
+tapdisk_server_run()
+{
+	static const int handled[] = { SIGBUS, SIGINT, SIGUSR1, SIGXFSZ };
+	unsigned int idx;
+	int rc;
+
+	rc = tapdisk_set_resource_limits();
+	if (rc)
+		return rc;
+
+	for (idx = 0; idx < sizeof(handled) / sizeof(handled[0]); idx++)
+		signal(handled[idx], tapdisk_server_signal_handler);
+
+	__tapdisk_server_run();
+	tapdisk_server_close();
+
+	return 0;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_SERVER_H_
+#define _TAPDISK_SERVER_H_
+
+#include "list.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-queue.h"
+
+struct tap_disk *tapdisk_server_find_driver_interface(int);
+
+td_image_t *tapdisk_server_get_shared_image(td_image_t *);
+
+struct list_head *tapdisk_server_get_all_vbds(void);
+/* look a vbd up by uuid; callers treat a NULL result as "not registered" */
+td_vbd_t *tapdisk_server_get_vbd(td_uuid_t);
+void tapdisk_server_add_vbd(td_vbd_t *);
+void tapdisk_server_remove_vbd(td_vbd_t *);
+
+void tapdisk_server_queue_tiocb(struct tiocb *);
+
+void tapdisk_server_check_state(void);
+
+event_id_t tapdisk_server_register_event(char, int, int, event_cb_t, void *);
+void tapdisk_server_unregister_event(event_id_t);
+void tapdisk_server_set_max_timeout(int);
+
+/*
+ * Server life cycle:
+ *   tapdisk_server_init()       zero the server, init vbd list + scheduler
+ *   tapdisk_server_complete()   set up the AIO queue and mark it runnable
+ *   tapdisk_server_initialize() init() followed by complete()
+ *   tapdisk_server_run()        set limits, install signal handlers, loop
+ *   tapdisk_server_iterate()    one pass of the main loop
+ */
+int tapdisk_server_init(void);
+int tapdisk_server_initialize(void);
+int tapdisk_server_complete(void);
+int tapdisk_server_run(void);
+void tapdisk_server_iterate(void);
+
+/* count passed to tapdisk_init_queue() when the server AIO queue is set up */
+#define TAPDISK_TIOCBS (TAPDISK_DATA_REQUESTS + 50)
+
+/* Process-wide server state. */
+typedef struct tapdisk_server {
+ /* main-loop flag: set to 1 by tapdisk_server_complete(), loop ends at 0 */
+ int run;
+ /* list head of registered vbds */
+ struct list_head vbds;
+ /* event scheduler waited on once per loop iteration */
+ scheduler_t scheduler;
+ /* asynchronous I/O queue, created in complete() and freed on close */
+ struct tqueue aio_queue;
+} tapdisk_server_t;
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+
+#include "list.h"
+#include "scheduler.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-server.h"
+#include "tapdisk-disktype.h"
+#include "tapdisk-utils.h"
+
+/* indices into tapdisk_stream_poll.pipe[], in pipe(2) order (read, write) */
+#define POLL_READ 0
+#define POLL_WRITE 1
+
+/* smaller of two values; each argument may be evaluated twice */
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+/*
+ * Self-pipe used to wake the enqueue event: the read end is registered
+ * with the server scheduler, the write end is poked after a completion.
+ */
+struct tapdisk_stream_poll {
+ /* pipe(2) descriptors, both -1 while closed */
+ int pipe[2];
+ /* 1 after tapdisk_stream_poll_set() wrote a wakeup, 0 once cleared */
+ int set;
+};
+
+/* One in-flight (or recyclable) read issued by the stream. */
+struct tapdisk_stream_request {
+ /* first sector covered by this request */
+ uint64_t sec;
+ /* number of sectors covered, summed over all segments */
+ uint32_t secs;
+ /* issue order; output is written strictly in seqno order */
+ uint64_t seqno;
+ /* block request handed to the vbd */
+ blkif_request_t blkif_req;
+ /* linkage on the stream's free, pending or completed list */
+ struct list_head next;
+};
+
+/* State of one image-to-stdout streaming session. */
+struct tapdisk_stream {
+ /* vbd the image is opened on; NULL once the image has been closed */
+ td_vbd_t *vbd;
+
+ /* vbd uuid, taken from tapdisk_stream_count */
+ unsigned int id;
+ /* initialized to -1 and not otherwise used in this file's visible code */
+ int in_fd;
+ /* duplicate of stdout that image data is written to */
+ int out_fd;
+
+ /* non-zero once a request failed; stops further enqueueing */
+ int err;
+
+ /* next sector to request */
+ uint64_t cur;
+ /* first sector of the range (the skip value) */
+ uint64_t start;
+ /* one past the last sector of the range */
+ uint64_t end;
+
+ /* sequence number handed to the next issued request */
+ uint64_t started;
+ /* sequence number of the next request due to be written out */
+ uint64_t completed;
+
+ /* self-pipe used to trigger tapdisk_stream_enqueue() */
+ struct tapdisk_stream_poll poll;
+ /* scheduler id of the enqueue event; 0 is treated as "not registered" */
+ event_id_t enqueue_event_id;
+
+ /* requests available for reuse */
+ struct list_head free_list;
+ /* requests issued and awaiting a response */
+ struct list_head pending_list;
+ /* successful requests waiting to be written, sorted by seqno */
+ struct list_head completed_list;
+
+ /* backing storage for every request; the array index is the request id */
+ struct tapdisk_stream_request requests[MAX_REQUESTS];
+};
+
+/* number of streams opened so far; supplies the id of the next stream */
+static unsigned int tapdisk_stream_count;
+
+static void tapdisk_stream_close_image(struct tapdisk_stream *);
+
+/*
+ * Print the command-line synopsis on stdout and exit with status 'err'.
+ */
+static void
+usage(const char *app, int err)
+{
+	fprintf(stdout, "usage: %s <-n type:/path/to/image> "
+		"[-c sector count] [-s skip sectors]\n", app);
+	exit(err);
+}
+
+/*
+ * Put a poll object into its "closed" state: both descriptors -1 and no
+ * wakeup outstanding.
+ */
+static inline void
+tapdisk_stream_poll_initialize(struct tapdisk_stream_poll *p)
+{
+	p->pipe[POLL_WRITE] = -1;
+	p->pipe[POLL_READ] = -1;
+	p->set = 0;
+}
+
+/*
+ * Create the wakeup pipe and switch both ends to non-blocking mode.
+ *
+ * Returns 0 on success or a negative errno value. On failure the pipe is
+ * closed again and 'p' is left in its initialized (closed) state.
+ */
+static int
+tapdisk_stream_poll_open(struct tapdisk_stream_poll *p)
+{
+	int end, saved;
+
+	tapdisk_stream_poll_initialize(p);
+
+	if (pipe(p->pipe))
+		return -errno;
+
+	for (end = POLL_READ; end <= POLL_WRITE; end++) {
+		if (fcntl(p->pipe[end], F_SETFL, O_NONBLOCK)) {
+			/*
+			 * Capture errno now: the close() calls below may
+			 * overwrite it before it is returned.
+			 */
+			saved = errno;
+			goto fail;
+		}
+	}
+
+	return 0;
+
+fail:
+	close(p->pipe[POLL_READ]);
+	close(p->pipe[POLL_WRITE]);
+	tapdisk_stream_poll_initialize(p);
+	return -saved;
+}
+
+/*
+ * Close whichever pipe ends are open and reset 'p' to the closed state.
+ */
+static void
+tapdisk_stream_poll_close(struct tapdisk_stream_poll *p)
+{
+	int end;
+
+	for (end = POLL_READ; end <= POLL_WRITE; end++) {
+		if (p->pipe[end] == -1)
+			continue;
+		close(p->pipe[end]);
+	}
+
+	tapdisk_stream_poll_initialize(p);
+}
+
+/*
+ * Consume one wakeup token from the pipe and mark the poll as not set.
+ * The read end is non-blocking and the read result is ignored, so
+ * clearing an empty pipe is harmless.
+ */
+static inline void
+tapdisk_stream_poll_clear(struct tapdisk_stream_poll *p)
+{
+	int token;
+
+	(void)read_exact(p->pipe[POLL_READ], &token, sizeof(token));
+	p->set = 0;
+}
+
+/*
+ * Post a wakeup token unless one is already outstanding, so at most one
+ * token sits in the pipe at a time.
+ */
+static inline void
+tapdisk_stream_poll_set(struct tapdisk_stream_poll *p)
+{
+	int token = 0;
+
+	if (p->set)
+		return;
+
+	write_exact(p->pipe[POLL_WRITE], &token, sizeof(token));
+	p->set = 1;
+}
+
+/*
+ * Return 1 when the stream is finished: nothing is pending and either the
+ * whole range has been requested or an error was recorded. Otherwise 0.
+ */
+static inline int
+tapdisk_stream_stop(struct tapdisk_stream *s)
+{
+	if (!list_empty(&s->pending_list))
+		return 0;
+
+	return (s->cur == s->end || s->err);
+}
+
+/*
+ * Zero a request and make its list node an empty, self-linked list.
+ */
+static inline void
+tapdisk_stream_initialize_request(struct tapdisk_stream_request *req)
+{
+	memset(req, 0, sizeof(struct tapdisk_stream_request));
+	INIT_LIST_HEAD(&req->next);
+}
+
+/*
+ * Index of 'req' within s->requests[].
+ */
+static inline int
+tapdisk_stream_request_idx(struct tapdisk_stream *s,
+			   struct tapdisk_stream_request *req)
+{
+	return (req - &s->requests[0]);
+}
+
+/*
+ * Take the first request off the free list and hand it back freshly
+ * re-initialized. Returns NULL when no free request is available.
+ */
+static inline struct tapdisk_stream_request *
+tapdisk_stream_get_request(struct tapdisk_stream *s)
+{
+	struct list_head *first;
+	struct tapdisk_stream_request *req;
+
+	if (list_empty(&s->free_list))
+		return NULL;
+
+	first = s->free_list.next;
+	req = list_entry(first, struct tapdisk_stream_request, next);
+
+	list_del_init(&req->next);
+	tapdisk_stream_initialize_request(req);
+
+	return req;
+}
+
+/*
+ * Write the data read for 'sreq' to the stream's output descriptor.
+ *
+ * The data lives in the ring buffer slot with the same index as the
+ * request. A failed write is reported on stderr and recorded in s->err
+ * (unless an earlier error is already stored), so the stream stops issuing
+ * requests and the process exits non-zero instead of silently emitting a
+ * truncated image.
+ */
+static void
+tapdisk_stream_print_request(struct tapdisk_stream *s,
+			     struct tapdisk_stream_request *sreq)
+{
+	unsigned long idx = (unsigned long)tapdisk_stream_request_idx(s, sreq);
+	char *buf = (char *)MMAP_VADDR(s->vbd->ring.vstart, idx, 0);
+	int saved;
+
+	if (write_exact(s->out_fd, buf, sreq->secs << SECTOR_SHIFT)) {
+		/* write_exact() may fail without a meaningful errno */
+		saved = errno ? errno : EIO;
+		fprintf(stderr, "error writing sector 0x%"PRIx64": %d\n",
+			sreq->sec, saved);
+		if (!s->err)
+			s->err = saved;
+	}
+}
+
+/*
+ * Flush completed requests to the output in sequence order.
+ *
+ * The completed list is sorted by seqno, so only its head can be the next
+ * one due. Requests are written and recycled onto the free list until the
+ * head is not the expected sequence number or the list is empty.
+ */
+static void
+tapdisk_stream_write_data(struct tapdisk_stream *s)
+{
+	struct tapdisk_stream_request *head;
+
+	while (!list_empty(&s->completed_list)) {
+		head = list_entry(s->completed_list.next,
+				  struct tapdisk_stream_request, next);
+
+		/* gap in the sequence: an earlier request is still pending */
+		if (head->seqno != s->completed)
+			break;
+
+		s->completed++;
+		tapdisk_stream_print_request(s, head);
+
+		list_del_init(&head->next);
+		list_add_tail(&head->next, &s->free_list);
+	}
+}
+
+/*
+ * Insert 'sreq' into the completed list, keeping it ordered by ascending
+ * seqno: it goes in front of the first entry with a larger seqno, or at
+ * the tail when there is none.
+ */
+static inline void
+tapdisk_stream_queue_completed(struct tapdisk_stream *s,
+			       struct tapdisk_stream_request *sreq)
+{
+	struct tapdisk_stream_request *itr;
+	struct list_head *before = &s->completed_list;
+
+	list_for_each_entry(itr, &s->completed_list, next) {
+		if (sreq->seqno < itr->seqno) {
+			before = &itr->next;
+			break;
+		}
+	}
+
+	/* adding "at the tail" of a node places the entry just before it */
+	list_add_tail(&sreq->next, before);
+}
+
+/*
+ * Completion callback registered on the vbd (see
+ * tapdisk_stream_open_image()).
+ *
+ * arg: the owning struct tapdisk_stream.
+ * rsp: block response; rsp->id is the index of the stream request.
+ *
+ * A successful request is queued for in-order output. A failed one
+ * records EIO in s->err and is returned to the free list. Either way,
+ * output that is now in sequence is flushed and the enqueue event is
+ * woken.
+ */
+static void
+tapdisk_stream_dequeue(void *arg, blkif_response_t *rsp)
+{
+	struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+	struct tapdisk_stream_request *sreq = s->requests + rsp->id;
+
+	/* take the request off the pending list */
+	list_del_init(&sreq->next);
+
+	if (rsp->status == BLKIF_RSP_OKAY)
+		tapdisk_stream_queue_completed(s, sreq);
+	else {
+		s->err = EIO;
+		list_add_tail(&sreq->next, &s->free_list);
+		/* the "0x" prefix needs a hexadecimal conversion */
+		fprintf(stderr, "error reading sector 0x%"PRIx64"\n", sreq->sec);
+	}
+
+	tapdisk_stream_write_data(s);
+	tapdisk_stream_poll_set(&s->poll);
+}
+
+/*
+ * Scheduler callback for the stream's wakeup pipe; also called once
+ * directly by tapdisk_stream_run() to start the transfer.
+ *
+ * Closes the image when the stream is finished. Otherwise builds as many
+ * read requests as there are free stream requests and sectors left, moves
+ * them onto the vbd's new-request list and asks the vbd to issue them.
+ *
+ * id, mode: event id and mode; unused here.
+ * arg:      the owning struct tapdisk_stream.
+ */
+static void
+tapdisk_stream_enqueue(event_id_t id, char mode, void *arg)
+{
+ td_vbd_t *vbd;
+ int i, idx, psize;
+ struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+
+ vbd = s->vbd;
+ /* consume the wakeup token that triggered this call (if any) */
+ tapdisk_stream_poll_clear(&s->poll);
+
+ if (tapdisk_stream_stop(s)) {
+ tapdisk_stream_close_image(s);
+ return;
+ }
+
+ psize = getpagesize();
+
+ while (s->cur < s->end && !s->err) {
+ blkif_request_t *breq;
+ td_vbd_request_t *vreq;
+ struct tapdisk_stream_request *sreq;
+
+ /* stop when every stream request is already in use */
+ sreq = tapdisk_stream_get_request(s);
+ if (!sreq)
+ break;
+
+ idx = tapdisk_stream_request_idx(s, sreq);
+
+ sreq->sec = s->cur;
+ sreq->secs = 0;
+ sreq->seqno = s->started++;
+
+ /* the request id is the array index; dequeue maps rsp->id back */
+ breq = &sreq->blkif_req;
+ breq->id = idx;
+ breq->nr_segments = 0;
+ breq->sector_number = sreq->sec;
+ breq->operation = BLKIF_OP_READ;
+
+ /* one segment per page, up to the per-request segment limit */
+ for (i = 0; i < BLKIF_MAX_SEGMENTS_PER_REQUEST; i++) {
+ uint32_t secs = MIN(s->end - s->cur, psize >> SECTOR_SHIFT);
+ struct blkif_request_segment *seg = breq->seg + i;
+
+ /* range exhausted */
+ if (!secs)
+ break;
+
+ sreq->secs += secs;
+ s->cur += secs;
+
+ seg->first_sect = 0;
+ seg->last_sect = secs - 1;
+ breq->nr_segments++;
+ }
+
+ /* the vbd request slot with the same index carries this request */
+ vreq = vbd->request_list + idx;
+
+ assert(list_empty(&vreq->next));
+ assert(vreq->secs_pending == 0);
+
+ memcpy(&vreq->req, breq, sizeof(*breq));
+ vbd->received++;
+ vreq->vbd = vbd;
+
+ tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+ list_add_tail(&sreq->next, &s->pending_list);
+ }
+
+ tapdisk_vbd_issue_requests(vbd);
+}
+
+/*
+ * Bring up the tapdisk server, create a vbd for this stream and open the
+ * image read-only on it, with tapdisk_stream_dequeue() installed as the
+ * completion callback.
+ *
+ * Returns 0 on success. On failure a message is printed to stderr and the
+ * failing step's error code is returned (ENODEV when the new vbd cannot
+ * be looked up).
+ */
+static int
+tapdisk_stream_open_image(struct tapdisk_stream *s, const char *path, int type)
+{
+	int err;
+
+	s->id = tapdisk_stream_count++;
+
+	err = tapdisk_server_initialize();
+	if (!err)
+		err = tapdisk_vbd_initialize(s->id);
+	if (err)
+		goto fail;
+
+	s->vbd = tapdisk_server_get_vbd(s->id);
+	if (!s->vbd) {
+		err = ENODEV;
+		goto fail;
+	}
+
+	tapdisk_vbd_set_callback(s->vbd, tapdisk_stream_dequeue, s);
+
+	err = tapdisk_vbd_open_vdi(s->vbd, path, type,
+				   TAPDISK_STORAGE_TYPE_DEFAULT,
+				   TD_OPEN_RDONLY);
+	if (err)
+		goto fail;
+
+	s->vbd->reopened = 1;
+	return 0;
+
+fail:
+	fprintf(stderr, "failed to open %s: %d\n", path, err);
+	return err;
+}
+
+/*
+ * Close the stream's image and destroy its vbd, if the vbd is still
+ * registered with the server. The vbd is looked up by id, so a second
+ * call after the vbd was removed does nothing.
+ */
+static void
+tapdisk_stream_close_image(struct tapdisk_stream *s)
+{
+	td_vbd_t *vbd = tapdisk_server_get_vbd(s->id);
+
+	if (!vbd)
+		return;
+
+	tapdisk_vbd_close_vdi(vbd);
+	tapdisk_server_remove_vbd(vbd);
+
+	/* ring.vstart holds the buffer allocated by the stream itself */
+	free((void *)vbd->ring.vstart);
+	free(vbd->name);
+	free(vbd);
+
+	s->vbd = NULL;
+}
+
+/*
+ * Select the sector range [skip, skip + count) to stream.
+ *
+ * count: number of sectors, or (uint64_t)-1 for "up to the end of image".
+ * skip:  first sector to stream.
+ *
+ * Returns 0 on success, the error from tapdisk_vbd_get_image_info(), or
+ * -EINVAL when the range does not fit inside the image. The range checks
+ * are done without unsigned wrap-around: a skip beyond the image or a
+ * huge count would otherwise pass the check and leave cur > end, a state
+ * tapdisk_stream_stop() never reports as finished.
+ */
+static int
+tapdisk_stream_set_position(struct tapdisk_stream *s,
+			    uint64_t count, uint64_t skip)
+{
+	int err;
+	image_t image;
+	uint64_t size, last;
+
+	err = tapdisk_vbd_get_image_info(s->vbd, &image);
+	if (err) {
+		fprintf(stderr, "failed getting image size: %d\n", err);
+		return err;
+	}
+
+	size = (uint64_t)image.size;
+
+	/* reject the start first so that 'size - skip' cannot underflow */
+	if (skip > size) {
+		fprintf(stderr, "0x%"PRIx64" past end of image 0x%"PRIx64"\n",
+			skip, size);
+		return -EINVAL;
+	}
+
+	if (count == (uint64_t)-1)
+		count = size - skip;
+
+	if (count > size - skip) {
+		/* saturate the reported end instead of letting it wrap */
+		last = (count > (uint64_t)-1 - skip) ?
+			(uint64_t)-1 : count + skip;
+		fprintf(stderr, "0x%"PRIx64" past end of image 0x%"PRIx64"\n",
+			last, size);
+		return -EINVAL;
+	}
+
+	s->start = skip;
+	s->cur = s->start;
+	s->end = s->start + count;
+
+	return 0;
+}
+
+/*
+ * Allocate the page-aligned data buffer the vbd will read into and put
+ * every stream request on the free list.
+ *
+ * Returns 0 on success or the posix_memalign() error code, in which case
+ * ring->vstart is reset to 0.
+ */
+static int
+tapdisk_stream_initialize_requests(struct tapdisk_stream *s)
+{
+	td_ring_t *ring = &s->vbd->ring;
+	int pagesz = getpagesize();
+	size_t bytes = pagesz * BLKTAP_MMAP_REGION_SIZE;
+	int rc, n;
+
+	/* sneaky -- set up ring->vstart so tapdisk_vbd will use our buffers */
+	rc = posix_memalign((void **)&ring->vstart, pagesz, bytes);
+	if (rc) {
+		fprintf(stderr, "failed to allocate buffers: %d\n", rc);
+		ring->vstart = 0;
+		return rc;
+	}
+
+	for (n = 0; n < MAX_REQUESTS; n++) {
+		struct tapdisk_stream_request *slot = &s->requests[n];
+
+		tapdisk_stream_initialize_request(slot);
+		list_add_tail(&slot->next, &s->free_list);
+	}
+
+	return 0;
+}
+
+/*
+ * Open the wakeup pipe and register its read end with the server so that
+ * tapdisk_stream_enqueue() runs whenever the pipe becomes readable.
+ *
+ * Returns 0 on success with the event id stored in s->enqueue_event_id.
+ * On failure the error is printed to stderr and returned.
+ */
+static int
+tapdisk_stream_register_enqueue_event(struct tapdisk_stream *s)
+{
+	struct tapdisk_stream_poll *p = &s->poll;
+	int rc;
+
+	rc = tapdisk_stream_poll_open(p);
+	if (!rc) {
+		rc = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+						   p->pipe[POLL_READ], 0,
+						   tapdisk_stream_enqueue, s);
+		if (rc >= 0) {
+			/* a non-negative result is the event id */
+			s->enqueue_event_id = rc;
+			return 0;
+		}
+	}
+
+	fprintf(stderr, "failed to register event: %d\n", rc);
+	return rc;
+}
+
+/*
+ * Undo tapdisk_stream_register_enqueue_event(): drop the scheduler event
+ * when an id is recorded (0 is treated as "none") and close the pipe.
+ */
+static void
+tapdisk_stream_unregister_enqueue_event(struct tapdisk_stream *s)
+{
+	event_id_t id = s->enqueue_event_id;
+
+	if (id) {
+		tapdisk_server_unregister_event(id);
+		s->enqueue_event_id = 0;
+	}
+
+	tapdisk_stream_poll_close(&s->poll);
+}
+
+/*
+ * Zero the stream, mark both descriptors as closed (-1) and start with
+ * empty free, pending and completed lists.
+ */
+static inline void
+tapdisk_stream_initialize(struct tapdisk_stream *s)
+{
+	memset(s, 0, sizeof(struct tapdisk_stream));
+
+	s->in_fd = -1;
+	s->out_fd = -1;
+
+	INIT_LIST_HEAD(&s->free_list);
+	INIT_LIST_HEAD(&s->pending_list);
+	INIT_LIST_HEAD(&s->completed_list);
+}
+
+/*
+ * Duplicate stdout for use as the stream's output descriptor.
+ *
+ * Returns 0 on success or the (positive) errno value from dup().
+ */
+static int
+tapdisk_stream_open_fds(struct tapdisk_stream *s)
+{
+	int saved;
+
+	s->out_fd = dup(STDOUT_FILENO);
+	if (s->out_fd == -1) {
+		/* fprintf() may modify errno, so capture it first */
+		saved = errno;
+		fprintf(stderr, "failed to open output: %d\n", saved);
+		return saved;
+	}
+
+	return 0;
+}
+
+/*
+ * Prepare a stream end to end: initialize it, open the output descriptor
+ * and the image, select the sector range, allocate the requests and
+ * register the enqueue event. Steps run in that order and the first
+ * failure is returned; later steps are skipped.
+ */
+static int
+tapdisk_stream_open(struct tapdisk_stream *s, const char *path,
+		    int type, uint64_t count, uint64_t skip)
+{
+	int err;
+
+	tapdisk_stream_initialize(s);
+
+	err = tapdisk_stream_open_fds(s);
+	if (!err)
+		err = tapdisk_stream_open_image(s, path, type);
+	if (!err)
+		err = tapdisk_stream_set_position(s, count, skip);
+	if (!err)
+		err = tapdisk_stream_initialize_requests(s);
+	if (!err)
+		err = tapdisk_stream_register_enqueue_event(s);
+
+	return err;
+}
+
+/*
+ * Release everything tapdisk_stream_open() may have set up: the output
+ * descriptor, the image and its vbd, then the enqueue event and pipe.
+ */
+static void
+tapdisk_stream_release(struct tapdisk_stream *s)
+{
+	int fd = s->out_fd;
+
+	close(fd);
+	tapdisk_stream_close_image(s);
+	tapdisk_stream_unregister_enqueue_event(s);
+}
+
+/*
+ * Start the transfer and run the server loop until it ends.
+ *
+ * The enqueue callback is invoked once by hand to issue the first batch
+ * of requests; from then on it is driven by the wakeup pipe. Returns the
+ * stream's recorded error, 0 if none.
+ */
+static int
+tapdisk_stream_run(struct tapdisk_stream *s)
+{
+	event_id_t id = s->enqueue_event_id;
+
+	tapdisk_stream_enqueue(id, SCHEDULER_POLL_READ_FD, s);
+	tapdisk_server_run();
+
+	return s->err;
+}
+
+/*
+ * tapdisk-stream entry point.
+ *
+ * Options:
+ *   -n type:/path/to/image   image to stream (required)
+ *   -c count                 number of sectors (default: to end of image)
+ *   -s skip                  first sector to stream (default: 0)
+ *   -h                       print usage and exit 0
+ *
+ * Returns 0 on success, otherwise the error from parsing the image
+ * parameters, from opening the stream or from running it.
+ */
+int
+main(int argc, char *argv[])
+{
+	struct tapdisk_stream stream;
+	const char *params = NULL;
+	const char *path;
+	uint64_t count = (uint64_t)-1;
+	uint64_t skip = 0;
+	int err = 0;
+	int opt, type;
+
+	while ((opt = getopt(argc, argv, "n:c:s:h")) != -1) {
+		switch (opt) {
+		case 'n':
+			params = optarg;
+			break;
+		case 'c':
+			count = strtoull(optarg, NULL, 10);
+			break;
+		case 's':
+			skip = strtoull(optarg, NULL, 10);
+			break;
+		default:
+			err = EINVAL;
+			/* fall through: unknown options also print usage */
+		case 'h':
+			usage(argv[0], err);
+		}
+	}
+
+	if (!params)
+		usage(argv[0], EINVAL);
+
+	type = tapdisk_disktype_parse_params(params, &path);
+	if (type < 0) {
+		fprintf(stderr, "invalid argument %s: %d\n", params, type);
+		return type;
+	}
+
+	tapdisk_start_logging("tapdisk-stream");
+
+	err = tapdisk_stream_open(&stream, path, type, count, skip);
+	if (!err)
+		err = tapdisk_stream_run(&stream);
+
+	/* cleanup runs on every path, including a failed open */
+	tapdisk_stream_release(&stream);
+	tapdisk_stop_logging();
+
+	return err;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/utsname.h>
+#ifdef __linux__
+#include <linux/version.h>
+#endif
+
+#include "blk.h"
+#include "tapdisk.h"
+#include "blktaplib.h"
+#include "tapdisk-log.h"
+#include "tapdisk-utils.h"
+
+/*
+ * Open syslog under the identity "<name>[<pid>]" and open the tlog file
+ * /tmp/tapdisk.log (64 KiB, TLOG_WARN level).
+ *
+ * The identity buffer is static so it remains valid after this function
+ * returns.
+ */
+void
+tapdisk_start_logging(const char *name)
+{
+	static char ident[128];
+
+	snprintf(ident, sizeof(ident), "%s[%d]", name, getpid());
+	openlog(ident, LOG_CONS | LOG_ODELAY, LOG_DAEMON);
+	open_tlog("/tmp/tapdisk.log", (64 * 1024), TLOG_WARN, 0);
+}
+
+/*
+ * Counterpart of tapdisk_start_logging(): close syslog, then the tlog.
+ */
+void
+tapdisk_stop_logging(void)
+{
+	closelog();
+	close_tlog();
+}
+
+/*
+ * Lift the locked-memory limit, lock all current and future pages into
+ * memory and (best effort) lift the core-dump size limit.
+ *
+ * Returns 0 on success or a negative errno value when RLIMIT_MEMLOCK or
+ * mlockall() fails. A failure to raise RLIMIT_CORE is only logged.
+ */
+int
+tapdisk_set_resource_limits(void)
+{
+	int saved;
+	struct rlimit rlim;
+
+	rlim.rlim_cur = RLIM_INFINITY;
+	rlim.rlim_max = RLIM_INFINITY;
+
+	if (setrlimit(RLIMIT_MEMLOCK, &rlim) == -1) {
+		/* capture errno before logging, which may overwrite it */
+		saved = errno;
+		EPRINTF("RLIMIT_MEMLOCK failed: %d\n", saved);
+		return -saved;
+	}
+
+	if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1) {
+		saved = errno;
+		EPRINTF("mlockall failed: %d\n", saved);
+		return -saved;
+	}
+
+#define CORE_DUMP
+#if defined(CORE_DUMP)
+	if (setrlimit(RLIMIT_CORE, &rlim) == -1)
+		EPRINTF("RLIMIT_CORE failed: %d\n", errno);
+#endif
+
+	return 0;
+}
+
+/*
+ * Duplicate 'name' into a newly allocated string stored in *dup.
+ *
+ * Returns 0 on success, -ENAMETOOLONG when the name is MAX_NAME_LEN
+ * characters or longer, or -ENOMEM when allocation fails. *dup is NULL on
+ * every failure path.
+ */
+int
+tapdisk_namedup(char **dup, const char *name)
+{
+	char *copy;
+
+	*dup = NULL;
+
+	if (strnlen(name, MAX_NAME_LEN) >= MAX_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	copy = strdup(name);
+	if (!copy)
+		return -ENOMEM;
+
+	*dup = copy;
+	return 0;
+}
+
+/*
+ * Determine the size of the image behind 'fd'.
+ *
+ * fd:           open descriptor of a block device or a regular file.
+ * _sectors:     out, number of sectors.
+ * _sector_size: out, sector size in bytes.
+ *
+ * Block devices are queried through blk_getimagesize() and
+ * blk_getsectorsize(); for anything else the size comes from fstat() with
+ * DEFAULT_SECTOR_SIZE. A size of 0 is replaced by a fixed fallback of
+ * 16836057 sectors.
+ *
+ * Returns 0 on success with both outputs filled in, or -EINVAL when the
+ * descriptor cannot be examined; both outputs are 0 in that case.
+ */
+int
+tapdisk_get_image_size(int fd, uint64_t *_sectors, uint32_t *_sector_size)
+{
+	struct stat stat;
+	uint64_t sectors;
+	uint64_t sector_size;
+
+	sectors = 0;
+	sector_size = 0;
+	*_sectors = 0;
+	*_sector_size = 0;
+
+	if (fstat(fd, &stat)) {
+		DPRINTF("ERROR: fstat failed, Couldn't stat image");
+		return -EINVAL;
+	}
+
+	if (S_ISBLK(stat.st_mode)) {
+		/*Accessing block device directly*/
+		if (blk_getimagesize(fd, &sectors) != 0)
+			return -EINVAL;
+
+		/*Get the sector size*/
+		if (blk_getsectorsize(fd, &sector_size) != 0)
+			sector_size = DEFAULT_SECTOR_SIZE;
+	} else {
+		/*Local file? try fstat instead*/
+		sectors = (stat.st_size >> SECTOR_SHIFT);
+		sector_size = DEFAULT_SECTOR_SIZE;
+	}
+
+	if (sectors == 0) {
+		sectors = 16836057ULL;
+		sector_size = DEFAULT_SECTOR_SIZE;
+	}
+
+	/* hand the results back; without this callers only ever see zeros */
+	*_sectors = sectors;
+	*_sector_size = (uint32_t)sector_size;
+
+	return 0;
+}
+
+#ifdef __linux__
+
+/*
+ * Return the running kernel's version encoded with KERNEL_VERSION().
+ *
+ * Returns a negative errno value when uname() fails, or -ENOSYS when the
+ * release string does not start with three dot-separated numbers.
+ */
+int tapdisk_linux_version(void)
+{
+	struct utsname uts;
+	unsigned int major, minor, patch;
+
+	if (uname(&uts))
+		return -errno;
+
+	if (sscanf(uts.release, "%u.%u.%u", &major, &minor, &patch) != 3)
+		return -ENOSYS;
+
+	return KERNEL_VERSION(major, minor, patch);
+}
+
+#else
+
+/* Builds without __linux__ have no Linux kernel version to report. */
+int tapdisk_linux_version(void)
+{
+	return -ENOSYS;
+}
+
+#endif
+/*
+ * Read exactly 'size' bytes from 'fd' into 'data', retrying short reads
+ * and reads interrupted by a signal.
+ *
+ * Returns 0 once everything has been read, -1 otherwise. On end of file
+ * errno is set to 0; on a read error it is left as set by read().
+ */
+int read_exact(int fd, void *data, size_t size)
+{
+	char *cursor = data;
+	size_t remaining = size;
+	ssize_t got;
+
+	while ( remaining > 0 )
+	{
+		got = read(fd, cursor, remaining);
+		if ( got > 0 )
+		{
+			cursor += got;
+			remaining -= got;
+			continue;
+		}
+		if ( (got == -1) && (errno == EINTR) )
+			continue;
+		if ( got == 0 )
+			errno = 0;
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Write exactly 'size' bytes from 'data' to 'fd', retrying short writes
+ * and writes interrupted by a signal.
+ *
+ * Returns 0 once everything has been written, -1 if write() fails or
+ * reports that it wrote nothing.
+ */
+int write_exact(int fd, const void *data, size_t size)
+{
+	const char *cursor = data;
+	size_t remaining = size;
+	ssize_t put;
+
+	while ( remaining > 0 )
+	{
+		put = write(fd, cursor, remaining);
+		if ( put > 0 )
+		{
+			cursor += put;
+			remaining -= put;
+			continue;
+		}
+		if ( (put == -1) && (errno == EINTR) )
+			continue;
+		return -1;
+	}
+
+	return 0;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_UTILS_H_
+#define _TAPDISK_UTILS_H_
+
+#include <inttypes.h>
+
+/* tapdisk_namedup() rejects names of this length or longer */
+#define MAX_NAME_LEN 1000
+
+/* open syslog as "<name>[<pid>]" plus the tlog file / close both again */
+void tapdisk_start_logging(const char *);
+void tapdisk_stop_logging(void);
+/* raise RLIMIT_MEMLOCK, mlockall(), raise RLIMIT_CORE; 0 or -errno */
+int tapdisk_set_resource_limits(void);
+/* strdup() with a length check; 0, -ENAMETOOLONG or -ENOMEM */
+int tapdisk_namedup(char **, const char *);
+/* size of the image behind an fd: (fd, out sectors, out sector size) */
+int tapdisk_get_image_size(int, uint64_t *, uint32_t *);
+/* running kernel as a KERNEL_VERSION() value, or a negative errno value */
+int tapdisk_linux_version(void);
+
+/* transfer exactly 'size' bytes, retrying on EINTR; 0 on success, else -1 */
+int read_exact(int fd, void *data, size_t size); /* EOF => -1, errno=0 */
+int write_exact(int fd, const void *data, size_t size);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <regex.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <libgen.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#ifdef MEMSHR
+#include <memshr.h>
+#endif
+
+#include "tapdisk-image.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-disktype.h"
+#include "tapdisk-vbd.h"
+#include "blktap2.h"
+
+/* file-local shorthands for the tlog writers */
+#define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a)
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/*
+ * ASSERT(p): when 'p' is false, log the failed expression with its line
+ * and file through DPRINTF, then abort(). The "#if 1" keeps the checking
+ * variant enabled; the #else arm compiles assertions out.
+ */
+#if 1
+#define ASSERT(p) \
+ do { \
+ if (!(p)) { \
+ DPRINTF("Assertion '%s' failed, line %d, " \
+ "file %s", #p, __LINE__, __FILE__); \
+ abort(); \
+ } \
+ } while (0)
+#else
+#define ASSERT(p) ((void)0)
+#endif
+
+
+/*
+ * Retry and watchdog tunables. Their consumers are outside this excerpt;
+ * NOTE(review): SLEEP and TIMEOUT are presumably in seconds -- confirm
+ * against the code that uses them.
+ */
+#define TD_VBD_EIO_RETRIES 10
+#define TD_VBD_EIO_SLEEP 1
+#define TD_VBD_WATCHDOG_TIMEOUT 10
+
+/* forward declarations; tapdisk_vbd_callback is the default vbd callback */
+static void tapdisk_vbd_ring_event(event_id_t, char, void *);
+static void tapdisk_vbd_callback(void *, blkif_response_t *);
+
+/*
+ * initialization
+ */
+
+/*
+ * Zero a vbd request and make its list node an empty, self-linked list.
+ */
+static inline void
+tapdisk_vbd_initialize_vreq(td_vbd_request_t *vreq)
+{
+	memset(vreq, 0, sizeof(*vreq));
+	INIT_LIST_HEAD(&vreq->next);
+}
+
+/*
+ * Destroy a vbd: free its driver stack, unlink it from whatever list it
+ * is on, then free its name and the object itself. NULL is accepted and
+ * ignored.
+ */
+void
+tapdisk_vbd_free(td_vbd_t *vbd)
+{
+	if (!vbd)
+		return;
+
+	tapdisk_vbd_free_stack(vbd);
+	list_del_init(&vbd->next);
+	free(vbd->name);
+	free(vbd);
+}
+
+/*
+ * Allocate and initialize a vbd with the given uuid.
+ *
+ * The new vbd has no minor and no ring descriptor (both -1), uses
+ * tapdisk_vbd_callback with itself as argument for completions, has all
+ * of its lists empty and every request slot initialized. Returns NULL
+ * when allocation fails.
+ */
+td_vbd_t*
+tapdisk_vbd_create(uint16_t uuid)
+{
+	td_vbd_t *vbd;
+	int slot;
+
+	vbd = calloc(1, sizeof(*vbd));
+	if (vbd == NULL) {
+		EPRINTF("failed to allocate tapdisk state\n");
+		return NULL;
+	}
+
+	vbd->uuid = uuid;
+	vbd->minor = -1;
+	vbd->ring.fd = -1;
+
+	/* default blktap ring completion */
+	vbd->callback = tapdisk_vbd_callback;
+	vbd->argument = vbd;
+
+#ifdef MEMSHR
+	memshr_vbd_initialize();
+#endif
+
+	INIT_LIST_HEAD(&vbd->driver_stack);
+	INIT_LIST_HEAD(&vbd->images);
+	INIT_LIST_HEAD(&vbd->new_requests);
+	INIT_LIST_HEAD(&vbd->pending_requests);
+	INIT_LIST_HEAD(&vbd->failed_requests);
+	INIT_LIST_HEAD(&vbd->completed_requests);
+	INIT_LIST_HEAD(&vbd->next);
+	gettimeofday(&vbd->ts, NULL);
+
+	for (slot = 0; slot < MAX_REQUESTS; slot++)
+		tapdisk_vbd_initialize_vreq(&vbd->request_list[slot]);
+
+	return vbd;
+}
+
+/*
+ * Create a vbd for 'uuid' and register it with the server.
+ *
+ * Returns 0 on success, -EEXIST when a vbd with this uuid is already
+ * registered, or -ENOMEM when the vbd cannot be allocated.
+ */
+int
+tapdisk_vbd_initialize(uint16_t uuid)
+{
+	td_vbd_t *vbd;
+
+	vbd = tapdisk_server_get_vbd(uuid);
+	if (vbd) {
+		EPRINTF("duplicate vbds! %u\n", uuid);
+		return -EEXIST;
+	}
+
+	vbd = tapdisk_vbd_create(uuid);
+	if (!vbd) {
+		/* never hand a NULL vbd to the server's list */
+		return -ENOMEM;
+	}
+
+	tapdisk_server_add_vbd(vbd);
+
+	return 0;
+}
+
+/*
+ * Replace the vbd's completion callback and the argument passed to it.
+ */
+void
+tapdisk_vbd_set_callback(td_vbd_t *vbd, td_vbd_cb_t callback, void *argument)
+{
+	vbd->argument = argument;
+	vbd->callback = callback;
+}
+
+/*
+ * Log the vbd's image chain and check every image against the image that
+ * follows it in the list.
+ *
+ * Returns 0 when every pair validates, otherwise the first error from
+ * td_validate_parent().
+ */
+static int
+tapdisk_vbd_validate_chain(td_vbd_t *vbd)
+{
+	int err;
+	td_image_t *image, *parent, *tmp;
+
+	DPRINTF("VBD CHAIN:\n");
+
+	tapdisk_vbd_for_each_image(vbd, image, tmp) {
+		DPRINTF("%s: %d\n", image->name, image->type);
+
+		if (!tapdisk_vbd_is_last_image(vbd, image)) {
+			parent = tapdisk_vbd_next_image(image);
+			err = td_validate_parent(image, parent);
+			if (err)
+				return err;
+		} else {
+			/* the last image has no parent to validate against */
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Close and free every image of the vbd, mark the vbd as closed, reset
+ * its image list to empty and free its driver stack.
+ */
+void
+tapdisk_vbd_close_vdi(td_vbd_t *vbd)
+{
+	td_image_t *image, *tmp;
+
+	tapdisk_vbd_for_each_image(vbd, image, tmp) {
+		td_close(image);
+		tapdisk_image_free(image);
+	}
+
+	td_flag_set(vbd->state, TD_VBD_CLOSED);
+	INIT_LIST_HEAD(&vbd->images);
+
+	tapdisk_vbd_free_stack(vbd);
+}
+
+/*
+ * Put a block-cache image in front of the first read-only, shareable
+ * image in the vbd's chain.
+ *
+ * Returns 0 when the cache was inserted or when no image qualifies, and a
+ * negative error code when the cache can neither be loaded nor opened.
+ * On failure only the new cache image is released: the target image is
+ * still part of the vbd's chain, so its cleanup is left to the caller.
+ */
+static int
+tapdisk_vbd_add_block_cache(td_vbd_t *vbd)
+{
+	int err;
+	td_image_t *cache, *image, *target, *tmp;
+
+	target = NULL;
+
+	tapdisk_vbd_for_each_image(vbd, image, tmp)
+		if (td_flag_test(image->flags, TD_OPEN_RDONLY) &&
+		    td_flag_test(image->flags, TD_OPEN_SHAREABLE)) {
+			target = image;
+			break;
+		}
+
+	if (!target)
+		return 0;
+
+	cache = tapdisk_image_allocate(target->name,
+				       DISK_TYPE_BLOCK_CACHE,
+				       target->storage,
+				       target->flags,
+				       target->private);
+	if (!cache)
+		return -ENOMEM;
+
+	/* try to load existing cache */
+	err = td_load(cache);
+	if (!err)
+		goto done;
+
+	/* hack driver to send open() correct image size */
+	if (!target->driver) {
+		err = -ENODEV;
+		goto fail;
+	}
+
+	cache->driver = tapdisk_driver_allocate(cache->type,
+						cache->name,
+						cache->flags,
+						cache->storage);
+	if (!cache->driver) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	cache->driver->info = target->driver->info;
+
+	/* try to open new cache */
+	err = td_open(cache);
+	if (!err)
+		goto done;
+
+fail:
+	/* give up: drop the cache we allocated, not the live target */
+	tapdisk_image_free(cache);
+	return err;
+
+done:
+	/* insert cache before image */
+	list_add(&cache->next, target->next.prev);
+	return 0;
+}
+
+/*
+ * Put a dirty-log image at the head of the vbd's image list.
+ *
+ * The log takes its name, storage and flags from the current first image
+ * and its driver starts with a copy of that image's driver info.
+ *
+ * Returns 0 on success, -ENOMEM when the image or its driver cannot be
+ * allocated, or the error from td_open(). The log image is freed on every
+ * failure path.
+ */
+static int
+tapdisk_vbd_add_dirty_log(td_vbd_t *vbd)
+{
+	int err;
+	td_driver_t *driver;
+	td_image_t *log, *parent;
+
+	parent = tapdisk_vbd_first_image(vbd);
+
+	log = tapdisk_image_allocate(parent->name,
+				     DISK_TYPE_LOG,
+				     parent->storage,
+				     parent->flags,
+				     vbd);
+	if (!log)
+		return -ENOMEM;
+
+	driver = tapdisk_driver_allocate(log->type,
+					 log->name,
+					 log->flags,
+					 log->storage);
+	if (!driver) {
+		tapdisk_image_free(log);
+		return -ENOMEM;
+	}
+
+	driver->info = parent->driver->info;
+	log->driver = driver;
+
+	err = td_open(log);
+	if (err) {
+		tapdisk_image_free(log);
+		return err;
+	}
+
+	list_add(&log->next, &vbd->images);
+	return 0;
+}
+
+/*
+ * Open one level of the driver stack: the image named by 'params' and,
+ * following parent ids, every ancestor it reports.
+ *
+ * vbd:         owning vbd (supplies storage type; passed to each image).
+ * head:        out, list of opened images, child first.
+ * params:      name of the top image of this level.
+ * driver_type: driver type of the top image.
+ * driver_info: disk info handed to __td_open() for every image.
+ * flags:       open flags for the top image; parents are additionally
+ *              opened read-only and shareable.
+ *
+ * Returns 0 on success. On failure every image opened so far is closed
+ * and freed, 'head' is left empty and the error code is returned.
+ */
+static int
+tapdisk_vbd_open_level(td_vbd_t *vbd, struct list_head *head,
+		       const char *params, int driver_type,
+		       td_disk_info_t *driver_info, td_flag_t flags)
+{
+	const char *name;
+	int type, err;
+	td_image_t *image;
+	td_disk_id_t id;
+
+	name = params;
+	id.name = NULL;
+	type = driver_type;
+	INIT_LIST_HEAD(head);
+
+	for (;;) {
+		err = -ENOMEM;
+		image = tapdisk_image_allocate(name, type,
+					       vbd->storage, flags, vbd);
+
+		/*
+		 * From the second pass on 'name' aliases id.name; the string
+		 * is released here and the pointer cleared so it cannot be
+		 * freed twice.
+		 */
+		free(id.name);
+		id.name = NULL;
+
+		if (!image)
+			goto out;
+
+
+		/* this breaks if a driver modifies its info within a layer */
+		err = __td_open(image, driver_info);
+		if (err)
+			goto out;
+
+		/* TODO: non-sink drivers that don't care about their child
+		 * currently return EINVAL. Could return TD_PARENT_OK or
+		 * TD_ANY_PARENT */
+
+		err = td_get_parent_id(image, &id);
+		if (err && (err != TD_NO_PARENT && err != -EINVAL)) {
+			td_close(image);
+			goto out;
+		}
+
+		/* add this image to the end of the list */
+		list_add_tail(&image->next, head);
+		image = NULL;
+
+		/* if the image does not have a parent we return the
+		 * list of images generated by this level of the stack */
+		if (err == TD_NO_PARENT || err == -EINVAL) {
+			err = 0;
+			goto out;
+		}
+
+		name = id.name;
+		type = id.drivertype;
+
+		flags |= (TD_OPEN_RDONLY | TD_OPEN_SHAREABLE);
+	}
+
+out:
+	if (err) {
+		if (image) {
+			td_close(image);
+			tapdisk_image_free(image);
+		}
+		/*
+		 * Unwind the images already queued. The entry must be taken
+		 * from head->next (the first element); &head->next is the
+		 * list head itself. NOTE(review): this loop relies on
+		 * tapdisk_image_free() unlinking the image -- confirm.
+		 */
+		while (!list_empty(head)) {
+			image = list_entry(head->next, td_image_t, next);
+			td_close(image);
+			tapdisk_image_free(image);
+		}
+	}
+
+	return err;
+}
+
+/*
+ * Open the full image chain described by the vbd's driver stack.
+ *
+ * Each entry of vbd->driver_stack is opened as one level and spliced into
+ * vbd->images; the first image on the resulting list supplies the disk
+ * info for the next level. Depending on vbd->flags a dirty log and a
+ * block cache are added, then the chain is validated.
+ *
+ * extra_flags: open flags OR-ed into the vbd's own flags (with
+ *              TD_OPEN_SHAREABLE masked out).
+ *
+ * Returns 0 on success and clears TD_VBD_CLOSED. Returns -ENOENT when the
+ * driver stack is empty; on any later error the vdi is closed again and
+ * the error is returned.
+ */
+static int
+__tapdisk_vbd_open_vdi(td_vbd_t *vbd, td_flag_t extra_flags)
+{
+	int err;
+	td_flag_t open_flags;
+	td_image_t *top;
+	td_vbd_driver_info_t *driver_info;
+	td_disk_info_t *parent_info = NULL;
+
+	if (list_empty(&vbd->driver_stack))
+		return -ENOENT;
+
+	open_flags = (vbd->flags & ~TD_OPEN_SHAREABLE) | extra_flags;
+
+	/* loop on each user specified driver.
+	 * NOTE: driver_info is in reverse order. That is, the first
+	 * item is the 'parent' or 'sink' driver */
+	list_for_each_entry(driver_info, &vbd->driver_stack, next) {
+		LIST_HEAD(level);
+
+		err = tapdisk_vbd_open_level(vbd, &level,
+					     driver_info->params,
+					     driver_info->type,
+					     parent_info, open_flags);
+		if (err)
+			goto fail;
+
+		/* merge the images of this level into the result stack */
+		list_splice(&level, &vbd->images);
+
+		/* the next level is opened against the first image's info */
+		top = tapdisk_vbd_first_image(vbd);
+		parent_info = &top->info;
+	}
+
+	if (td_flag_test(vbd->flags, TD_OPEN_LOG_DIRTY)) {
+		err = tapdisk_vbd_add_dirty_log(vbd);
+		if (err)
+			goto fail;
+	}
+
+	if (td_flag_test(vbd->flags, TD_OPEN_ADD_CACHE)) {
+		err = tapdisk_vbd_add_block_cache(vbd);
+		if (err)
+			goto fail;
+	}
+
+	err = tapdisk_vbd_validate_chain(vbd);
+	if (err)
+		goto fail;
+
+	td_flag_clear(vbd->state, TD_VBD_CLOSED);
+
+	return 0;
+
+fail:
+	tapdisk_vbd_close_vdi(vbd);
+	return err;
+}
+
+/*
+ * Populate vbd->driver_stack from @path.
+ *
+ * @path is a '|' separated list of driver strings.  Each token is parsed
+ * with tapdisk_parse_disk_type() and stored as a td_vbd_driver_info_t.
+ * Entries are pushed at the front of the list, so the last driver in
+ * @path becomes the first entry of the stack.
+ *
+ * Returns 0 on success.  On any failure the partially built stack is
+ * released with tapdisk_vbd_free_stack() and a non-zero error is
+ * returned.
+ */
+int
+tapdisk_vbd_parse_stack(td_vbd_t *vbd, const char *path)
+{
+	int err;
+	char *params, *driver_str, *saveptr;
+	td_vbd_driver_info_t *driver;
+
+	err = tapdisk_namedup(&params, path);
+	if (err)
+		return err;
+
+	/* tokenize params based on pipe '|'; strtok_r keeps the scan state
+	 * local so helpers called in the loop cannot disturb it */
+	driver_str = strtok_r(params, "|", &saveptr);
+	while (driver_str != NULL) {
+		const char *driver_path;
+		int type;
+
+		/* parse driver info and add to vbd */
+		driver = calloc(1, sizeof(*driver));
+		if (!driver) {
+			/* capture errno before logging can overwrite it */
+			err = errno ? -errno : -ENOMEM;
+			PERROR("malloc");
+			goto out;
+		}
+		INIT_LIST_HEAD(&driver->next);
+
+		err = tapdisk_parse_disk_type(driver_str, &driver_path, &type);
+		if (err) {
+			free(driver);
+			goto out;
+		}
+
+		driver->type   = type;
+		driver->params = strdup(driver_path);
+		if (!driver->params) {
+			err = -ENOMEM;
+			free(driver);
+			goto out;
+		}
+
+		/* build the list backwards as the last driver will be the
+		 * first driver to open in the stack */
+		list_add(&driver->next, &vbd->driver_stack);
+
+		/* get next driver string */
+		driver_str = strtok_r(NULL, "|", &saveptr);
+	}
+
+out:
+	free(params);
+	if (err)
+		tapdisk_vbd_free_stack(vbd);
+
+	return err;
+}
+
+/*
+ * Release every entry of vbd->driver_stack: each entry is unlinked, then
+ * its params string and the entry itself are freed.
+ */
+void
+tapdisk_vbd_free_stack(td_vbd_t *vbd)
+{
+	td_vbd_driver_info_t *entry, *following;
+
+	list_for_each_entry_safe(entry, following, &vbd->driver_stack, next) {
+		list_del(&entry->next);
+		free(entry->params);
+		free(entry);
+	}
+}
+
+/*
+ * Record @storage and @flags on the vbd and open its image chain.
+ * NOTE: driver type, etc. must be set (see tapdisk_vbd_parse_stack()).
+ *
+ * The open is attempted up to TD_VBD_EIO_RETRIES times, sleeping
+ * TD_VBD_EIO_SLEEP between attempts, for as long as it fails with -EIO.
+ * Returns the result of the last attempt.
+ */
+int
+tapdisk_vbd_open_stack(td_vbd_t *vbd, uint16_t storage, td_flag_t flags)
+{
+	int attempt = 0;
+	int err = 0;
+
+	vbd->flags   = flags;
+	vbd->storage = storage;
+
+	while (attempt < TD_VBD_EIO_RETRIES) {
+		err = __tapdisk_vbd_open_vdi(vbd, 0);
+		if (err != -EIO)
+			break;
+
+		sleep(TD_VBD_EIO_SLEEP);
+		attempt++;
+	}
+
+	return err;
+}
+
+/*
+ * Name the vbd after @path and open its image chain.
+ *
+ * Fails with -EINVAL if no driver is registered for @drivertype.  The
+ * open is retried up to TD_VBD_EIO_RETRIES times while it fails with
+ * -EIO.  On failure the copied name is released again and vbd->name is
+ * reset to NULL.
+ */
+int
+tapdisk_vbd_open_vdi(td_vbd_t *vbd, const char *path,
+		     uint16_t drivertype, uint16_t storage, td_flag_t flags)
+{
+	const struct tap_disk *ops = tapdisk_disk_drivers[drivertype];
+	int attempt, err;
+
+	if (!ops)
+		return -EINVAL;
+	DPRINTF("Loaded %s driver for vbd %u %s 0x%08x\n",
+		ops->disk_type, vbd->uuid, path, flags);
+
+	err = tapdisk_namedup(&vbd->name, path);
+	if (err)
+		return err;
+
+	vbd->flags   = flags;
+	vbd->storage = storage;
+
+	for (attempt = 0; attempt < TD_VBD_EIO_RETRIES; attempt++) {
+		err = __tapdisk_vbd_open_vdi(vbd, 0);
+		if (err != -EIO)
+			break;
+
+		sleep(TD_VBD_EIO_SLEEP);
+	}
+
+	if (!err)
+		return 0;
+
+	/* undo the name copy so the vbd is left unnamed on failure */
+	free(vbd->name);
+	vbd->name = NULL;
+	return err;
+}
+
+/*
+ * Register tapdisk_vbd_ring_event() as the read-poll handler for the
+ * ring file descriptor and remember the event id on the vbd.
+ * Returns 0 on success or the negative id reported by the server.
+ */
+static int
+tapdisk_vbd_register_event_watches(td_vbd_t *vbd)
+{
+	event_id_t event;
+
+	event = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+					      vbd->ring.fd, 0,
+					      tapdisk_vbd_ring_event, vbd);
+	if (event >= 0) {
+		vbd->ring_event_id = event;
+		return 0;
+	}
+
+	return event;
+}
+
+/*
+ * Unregister the ring event, if one was recorded on the vbd.
+ */
+static void
+tapdisk_vbd_unregister_events(td_vbd_t *vbd)
+{
+	if (!vbd->ring_event_id)
+		return;
+
+	tapdisk_server_unregister_event(vbd->ring_event_id);
+}
+
+/*
+ * Open the blktap device @devname and map its ring region.
+ *
+ * On success the ring descriptor holds the open fd, the mapping, the
+ * initialised back ring and the start address of the data area
+ * (BLKTAP_RING_PAGES pages into the mapping); the device is switched to
+ * BLKTAP_MODE_INTERPOSE.  On failure everything acquired here is
+ * released, fd/mem are reset to -1/NULL and -errno is returned.
+ */
+static int
+tapdisk_vbd_map_device(td_vbd_t *vbd, const char *devname)
+{
+	td_ring_t *ring = &vbd->ring;
+	int psize = getpagesize();
+	int err;
+
+	ring->fd = open(devname, O_RDWR);
+	if (ring->fd == -1) {
+		err = -errno;
+		EPRINTF("failed to open %s: %d\n", devname, err);
+		goto undo;
+	}
+
+	ring->mem = mmap(0, psize * BLKTAP_MMAP_REGION_SIZE,
+			 PROT_READ | PROT_WRITE, MAP_SHARED, ring->fd, 0);
+	if (ring->mem == MAP_FAILED) {
+		err = -errno;
+		EPRINTF("failed to mmap %s: %d\n", devname, err);
+		goto undo;
+	}
+
+	/* the shared ring sits at the very start of the mapping */
+	ring->sring = (blkif_sring_t *)((unsigned long)ring->mem);
+	BACK_RING_INIT(&ring->fe_ring, ring->sring, psize);
+
+	/* request data pages follow the ring pages */
+	ring->vstart =
+		(unsigned long)ring->mem + (BLKTAP_RING_PAGES * psize);
+
+	ioctl(ring->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE);
+
+	return 0;
+
+undo:
+	if (ring->mem && ring->mem != MAP_FAILED)
+		munmap(ring->mem, psize * BLKTAP_MMAP_REGION_SIZE);
+	if (ring->fd != -1)
+		close(ring->fd);
+	ring->fd  = -1;
+	ring->mem = NULL;
+	return err;
+}
+
+/*
+ * Undo tapdisk_vbd_map_device(): unmap the ring region and close the
+ * device.
+ *
+ * The descriptor is reset afterwards (fd = -1, mem = sring = NULL) so a
+ * second call is a no-op.  This matters because a failed
+ * tapdisk_vbd_attach() detaches once and the error path of
+ * tapdisk_vbd_open() detaches again; without the reset that sequence
+ * closes the fd and unmaps the region twice.  Clearing sring also makes
+ * the "!ring->sring" guards elsewhere in this file see the ring as gone.
+ *
+ * Always returns 0.
+ */
+static int
+tapdisk_vbd_unmap_device(td_vbd_t *vbd)
+{
+	td_ring_t *ring = &vbd->ring;
+	int psize = getpagesize();
+
+	/* pointers are compared for (in)equality only, never with '>' */
+	if (ring->mem != NULL && ring->mem != MAP_FAILED)
+		munmap(ring->mem, psize * BLKTAP_MMAP_REGION_SIZE);
+	if (ring->fd != -1)
+		close(ring->fd);
+
+	ring->mem   = NULL;
+	ring->sring = NULL;
+	ring->fd    = -1;
+
+	return 0;
+}
+
+/*
+ * Detach the vbd from its blktap device: stop watching the ring,
+ * release the device mapping and forget the minor number.
+ */
+void
+tapdisk_vbd_detach(td_vbd_t *vbd)
+{
+	tapdisk_vbd_unregister_events(vbd);
+	(void)tapdisk_vbd_unmap_device(vbd);
+
+	vbd->minor = -1;
+}
+
+
+/*
+ * Attach the vbd to the blktap device @devname: map the device, register
+ * the ring event watch and record @minor.  If either step fails the vbd
+ * is detached again and the error is returned.
+ */
+int
+tapdisk_vbd_attach(td_vbd_t *vbd, const char *devname, int minor)
+{
+	int err = tapdisk_vbd_map_device(vbd, devname);
+
+	if (!err)
+		err = tapdisk_vbd_register_event_watches(vbd);
+
+	if (err) {
+		tapdisk_vbd_detach(vbd);
+		return err;
+	}
+
+	vbd->minor = minor;
+	return 0;
+}
+
+/*
+ * Open the vbd's driver stack and attach it to the ring device @ring.
+ *
+ * @name and @type are accepted but not used by this function.  On any
+ * failure the vbd is detached, its vdi closed and vbd->name released.
+ */
+int
+tapdisk_vbd_open(td_vbd_t *vbd, const char *name, uint16_t type,
+		 uint16_t storage, int minor, const char *ring, td_flag_t flags)
+{
+	int err = tapdisk_vbd_open_stack(vbd, storage, flags);
+
+	if (!err)
+		err = tapdisk_vbd_attach(vbd, ring, minor);
+	if (!err)
+		return 0;
+
+	tapdisk_vbd_detach(vbd);
+	tapdisk_vbd_close_vdi(vbd);
+	free(vbd->name);
+	vbd->name = NULL;
+	return err;
+}
+
+/* Count the requests currently linked on @list. */
+static int
+tapdisk_vbd_list_length(struct list_head *list)
+{
+	int count = 0;
+	td_vbd_request_t *vreq, *tvreq;
+
+	tapdisk_vbd_for_each_request(vreq, tvreq, list)
+		count++;
+
+	return count;
+}
+
+/*
+ * Report how many requests sit on each of the vbd's four request queues.
+ */
+static void
+tapdisk_vbd_queue_count(td_vbd_t *vbd, int *new,
+			int *pending, int *failed, int *completed)
+{
+	int n = tapdisk_vbd_list_length(&vbd->new_requests);
+	int p = tapdisk_vbd_list_length(&vbd->pending_requests);
+	int f = tapdisk_vbd_list_length(&vbd->failed_requests);
+	int c = tapdisk_vbd_list_length(&vbd->completed_requests);
+
+	*new       = n;
+	*pending   = p;
+	*failed    = f;
+	*completed = c;
+}
+
+/*
+ * Tear the vbd down for good.
+ *
+ * Refuses with -EAGAIN while requests are still pending.  Otherwise
+ * pushes outstanding responses, logs the final queue and counter state,
+ * closes the vdi, detaches from the device, removes the vbd from the
+ * server and frees it.  The vbd must not be touched after a 0 return.
+ */
+static int
+tapdisk_vbd_shutdown(td_vbd_t *vbd)
+{
+	int n_new, n_pending, n_failed, n_completed;
+
+	if (!list_empty(&vbd->pending_requests))
+		return -EAGAIN;
+
+	tapdisk_vbd_kick(vbd);
+	tapdisk_vbd_queue_count(vbd, &n_new, &n_pending,
+				&n_failed, &n_completed);
+
+	DPRINTF("%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, "
+		"failed: 0x%02x, completed: 0x%02x\n",
+		vbd->name, vbd->state,
+		n_new, n_pending, n_failed, n_completed);
+	DPRINTF("last activity: %010ld.%06lld, errors: 0x%04"PRIx64", "
+		"retries: 0x%04"PRIx64", received: 0x%08"PRIx64", "
+		"returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n",
+		vbd->ts.tv_sec, (unsigned long long)vbd->ts.tv_usec,
+		vbd->errors, vbd->retries, vbd->received, vbd->returned,
+		vbd->kicked);
+
+	tapdisk_vbd_close_vdi(vbd);
+	tapdisk_vbd_detach(vbd);
+	tapdisk_server_remove_vbd(vbd);
+	tapdisk_vbd_free(vbd);
+
+	tlog_print_errors();
+
+	return 0;
+}
+
+/*
+ * Close the vbd if it is idle.
+ *
+ * The vbd counts as busy while requests are pending in the aio layer,
+ * or while the queue is still ready and any of the new/failed/completed
+ * queues holds requests.  A busy vbd is only flagged with
+ * TD_VBD_SHUTDOWN_REQUESTED and -EAGAIN is returned; otherwise the
+ * result of tapdisk_vbd_shutdown() is returned.
+ */
+int
+tapdisk_vbd_close(td_vbd_t *vbd)
+{
+	int busy = !list_empty(&vbd->pending_requests);
+
+	/* an active queue gets the chance to drain before closing */
+	if (!busy && tapdisk_vbd_queue_ready(vbd))
+		busy = (!list_empty(&vbd->new_requests) ||
+			!list_empty(&vbd->failed_requests) ||
+			!list_empty(&vbd->completed_requests));
+
+	if (!busy)
+		return tapdisk_vbd_shutdown(vbd);
+
+	td_flag_set(vbd->state, TD_VBD_SHUTDOWN_REQUESTED);
+	DBG(TLOG_WARN, "%s: requests pending\n", vbd->name);
+	return -EAGAIN;
+}
+
+/*
+ * control operations
+ */
+
+/*
+ * Log the vbd's state word, queue lengths and counters at warning level,
+ * then ask every image in the chain to dump its own debug state.
+ */
+void
+tapdisk_vbd_debug(td_vbd_t *vbd)
+{
+	int n_new, n_pending, n_failed, n_completed;
+	td_image_t *image, *following;
+
+	tapdisk_vbd_queue_count(vbd, &n_new, &n_pending,
+				&n_failed, &n_completed);
+
+	DBG(TLOG_WARN, "%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, "
+	    "failed: 0x%02x, completed: 0x%02x, last activity: %010ld.%06lld, "
+	    "errors: 0x%04"PRIx64", retries: 0x%04"PRIx64", received: 0x%08"PRIx64", "
+	    "returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n",
+	    vbd->name, vbd->state,
+	    n_new, n_pending, n_failed, n_completed,
+	    vbd->ts.tv_sec, (unsigned long long)vbd->ts.tv_usec,
+	    vbd->errors, vbd->retries,
+	    vbd->received, vbd->returned, vbd->kicked);
+
+	tapdisk_vbd_for_each_image(vbd, image, following)
+		td_debug(image);
+}
+
+/*
+ * Dump the vbd debug state and flush the log, at most once per vbd:
+ * TD_VBD_LOG_DROPPED records that it already happened.
+ */
+static void
+tapdisk_vbd_drop_log(td_vbd_t *vbd)
+{
+	if (!td_flag_test(vbd->state, TD_VBD_LOG_DROPPED)) {
+		tapdisk_vbd_debug(vbd);
+		tlog_flush();
+		td_flag_set(vbd->state, TD_VBD_LOG_DROPPED);
+	}
+}
+
+/*
+ * Fill @img with the size, sector size and info word of the first image
+ * in the chain.  @img is zeroed first; -EINVAL is returned when the vbd
+ * has no images.
+ */
+int
+tapdisk_vbd_get_image_info(td_vbd_t *vbd, image_t *img)
+{
+	td_image_t *top;
+
+	memset(img, 0, sizeof(image_t));
+
+	if (list_empty(&vbd->images))
+		return -EINVAL;
+
+	top = tapdisk_vbd_first_image(vbd);
+
+	img->size    = top->info.size;
+	img->secsize = top->info.sector_size;
+	img->info    = top->info.info;
+
+	return 0;
+}
+
+/*
+ * Tell whether the vbd may issue requests: returns 1 unless the vbd is
+ * dead, closed, quiesced or has a quiesce request outstanding, in which
+ * case 0 is returned.
+ */
+int
+tapdisk_vbd_queue_ready(td_vbd_t *vbd)
+{
+	if (td_flag_test(vbd->state, TD_VBD_DEAD))
+		return 0;
+	if (td_flag_test(vbd->state, TD_VBD_CLOSED))
+		return 0;
+	if (td_flag_test(vbd->state, TD_VBD_QUIESCED))
+		return 0;
+	if (td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Non-zero while TD_VBD_RETRY_NEEDED is set in the vbd state.
+ */
+int
+tapdisk_vbd_retry_needed(td_vbd_t *vbd)
+{
+	return td_flag_test(vbd->state,
+			    TD_VBD_RETRY_NEEDED);
+}
+
+/*
+ * Locking hook; currently a stub that always reports success.
+ */
+int
+tapdisk_vbd_lock(td_vbd_t *vbd)
+{
+	(void)vbd;
+
+	return 0;
+}
+
+/*
+ * Quiesce the request queue.
+ *
+ * With nothing pending the vbd becomes TD_VBD_QUIESCED right away and 0
+ * is returned.  Otherwise TD_VBD_QUIESCE_REQUESTED is recorded and
+ * -EAGAIN returned.
+ */
+int
+tapdisk_vbd_quiesce_queue(td_vbd_t *vbd)
+{
+	if (list_empty(&vbd->pending_requests)) {
+		td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED);
+		td_flag_set(vbd->state, TD_VBD_QUIESCED);
+		return 0;
+	}
+
+	td_flag_set(vbd->state, TD_VBD_QUIESCE_REQUESTED);
+	return -EAGAIN;
+}
+
+/*
+ * Re-enable the request queue by dropping both quiesce flags.
+ * Always returns 0.
+ */
+int
+tapdisk_vbd_start_queue(td_vbd_t *vbd)
+{
+	td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED);
+	td_flag_clear(vbd->state, TD_VBD_QUIESCED);
+
+	return 0;
+}
+
+/*
+ * Quiesce the queue (best effort, result ignored) and mark the vbd
+ * dead.  Always returns 0.
+ */
+int
+tapdisk_vbd_kill_queue(td_vbd_t *vbd)
+{
+	(void)tapdisk_vbd_quiesce_queue(vbd);
+	td_flag_set(vbd->state, TD_VBD_DEAD);
+
+	return 0;
+}
+
+/*
+ * Open a single image and, unless it is the last one in the chain,
+ * validate it against the next image (its parent).  The image is closed
+ * again when validation fails.
+ */
+static int
+tapdisk_vbd_open_image(td_vbd_t *vbd, td_image_t *image)
+{
+	int err = td_open(image);
+
+	if (err)
+		return err;
+
+	/* the last image has no parent to validate against */
+	if (tapdisk_vbd_is_last_image(vbd, image))
+		return 0;
+
+	err = td_validate_parent(image, tapdisk_vbd_next_image(image));
+	if (err)
+		td_close(image);
+
+	return err;
+}
+
+/*
+ * Close @image and open it again, retrying up to TD_VBD_EIO_RETRIES
+ * times while the open fails with -EIO.  If the reopen ultimately fails
+ * the vbd is flagged TD_VBD_CLOSED.
+ */
+static int
+tapdisk_vbd_close_and_reopen_image(td_vbd_t *vbd, td_image_t *image)
+{
+	int attempt = 0;
+	int err = 0;
+
+	td_close(image);
+
+	while (attempt < TD_VBD_EIO_RETRIES) {
+		err = tapdisk_vbd_open_image(vbd, image);
+		if (err != -EIO)
+			break;
+
+		sleep(TD_VBD_EIO_SLEEP);
+		attempt++;
+	}
+
+	if (err)
+		td_flag_set(vbd->state, TD_VBD_CLOSED);
+
+	return err;
+}
+
+/*
+ * Pause the vbd.
+ *
+ * TD_VBD_PAUSE_REQUESTED is always recorded.  If the queue cannot be
+ * quiesced yet, that error is returned and the request stays flagged;
+ * otherwise the vdi is closed and the vbd becomes TD_VBD_PAUSED.
+ */
+int
+tapdisk_vbd_pause(td_vbd_t *vbd)
+{
+	int err;
+
+	td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED);
+
+	err = tapdisk_vbd_quiesce_queue(vbd);
+	if (!err) {
+		tapdisk_vbd_close_vdi(vbd);
+
+		td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
+		td_flag_set(vbd->state, TD_VBD_PAUSED);
+	}
+
+	return err;
+}
+
+/*
+ * Resume a paused vbd, optionally under a new name.
+ *
+ * @path, when non-NULL, replaces vbd->name.  The copy is made before the
+ * old name is released, so a failed allocation leaves the previous name
+ * intact and @path may safely alias vbd->name.  @drivertype is accepted
+ * but not used here.
+ *
+ * The vdi is reopened with TD_OPEN_STRICT, retrying up to
+ * TD_VBD_EIO_RETRIES times while the open fails with -EIO.  On success
+ * the queue is restarted, the pause flags are cleared and queued work is
+ * processed.
+ *
+ * Returns 0 on success, -EINVAL if the vbd is not paused or the name
+ * copy fails, otherwise the error from reopening the vdi.
+ */
+int
+tapdisk_vbd_resume(td_vbd_t *vbd, const char *path, uint16_t drivertype)
+{
+	int i, err = 0;
+
+	if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) {
+		EPRINTF("resume request for unpaused vbd %s\n", vbd->name);
+		return -EINVAL;
+	}
+
+	if (path) {
+		char *name = strdup(path);
+
+		if (!name) {
+			EPRINTF("copying new vbd %s name failed\n", path);
+			return -EINVAL;
+		}
+
+		free(vbd->name);
+		vbd->name = name;
+	}
+
+	for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
+		err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT);
+		if (err != -EIO)
+			break;
+
+		sleep(TD_VBD_EIO_SLEEP);
+	}
+
+	if (err)
+		return err;
+
+	tapdisk_vbd_start_queue(vbd);
+	td_flag_clear(vbd->state, TD_VBD_PAUSED);
+	td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
+	tapdisk_vbd_check_state(vbd);
+
+	return 0;
+}
+
+/*
+ * Process queued work and push any privately produced responses to the
+ * frontend.
+ *
+ * Returns the number of responses pushed; 0 when the ring is not mapped
+ * or there is nothing new to publish.
+ */
+int
+tapdisk_vbd_kick(td_vbd_t *vbd)
+{
+	td_ring_t *ring;
+	int unpushed;
+
+	tapdisk_vbd_check_state(vbd);
+
+	ring = &vbd->ring;
+	if (!ring->sring)
+		return 0;
+
+	/* responses written to the ring but not yet made visible */
+	unpushed = (ring->fe_ring.rsp_prod_pvt - ring->fe_ring.sring->rsp_prod);
+	if (!unpushed)
+		return 0;
+
+	vbd->kicked += unpushed;
+	RING_PUSH_RESPONSES(&ring->fe_ring);
+	ioctl(ring->fd, BLKTAP_IOCTL_KICK_FE, 0);
+
+	DBG(TLOG_INFO, "kicking %d: rec: 0x%08"PRIx64", ret: 0x%08"PRIx64", kicked: "
+	    "0x%08"PRIx64"\n", unpushed, vbd->received, vbd->returned, vbd->kicked);
+
+	return unpushed;
+}
+
+/*
+ * Copy @rsp into the next private response slot of the ring and advance
+ * the private producer index.  The response only becomes visible to the
+ * frontend once the responses are pushed (see tapdisk_vbd_kick()).
+ */
+static inline void
+tapdisk_vbd_write_response_to_ring(td_vbd_t *vbd, blkif_response_t *rsp)
+{
+	td_ring_t *ring = &vbd->ring;
+	blkif_response_t *slot;
+
+	slot = RING_GET_RESPONSE(&ring->fe_ring, ring->fe_ring.rsp_prod_pvt);
+	memcpy(slot, rsp, sizeof(blkif_response_t));
+	ring->fe_ring.rsp_prod_pvt++;
+}
+
+/*
+ * Response callback that writes @rsp to the ring of the vbd passed as
+ * @arg.
+ */
+static void
+tapdisk_vbd_callback(void *arg, blkif_response_t *rsp)
+{
+	td_vbd_t *vbd = arg;
+
+	tapdisk_vbd_write_response_to_ring(vbd, rsp);
+}
+
+static void
+tapdisk_vbd_make_response(td_vbd_t *vbd, td_vbd_request_t *vreq)
+{ /* Turn a finished request into a blkif response and hand it to the
+ * vbd's response callback.  The response is built in place, on top of
+ * the storage of vreq->req, so the statement order below matters. */
+ blkif_request_t tmp;
+ blkif_response_t *rsp;
+
+ tmp = vreq->req; /* keep a copy: rsp aliases vreq->req from here on */
+ rsp = (blkif_response_t *)&vreq->req;
+
+ rsp->id = tmp.id; /* read from the copy; the original is being overwritten */
+ rsp->operation = tmp.operation;
+ rsp->status = vreq->status;
+
+ DBG(TLOG_DBG, "writing req %d, sec 0x%08"PRIx64", res %d to ring\n",
+ (int)tmp.id, tmp.sector_number, vreq->status);
+
+ if (rsp->status != BLKIF_RSP_OKAY)
+ ERR(EIO, "returning BLKIF_RSP %d", rsp->status);
+
+ vbd->returned++; /* counts responses handed to the callback */
+ vbd->callback(vbd->argument, rsp);
+}
+
+void
+tapdisk_vbd_check_state(td_vbd_t *vbd)
+{ /* Drive the vbd state machine once: give up on over-retried requests,
+ * issue queued work, turn completed requests into responses, then act
+ * on any deferred quiesce / pause / shutdown request. */
+ td_vbd_request_t *vreq, *tmp;
+
+ tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests)
+ if (vreq->num_retries >= TD_VBD_MAX_RETRIES) /* retry budget used up */
+ tapdisk_vbd_complete_vbd_request(vbd, vreq);
+
+ if (!list_empty(&vbd->new_requests) ||
+ !list_empty(&vbd->failed_requests))
+ tapdisk_vbd_issue_requests(vbd);
+
+ tapdisk_vbd_for_each_request(vreq, tmp, &vbd->completed_requests) {
+ tapdisk_vbd_make_response(vbd, vreq);
+ list_del(&vreq->next);
+ tapdisk_vbd_initialize_vreq(vreq); /* make the slot reusable */
+ }
+
+ if (td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED))
+ tapdisk_vbd_quiesce_queue(vbd);
+
+ if (td_flag_test(vbd->state, TD_VBD_PAUSE_REQUESTED))
+ tapdisk_vbd_pause(vbd);
+
+ if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED))
+ tapdisk_vbd_close(vbd); /* NOTE(review): a successful close ends in
+ * tapdisk_vbd_shutdown(), which frees vbd - callers must not
+ * assume vbd is still valid afterwards; confirm */
+}
+
+/*
+ * Watchdog for stalled I/O.
+ *
+ * Does nothing without pending requests.  Otherwise compares the current
+ * time with the vbd's last-activity timestamp: once the idle time reaches
+ * TD_VBD_WATCHDOG_TIMEOUT a warning is logged and the debug log dropped;
+ * before that, the server's maximum timeout is set to the time remaining.
+ */
+void
+tapdisk_vbd_check_progress(td_vbd_t *vbd)
+{
+	struct timeval now;
+	int idle;
+
+	if (list_empty(&vbd->pending_requests))
+		return;
+
+	gettimeofday(&now, NULL);
+	idle = now.tv_sec - vbd->ts.tv_sec;
+
+	if (idle < TD_VBD_WATCHDOG_TIMEOUT) {
+		tapdisk_server_set_max_timeout(TD_VBD_WATCHDOG_TIMEOUT - idle);
+		return;
+	}
+
+	DBG(TLOG_WARN, "%s: watchdog timeout: pending requests "
+	    "idle for %d seconds\n", vbd->name, idle);
+	tapdisk_vbd_drop_log(vbd);
+}
+
+/*
+ * request submission
+ */
+
+/*
+ * Check that the queue can accept requests and, on the first call that
+ * gets this far, reopen the first image with TD_OPEN_STRICT.
+ *
+ * Returns -ENOSYS without images, -EAGAIN if the queue is not ready, the
+ * lock error if locking is requested and fails, and 0 otherwise (even if
+ * the reopen itself failed, which is only logged).
+ */
+static int
+tapdisk_vbd_check_queue(td_vbd_t *vbd)
+{
+	td_image_t *top;
+	int err;
+
+	if (list_empty(&vbd->images))
+		return -ENOSYS;
+
+	if (!tapdisk_vbd_queue_ready(vbd))
+		return -EAGAIN;
+
+	/* the strict reopen is done only once */
+	if (vbd->reopened)
+		return 0;
+
+	if (td_flag_test(vbd->state, TD_VBD_LOCKING)) {
+		err = tapdisk_vbd_lock(vbd);
+		if (err)
+			return err;
+	}
+
+	top = tapdisk_vbd_first_image(vbd);
+	td_flag_set(top->flags, TD_OPEN_STRICT);
+
+	if (tapdisk_vbd_close_and_reopen_image(vbd, top)) {
+		EPRINTF("reopening disks failed\n");
+	} else {
+		DPRINTF("reopening disks succeeded\n");
+		vbd->reopened = 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Retire a request once nothing is outstanding for it.
+ *
+ * While the request is still being submitted or has sectors pending,
+ * nothing happens.  Otherwise a failed request that may still be retried
+ * (retry budget left, vbd neither dead nor shutting down) moves to the
+ * failed queue; every other request moves to the completed queue.
+ */
+void
+tapdisk_vbd_complete_vbd_request(td_vbd_t *vbd, td_vbd_request_t *vreq)
+{
+	struct list_head *dest;
+
+	if (vreq->submitting || vreq->secs_pending)
+		return;
+
+	if (vreq->status == BLKIF_RSP_ERROR &&
+	    vreq->num_retries < TD_VBD_MAX_RETRIES &&
+	    !td_flag_test(vbd->state, TD_VBD_DEAD) &&
+	    !td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED))
+		dest = &vbd->failed_requests;
+	else
+		dest = &vbd->completed_requests;
+
+	tapdisk_vbd_move_request(vreq, dest);
+}
+
+/*
+ * Compute the absolute start sector of segment treq.sidx of @breq: the
+ * request's first sector plus the sector counts of all earlier segments.
+ */
+static uint64_t
+tapdisk_vbd_breq_get_sector(blkif_request_t *breq, td_request_t treq)
+{
+	uint64_t sector = breq->sector_number;
+	int seg = 0;
+
+	while (seg < treq.sidx) {
+		int nsects = breq->seg[seg].last_sect -
+			     breq->seg[seg].first_sect + 1;
+
+		sector += nsects;
+		seg++;
+	}
+
+	return sector;
+}
+
+static void
+__tapdisk_vbd_complete_td_request(td_vbd_t *vbd, td_vbd_request_t *vreq,
+ td_request_t treq, int res)
+{ /* Account for the completion of one segment request (treq) that
+ * belongs to vreq, record any error on vreq, and retire vreq once
+ * nothing is outstanding for it. */
+ int err;
+ td_image_t *image = treq.image;
+
+ err = (res <= 0 ? res : -res); /* normalise to zero or a negative errno */
+ vbd->secs_pending -= treq.secs;
+ vreq->secs_pending -= treq.secs;
+
+ vreq->blocked = treq.blocked;
+
+ if (err) {
+ vreq->status = BLKIF_RSP_ERROR;
+ vreq->error = (vreq->error ? : err); /* keep the first error seen */
+ if (err != -EBUSY) { /* -EBUSY is neither counted nor logged */
+ vbd->errors++;
+ ERR(err, "req %"PRIu64": %s 0x%04x secs to "
+ "0x%08"PRIx64, vreq->req.id,
+ (treq.op == TD_OP_WRITE ? "write" : "read"),
+ treq.secs, treq.sec);
+ }
+ } else {
+#ifdef MEMSHR
+ if (treq.op == TD_OP_READ
+ && td_flag_test(image->flags, TD_OPEN_RDONLY)) { /* successful read
+ * from a read-only image: report it to memshr if a handle was set */
+ share_tuple_t hnd = treq.memshr_hnd;
+ uint16_t uid = image->memshr_id;
+ blkif_request_t *breq = &vreq->req;
+ uint64_t sec = tapdisk_vbd_breq_get_sector(breq, treq);
+ int secs = breq->seg[treq.sidx].last_sect -
+ breq->seg[treq.sidx].first_sect + 1;
+
+ if (hnd.handle != 0)
+ memshr_vbd_complete_ro_request(hnd, uid,
+ sec, secs);
+ }
+#endif
+ }
+
+ tapdisk_vbd_complete_vbd_request(vbd, vreq);
+}
+
+static void
+__tapdisk_vbd_reissue_td_request(td_vbd_t *vbd,
+ td_image_t *image, td_request_t treq)
+{ /* Forward a segment request from @image to the next image in the chain
+ * (its parent).  Requests that reach the end of the chain, or the part
+ * of a request lying beyond the end of the parent, are completed
+ * successfully with a zero-filled buffer. */
+ td_image_t *parent;
+ td_vbd_request_t *vreq;
+
+ vreq = (td_vbd_request_t *)treq.private;
+ gettimeofday(&vreq->last_try, NULL);
+
+ vreq->submitting++; /* keeps vreq from being retired while we submit */
+
+ if (tapdisk_vbd_is_last_image(vbd, image)) { /* no parent left to ask */
+ memset(treq.buf, 0, treq.secs << SECTOR_SHIFT);
+ td_complete_request(treq, 0);
+ goto done;
+ }
+
+ parent = tapdisk_vbd_next_image(image);
+ treq.image = parent;
+
+ /* return zeros for requests that extend beyond end of parent image */
+ if (treq.sec + treq.secs > parent->info.size) {
+ td_request_t clone = treq; /* clone describes the out-of-range part */
+
+ if (parent->info.size > treq.sec) { /* request straddles the end */
+ int secs = parent->info.size - treq.sec;
+ clone.sec += secs;
+ clone.secs -= secs;
+ clone.buf += (secs << SECTOR_SHIFT);
+ treq.secs = secs; /* treq keeps only the in-range part */
+ } else
+ treq.secs = 0; /* request lies entirely beyond the parent */
+
+ memset(clone.buf, 0, clone.secs << SECTOR_SHIFT);
+ td_complete_request(clone, 0);
+
+ if (!treq.secs)
+ goto done;
+ }
+
+ switch (treq.op) {
+ case TD_OP_WRITE:
+ td_queue_write(parent, treq);
+ break;
+
+ case TD_OP_READ:
+#ifdef MEMSHR
+ if(td_flag_test(parent->flags, TD_OPEN_RDONLY)) {
+ int ret, seg = treq.sidx;
+ blkif_request_t *breq = &vreq->req;
+
+ ret = memshr_vbd_issue_ro_request(treq.buf,
+ breq->seg[seg].gref,
+ parent->memshr_id,
+ treq.sec,
+ treq.secs,
+ &treq.memshr_hnd);
+ if(ret == 0) {
+ /* Reset memshr handle. This'll prevent
+ * memshr_vbd_complete_ro_request being called
+ */
+ treq.memshr_hnd.handle = 0;
+ td_complete_request(treq, 0);
+ } else
+ td_queue_read(parent, treq);
+ } else
+#endif
+ td_queue_read(parent, treq);
+ break;
+ }
+
+done:
+ vreq->submitting--;
+ if (!vreq->secs_pending) /* everything finished synchronously */
+ tapdisk_vbd_complete_vbd_request(vbd, vreq);
+}
+
+/*
+ * Pass a segment request on to the next image in the chain.
+ *
+ * Refreshes the vbd's activity timestamp.  If the queue is no longer
+ * ready the request is completed with -EIO instead of being forwarded.
+ */
+void
+tapdisk_vbd_forward_request(td_request_t treq)
+{
+	td_image_t *image = treq.image;
+	td_vbd_t *vbd = (td_vbd_t *)image->private;
+	td_vbd_request_t *vreq = (td_vbd_request_t *)treq.private;
+
+	gettimeofday(&vbd->ts, NULL);
+
+	if (!tapdisk_vbd_queue_ready(vbd)) {
+		__tapdisk_vbd_complete_td_request(vbd, vreq, treq, -EIO);
+		return;
+	}
+
+	__tapdisk_vbd_reissue_td_request(vbd, image, treq);
+}
+
+/*
+ * Completion callback installed on every segment request: refreshes the
+ * vbd's activity timestamp, logs the result and hands the completion to
+ * __tapdisk_vbd_complete_td_request().
+ */
+static void
+tapdisk_vbd_complete_td_request(td_request_t treq, int res)
+{
+	td_image_t *image = treq.image;
+	td_vbd_t *vbd = (td_vbd_t *)image->private;
+	td_vbd_request_t *vreq = (td_vbd_request_t *)treq.private;
+
+	gettimeofday(&vbd->ts, NULL);
+	DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" "
+	    "secs 0x%04x buf %p op %d res %d\n", image->name,
+	    (int)treq.id, treq.sidx, treq.sec, treq.secs,
+	    treq.buf, (int)vreq->req.operation, res);
+
+	__tapdisk_vbd_complete_td_request(vbd, vreq, treq, res);
+}
+
+static int
+tapdisk_vbd_issue_request(td_vbd_t *vbd, td_vbd_request_t *vreq)
+{ /* Submit one ring request to the first image of the chain, one
+ * td_request_t per segment.  Returns 0, or an error if validation
+ * failed or the request already finished with an error. */
+ char *page;
+ td_ring_t *ring;
+ td_image_t *image;
+ td_request_t treq;
+ uint64_t sector_nr;
+ blkif_request_t *req;
+ int i, err, id, nsects;
+
+ req = &vreq->req;
+ id = req->id;
+ ring = &vbd->ring;
+ sector_nr = req->sector_number;
+ image = tapdisk_vbd_first_image(vbd);
+
+ vreq->submitting = 1; /* blocks retirement until the 'out' label */
+ gettimeofday(&vbd->ts, NULL);
+ gettimeofday(&vreq->last_try, NULL);
+ tapdisk_vbd_move_request(vreq, &vbd->pending_requests);
+
+#if 0
+ err = tapdisk_vbd_check_queue(vbd);
+ if (err)
+ goto fail;
+#endif
+
+ err = tapdisk_image_check_ring_request(image, req);
+ if (err)
+ goto fail;
+
+ for (i = 0; i < req->nr_segments; i++) {
+ nsects = req->seg[i].last_sect - req->seg[i].first_sect + 1;
+ page = (char *)MMAP_VADDR(ring->vstart,
+ (unsigned long)req->id, i); /* data page of this request/segment */
+ page += (req->seg[i].first_sect << SECTOR_SHIFT);
+
+ treq.id = id;
+ treq.sidx = i;
+ treq.blocked = 0;
+ treq.buf = page;
+ treq.sec = sector_nr;
+ treq.secs = nsects;
+ treq.image = image;
+ treq.cb = tapdisk_vbd_complete_td_request;
+ treq.cb_data = NULL;
+ treq.private = vreq;
+
+ DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" secs 0x%04x "
+ "buf %p op %d\n", image->name, id, i, treq.sec, treq.secs,
+ treq.buf, (int)req->operation);
+
+ vreq->secs_pending += nsects; /* undone by the completion callback */
+ vbd->secs_pending += nsects;
+
+ switch (req->operation) { /* NOTE(review): any other operation queues
+ * nothing but still counts as pending - presumably rejected by
+ * tapdisk_image_check_ring_request(); confirm */
+ case BLKIF_OP_WRITE:
+ treq.op = TD_OP_WRITE;
+ td_queue_write(image, treq);
+ break;
+
+ case BLKIF_OP_READ:
+ treq.op = TD_OP_READ;
+ td_queue_read(image, treq);
+ break;
+ }
+
+ sector_nr += nsects; /* segments cover consecutive sectors */
+ }
+
+ err = 0;
+
+out:
+ vreq->submitting--;
+ if (!vreq->secs_pending) { /* nothing left in flight: retire now */
+ err = (err ? : vreq->error);
+ tapdisk_vbd_complete_vbd_request(vbd, vreq);
+ }
+
+ return err;
+
+fail:
+ vreq->status = BLKIF_RSP_ERROR;
+ goto out;
+}
+
+/*
+ * Walk the failed queue and resubmit the requests that are due.
+ *
+ * A request is skipped while sectors are still pending for it, or while
+ * its retry interval (TD_VBD_RETRY_INTERVAL seconds since the last try)
+ * has not elapsed; requests that failed with -EBUSY are not subject to
+ * the interval.  A request is given up on - handed to
+ * tapdisk_vbd_complete_vbd_request() - when a shutdown was requested or
+ * its retry budget (TD_VBD_MAX_RETRIES) is used up.
+ *
+ * Afterwards TD_VBD_RETRY_NEEDED reflects whether the failed queue still
+ * holds requests.  Returns 0, or the first error from resubmission.
+ */
+static int
+tapdisk_vbd_reissue_failed_requests(td_vbd_t *vbd)
+{
+	int err;
+	struct timeval now;
+	td_vbd_request_t *vreq, *tmp;
+
+	err = 0;
+	gettimeofday(&now, NULL);
+
+	tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) {
+		if (vreq->secs_pending)
+			continue;
+
+		/* shutting down: give up without waiting for the interval */
+		if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED))
+			goto fail;
+
+		if (vreq->error != -EBUSY &&
+		    now.tv_sec - vreq->last_try.tv_sec < TD_VBD_RETRY_INTERVAL)
+			continue;
+
+		if (vreq->num_retries >= TD_VBD_MAX_RETRIES) {
+		fail:	/* also entered by the shutdown check above */
+			DBG(TLOG_INFO, "req %"PRIu64" retried %d times\n",
+			    vreq->req.id, vreq->num_retries);
+			tapdisk_vbd_complete_vbd_request(vbd, vreq);
+			continue;
+		}
+
+		/*
+		 * never fail due to too many retries if we are blocked on a
+		 * dependency
+		 */
+		if (vreq->blocked) {
+			vreq->blocked = 0;
+		} else {
+			vbd->retries++;
+			vreq->num_retries++;
+		}
+		vreq->error  = 0;
+		vreq->status = BLKIF_RSP_OKAY;
+		DBG(TLOG_DBG, "retry #%d of req %"PRIu64", "
+		    "sec 0x%08"PRIx64", nr_segs: %d\n", vreq->num_retries,
+		    vreq->req.id, vreq->req.sector_number,
+		    vreq->req.nr_segments);
+
+		err = tapdisk_vbd_issue_request(vbd, vreq);
+		if (err)
+			break;
+	}
+
+	if (list_empty(&vbd->failed_requests))
+		td_flag_clear(vbd->state, TD_VBD_RETRY_NEEDED);
+	else
+		td_flag_set(vbd->state, TD_VBD_RETRY_NEEDED);
+
+	return err;
+}
+
+/*
+ * Submit every request on the new queue, stopping at the first error.
+ * Returns 0 if all submissions succeeded (or the queue was empty).
+ */
+static int
+tapdisk_vbd_issue_new_requests(td_vbd_t *vbd)
+{
+	td_vbd_request_t *vreq, *following;
+	int err = 0;
+
+	tapdisk_vbd_for_each_request(vreq, following, &vbd->new_requests) {
+		err = tapdisk_vbd_issue_request(vbd, vreq);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+/*
+ * Fail everything that has not been submitted: all requests on the new
+ * queue, then all on the failed queue, are marked BLKIF_RSP_ERROR and
+ * moved to the completed queue.  Always returns 0.
+ */
+static int
+tapdisk_vbd_kill_requests(td_vbd_t *vbd)
+{
+	struct list_head *queues[] = {
+		&vbd->new_requests,
+		&vbd->failed_requests,
+	};
+	td_vbd_request_t *vreq, *following;
+	unsigned int q;
+
+	for (q = 0; q < sizeof(queues) / sizeof(queues[0]); q++) {
+		tapdisk_vbd_for_each_request(vreq, following, queues[q]) {
+			vreq->status = BLKIF_RSP_ERROR;
+			tapdisk_vbd_move_request(vreq,
+						 &vbd->completed_requests);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Issue queued work: failed requests that are due for a retry first,
+ * then new requests.
+ *
+ * A dead vbd fails all queued requests instead.  Returns -EAGAIN when
+ * the queue is not ready, otherwise the first submission error or 0.
+ */
+int
+tapdisk_vbd_issue_requests(td_vbd_t *vbd)
+{
+	int err;
+
+	if (td_flag_test(vbd->state, TD_VBD_DEAD))
+		return tapdisk_vbd_kill_requests(vbd);
+
+	if (!tapdisk_vbd_queue_ready(vbd))
+		return -EAGAIN;
+
+	err = tapdisk_vbd_reissue_failed_requests(vbd);
+	if (!err)
+		err = tapdisk_vbd_issue_new_requests(vbd);
+
+	return err;
+}
+
+static void
+tapdisk_vbd_pull_ring_requests(td_vbd_t *vbd)
+{ /* Consume every request the frontend has published on the shared ring
+ * and queue a private copy of each on vbd->new_requests.  Does nothing
+ * while the ring is not mapped. */
+ int idx;
+ RING_IDX rp, rc;
+ td_ring_t *ring;
+ blkif_request_t *req;
+ td_vbd_request_t *vreq;
+
+ ring = &vbd->ring;
+ if (!ring->sring)
+ return;
+
+ rp = ring->fe_ring.sring->req_prod; /* snapshot of the producer index */
+ xen_rmb(); /* barrier between reading req_prod and reading the requests */
+
+ for (rc = ring->fe_ring.req_cons; rc != rp; rc++) {
+ req = RING_GET_REQUEST(&ring->fe_ring, rc);
+ ++ring->fe_ring.req_cons;
+
+ idx = req->id; /* NOTE(review): used unchecked as an index into
+ * request_list[MAX_REQUESTS]; presumably the producer guarantees
+ * the range - confirm */
+ vreq = &vbd->request_list[idx];
+
+ ASSERT(list_empty(&vreq->next)); /* slot must not be in use */
+ ASSERT(vreq->secs_pending == 0);
+
+ memcpy(&vreq->req, req, sizeof(blkif_request_t));
+ vbd->received++;
+ vreq->vbd = vbd;
+
+ tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+
+ DBG(TLOG_DBG, "%s: request %d \n", vbd->name, idx);
+ }
+}
+
+/*
+ * Handle a pause message from the ring.
+ *
+ * Already paused: returns 0.  Otherwise a pause is requested, the queue
+ * quiesced (failing with that error if requests are still active), the
+ * vdi closed and the driver told via BLKTAP2_IOCTL_PAUSE.  Only a
+ * successful ioctl turns the request into TD_VBD_PAUSED.
+ */
+static int
+tapdisk_vbd_pause_ring(td_vbd_t *vbd)
+{
+	int err;
+
+	if (td_flag_test(vbd->state, TD_VBD_PAUSED))
+		return 0;
+
+	td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED);
+
+	err = tapdisk_vbd_quiesce_queue(vbd);
+	if (err) {
+		EPRINTF("%s: ring pause request on active queue\n", vbd->name);
+		return err;
+	}
+
+	tapdisk_vbd_close_vdi(vbd);
+
+	err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_PAUSE, 0);
+	if (!err) {
+		td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
+		td_flag_set(vbd->state, TD_VBD_PAUSED);
+	} else {
+		EPRINTF("%s: pause ioctl failed: %d\n", vbd->name, errno);
+	}
+
+	return err;
+}
+
+/*
+ * Handle a resume message from the ring.
+ *
+ * Fetches the reopen string from the driver (BLKTAP2_IOCTL_REOPEN),
+ * parses it into a path and type, renames the vbd to that path, restarts
+ * the queue and reopens the vdi with TD_OPEN_STRICT (retrying up to
+ * TD_VBD_EIO_RETRIES times while the open fails with -EIO).
+ *
+ * On success the new sector size, capacity and name are pushed to the
+ * driver with BLKTAP2_IOCTL_SET_PARAMS and TD_VBD_PAUSED is cleared.
+ * Once the reopen string has been fetched, the outcome is always
+ * reported to the driver with BLKTAP2_IOCTL_RESUME.
+ *
+ * The new name is duplicated before the old one is released, so a failed
+ * allocation leaves vbd->name valid.
+ *
+ * Returns 0 on success, -EINVAL if the vbd is not paused, the ioctl
+ * result if fetching the reopen string fails, -ENOMEM if the name cannot
+ * be copied, or the parse / open error.
+ */
+static int
+tapdisk_vbd_resume_ring(td_vbd_t *vbd)
+{
+	int i, err, type;
+	char message[BLKTAP2_MAX_MESSAGE_LEN];
+	const char *path;
+	char *name;
+
+	memset(message, 0, sizeof(message));
+
+	if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) {
+		EPRINTF("%s: resume message for unpaused vbd\n", vbd->name);
+		return -EINVAL;
+	}
+
+	err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_REOPEN, &message);
+	if (err) {
+		EPRINTF("%s: resume ioctl failed: %d\n", vbd->name, errno);
+		return err;
+	}
+
+	err = tapdisk_parse_disk_type(message, &path, &type);
+	if (err) {
+		EPRINTF("%s: invalid resume string %s\n", vbd->name, message);
+		goto out;
+	}
+
+	name = strdup(path);
+	if (!name) {
+		EPRINTF("resume malloc failed\n");
+		err = -ENOMEM;
+		goto out;
+	}
+	free(vbd->name);
+	vbd->name = name;
+
+	tapdisk_vbd_start_queue(vbd);
+
+	for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
+		err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT);
+		if (err != -EIO)
+			break;
+
+		sleep(TD_VBD_EIO_SLEEP);
+	}
+
+out:
+	if (!err) {
+		image_t image;
+		struct blktap2_params params;
+
+		memset(&params, 0, sizeof(params));
+		tapdisk_vbd_get_image_info(vbd, &image);
+
+		params.sector_size = image.secsize;
+		params.capacity    = image.size;
+		snprintf(params.name, sizeof(params.name) - 1, "%s", message);
+
+		ioctl(vbd->ring.fd, BLKTAP2_IOCTL_SET_PARAMS, &params);
+		td_flag_clear(vbd->state, TD_VBD_PAUSED);
+	}
+
+	/* the driver is told the outcome whether or not the reopen worked */
+	ioctl(vbd->ring.fd, BLKTAP2_IOCTL_RESUME, err);
+	return err;
+}
+
+/*
+ * Read the control message word from the shared ring and dispatch it:
+ * pause, resume or close.  No message (0) is a no-op returning 0; an
+ * unmapped ring or an unknown message yields -EINVAL.
+ */
+static int
+tapdisk_vbd_check_ring_message(td_vbd_t *vbd)
+{
+	int ret;
+
+	if (!vbd->ring.sring)
+		return -EINVAL;
+
+	switch (vbd->ring.sring->pvt.tapif_user.msg) {
+	case BLKTAP2_RING_MESSAGE_PAUSE:
+		ret = tapdisk_vbd_pause_ring(vbd);
+		break;
+
+	case BLKTAP2_RING_MESSAGE_RESUME:
+		ret = tapdisk_vbd_resume_ring(vbd);
+		break;
+
+	case BLKTAP2_RING_MESSAGE_CLOSE:
+		ret = tapdisk_vbd_close(vbd);
+		break;
+
+	case 0:
+		ret = 0;
+		break;
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Scheduler callback for activity on the ring fd: pull new requests off
+ * the ring, issue them, then process any control message.  @private is
+ * the vbd that registered the event.
+ */
+static void
+tapdisk_vbd_ring_event(event_id_t id, char mode, void *private)
+{
+	td_vbd_t *vbd = (td_vbd_t *)private;
+
+	tapdisk_vbd_pull_ring_requests(vbd);
+	tapdisk_vbd_issue_requests(vbd);
+
+	/* vbd may be destroyed after this call */
+	tapdisk_vbd_check_ring_message(vbd);
+}
+
+/*
+ * Return the image at the front of vbd->images.  The list must not be
+ * empty: no check is made here.
+ */
+td_image_t *
+tapdisk_vbd_first_image(td_vbd_t *vbd)
+{
+	struct list_head *front = vbd->images.next;
+
+	return list_entry(front, td_image_t, next);
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_VBD_H_
+#define _TAPDISK_VBD_H_
+
+#include <sys/time.h>
+#include <xenctrl.h>
+#include <xen/io/blkif.h>
+
+#include "tapdisk.h"
+#include "scheduler.h"
+#include "tapdisk-image.h"
+
+#define TD_VBD_MAX_RETRIES 100 /* retry budget of a failed request */
+#define TD_VBD_RETRY_INTERVAL 1 /* compared against a tv_sec difference, i.e. seconds */
+
+#define TD_VBD_DEAD 0x0001 /* bits of td_vbd_handle.state; set by tapdisk_vbd_kill_queue() */
+#define TD_VBD_CLOSED 0x0002 /* cleared once the vdi has been opened successfully */
+#define TD_VBD_QUIESCE_REQUESTED 0x0004 /* quiesce wanted but requests still pending */
+#define TD_VBD_QUIESCED 0x0008 /* queue quiesced, no requests pending */
+#define TD_VBD_PAUSE_REQUESTED 0x0010 /* pause wanted, not yet completed */
+#define TD_VBD_PAUSED 0x0020 /* vdi closed by a pause */
+#define TD_VBD_SHUTDOWN_REQUESTED 0x0040 /* close deferred because requests remain */
+#define TD_VBD_LOCKING 0x0080 /* tapdisk_vbd_lock() is called before the strict reopen */
+#define TD_VBD_RETRY_NEEDED 0x0100 /* failed_requests is non-empty */
+#define TD_VBD_LOG_DROPPED 0x0200 /* debug state was already dumped once */
+
+typedef struct td_ring td_ring_t; /* mapped blktap ring of a vbd */
+typedef struct td_vbd_request td_vbd_request_t; /* per-request bookkeeping */
+typedef struct td_vbd_driver_info td_vbd_driver_info_t; /* one entry of the driver stack */
+typedef struct td_vbd_handle td_vbd_t; /* the virtual block device itself */
+typedef void (*td_vbd_cb_t) (void *, blkif_response_t *); /* response callback: (argument, response) */
+
+struct td_ring { /* state of the mapped blktap device ring */
+ int fd; /* open descriptor of the blktap device, -1 when not open */
+ char *mem; /* start of the mmap()ed region */
+ blkif_sring_t *sring; /* shared ring; located at the start of mem */
+ blkif_back_ring_t fe_ring; /* back-ring view of sring */
+ unsigned long vstart; /* address of the data pages that follow the ring pages */
+};
+
+struct td_vbd_request { /* bookkeeping for one ring request */
+ blkif_request_t req; /* private copy of the ring request; reused to build the response */
+ int16_t status; /* BLKIF_RSP_* status reported in the response */
+
+ int error; /* first error recorded for this request, 0 if none */
+ int blocked; /* blocked on a dependency */
+ int submitting; /* non-zero while segment requests are being queued */
+ int secs_pending; /* sectors queued to the images and not yet completed */
+ int num_retries; /* retries counted against TD_VBD_MAX_RETRIES */
+ struct timeval last_try; /* time of the most recent submission */
+
+ td_vbd_t *vbd; /* owning vbd */
+ struct list_head next; /* link on one of the vbd's request queues */
+};
+
+struct td_vbd_driver_info { /* one user-specified driver of the stack */
+ char *params; /* heap-allocated driver parameter string (path) */
+ int type; /* driver type */
+ struct list_head next; /* link on td_vbd_handle.driver_stack */
+};
+
+struct td_vbd_handle { /* a virtual block device served by tapdisk */
+ char *name; /* heap-allocated name; may be NULL */
+
+ td_uuid_t uuid;
+ int minor; /* minor recorded on attach, -1 after detach */
+
+ struct list_head driver_stack; /* td_vbd_driver_info entries */
+
+ int storage;
+
+ uint8_t reopened; /* set once the strict reopen of the first image succeeded */
+ uint8_t reactivated;
+ td_flag_t flags; /* TD_OPEN_* flags used when opening images */
+ td_flag_t state; /* TD_VBD_* state bits */
+
+ struct list_head images; /* chain of opened td_image_t */
+
+ struct list_head new_requests; /* pulled from the ring, not yet issued */
+ struct list_head pending_requests; /* issued to the images */
+ struct list_head failed_requests; /* failed, waiting to be retried */
+ struct list_head completed_requests; /* finished, response not yet written */
+
+ td_vbd_request_t request_list[MAX_REQUESTS]; /* indexed by the ring request id */
+
+ td_ring_t ring;
+ event_id_t ring_event_id; /* id of the registered ring event */
+
+ td_vbd_cb_t callback; /* invoked with each response */
+ void *argument; /* first argument passed to callback */
+
+ struct list_head next;
+
+ struct timeval ts; /* time of last request activity */
+
+ uint64_t received; /* requests pulled from the ring */
+ uint64_t returned; /* responses handed to callback */
+ uint64_t kicked; /* responses pushed to the frontend */
+ uint64_t secs_pending; /* sectors in flight across all requests */
+ uint64_t retries; /* retry attempts made */
+ uint64_t errors; /* segment errors other than -EBUSY */
+};
+
+#define tapdisk_vbd_for_each_request(vreq, tmp, list) \
+ list_for_each_entry_safe((vreq), (tmp), (list), next) /* removal-safe walk over a request queue */
+
+#define tapdisk_vbd_for_each_image(vbd, image, tmp) \
+ list_for_each_entry_safe((image), (tmp), &(vbd)->images, next) /* removal-safe walk over the image chain */
+
+/*
+ * Unlink @vreq from the list it currently sits on, reinitialise its
+ * list node, and append it to the tail of @dest.
+ */
+static inline void
+tapdisk_vbd_move_request(td_vbd_request_t *vreq, struct list_head *dest)
+{
+	struct list_head *node = &vreq->next;
+
+	list_del(node);
+	INIT_LIST_HEAD(node);
+	list_add_tail(node, dest);
+}
+
+/*
+ * Append @image to the tail of @vbd's image list.
+ */
+static inline void
+tapdisk_vbd_add_image(td_vbd_t *vbd, td_image_t *image)
+{
+	struct list_head *images = &vbd->images;
+
+	list_add_tail(&image->next, images);
+}
+
+/*
+ * Return the result of list_is_last() for @image's node within @vbd's
+ * image list.
+ */
+static inline int
+tapdisk_vbd_is_last_image(td_vbd_t *vbd, td_image_t *image)
+{
+	struct list_head *node = &image->next;
+	struct list_head *head = &vbd->images;
+
+	return list_is_last(node, head);
+}
+
+td_image_t *
+tapdisk_vbd_first_image(td_vbd_t *vbd);
+
+/*
+ * Return the image whose list node is the 'prev' neighbour of @vbd's
+ * image list head.
+ * NOTE(review): no emptiness check is performed here; callers presumably
+ * guarantee at least one image -- confirm.
+ */
+static inline td_image_t *
+tapdisk_vbd_last_image(td_vbd_t *vbd)
+{
+	struct list_head *tail = vbd->images.prev;
+
+	return list_entry(tail, td_image_t, next);
+}
+
+/*
+ * Return the image whose list node follows @image's node.
+ * NOTE(review): no end-of-list check is performed here; see
+ * tapdisk_vbd_is_last_image() -- confirm how callers guard this.
+ */
+static inline td_image_t *
+tapdisk_vbd_next_image(td_image_t *image)
+{
+	struct list_head *succ = image->next.next;
+
+	return list_entry(succ, td_image_t, next);
+}
+
+td_vbd_t *tapdisk_vbd_create(td_uuid_t);
+int tapdisk_vbd_initialize(td_uuid_t);
+void tapdisk_vbd_set_callback(td_vbd_t *, td_vbd_cb_t, void *);
+int tapdisk_vbd_parse_stack(td_vbd_t *vbd, const char *path);
+int tapdisk_vbd_open(td_vbd_t *, const char *, uint16_t,
+ uint16_t, int, const char *, td_flag_t);
+int tapdisk_vbd_close(td_vbd_t *);
+void tapdisk_vbd_free(td_vbd_t *);
+void tapdisk_vbd_free_stack(td_vbd_t *);
+
+int tapdisk_vbd_open_stack(td_vbd_t *, uint16_t, td_flag_t);
+int tapdisk_vbd_open_vdi(td_vbd_t *, const char *,
+ uint16_t, uint16_t, td_flag_t);
+void tapdisk_vbd_close_vdi(td_vbd_t *);
+
+int tapdisk_vbd_attach(td_vbd_t *, const char *, int);
+void tapdisk_vbd_detach(td_vbd_t *);
+
+void tapdisk_vbd_forward_request(td_request_t);
+
+int tapdisk_vbd_get_image_info(td_vbd_t *, image_t *);
+int tapdisk_vbd_queue_ready(td_vbd_t *);
+int tapdisk_vbd_retry_needed(td_vbd_t *);
+int tapdisk_vbd_quiesce_queue(td_vbd_t *);
+int tapdisk_vbd_start_queue(td_vbd_t *);
+int tapdisk_vbd_issue_requests(td_vbd_t *);
+int tapdisk_vbd_kill_queue(td_vbd_t *);
+int tapdisk_vbd_pause(td_vbd_t *);
+int tapdisk_vbd_resume(td_vbd_t *, const char *, uint16_t);
+int tapdisk_vbd_kick(td_vbd_t *);
+void tapdisk_vbd_check_state(td_vbd_t *);
+void tapdisk_vbd_check_progress(td_vbd_t *);
+void tapdisk_vbd_debug(td_vbd_t *);
+
+void tapdisk_vbd_complete_vbd_request(td_vbd_t *, td_vbd_request_t *);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Some notes on the tap_disk interface:
+ *
+ * tap_disk aims to provide a generic interface to easily implement new
+ * types of image accessors. The structure-of-function-calls is similar
+ * to disk interfaces used in qemu/denali/etc, with the significant
+ * difference being the expectation of asynchronous rather than synchronous
+ * I/O. The asynchronous interface is intended to allow lots of requests to
+ * be pipelined through a disk, without the disk requiring any of its own
+ * threads of control. As such, a batch of requests is delivered to the disk
+ * using:
+ *
+ * td_queue_[read,write]()
+ *
+ * and passing in a completion callback, which the disk is responsible for
+ * tracking. Disks should transform these requests as necessary and return
+ * the resulting iocbs to tapdisk using td_prep_[read,write]() and
+ * td_queue_tiocb().
+ *
+ * NOTE: tapdisk uses the number of sectors submitted per request as a
+ * ref count. Plugins must use the callback function to communicate the
+ * completion -- or error -- of every sector submitted to them.
+ *
+ * td_get_parent_id returns:
+ * 0 if parent id successfully retrieved
+ * TD_NO_PARENT if no parent exists
+ * -errno on error
+ */
+
+#ifndef _TAPDISK_H_
+#define _TAPDISK_H_
+
+#include <time.h>
+#include <stdint.h>
+
+#include "list.h"
+#include "blktaplib.h"
+#include "tapdisk-log.h"
+#include "tapdisk-utils.h"
+
+#ifdef MEMSHR
+#include "memshr.h"
+#endif
+
+#define DPRINTF(_f, _a...) syslog(LOG_INFO, _f, ##_a)
+#define EPRINTF(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a)
+#define PERROR(_f, _a...) EPRINTF(_f ": %s", ##_a, strerror(errno))
+
+#define MAX_SEGMENTS_PER_REQ 11
+#define SECTOR_SHIFT 9
+#define DEFAULT_SECTOR_SIZE 512
+
+#define TAPDISK_DATA_REQUESTS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)
+
+//#define BLK_NOT_ALLOCATED (-99)
+#define TD_NO_PARENT 1
+
+#define MAX_RAMDISK_SIZE 1024000 /*500MB disk limit*/
+
+#define TD_OP_READ 0
+#define TD_OP_WRITE 1
+
+#define TD_OPEN_QUIET 0x00001
+#define TD_OPEN_QUERY 0x00002
+#define TD_OPEN_RDONLY 0x00004
+#define TD_OPEN_STRICT 0x00008
+#define TD_OPEN_SHAREABLE 0x00010
+#define TD_OPEN_ADD_CACHE 0x00020
+#define TD_OPEN_VHD_INDEX 0x00040
+#define TD_OPEN_LOG_DIRTY 0x00080
+
+#define TD_CREATE_SPARSE 0x00001
+#define TD_CREATE_MULTITYPE 0x00002
+
+/*
+ * Bit-flag helpers.  td_flag_set() and td_flag_clear() modify 'word' in
+ * place; td_flag_test() yields the masked bits (non-zero when any bit of
+ * 'flag' is set in 'word'), not a normalised 0/1.
+ */
+#define td_flag_set(word, flag) ((word) |= (flag))
+#define td_flag_clear(word, flag) ((word) &= ~(flag))
+#define td_flag_test(word, flag) ((word) & (flag))
+
+typedef uint16_t td_uuid_t;
+typedef uint32_t td_flag_t;
+typedef uint64_t td_sector_t;
+typedef struct td_disk_id td_disk_id_t;
+typedef struct td_disk_info td_disk_info_t;
+typedef struct td_request td_request_t;
+typedef struct td_driver_handle td_driver_t;
+typedef struct td_image_handle td_image_t;
+
+/*
+ * Identifies a disk by name and integer driver type.  This is the type
+ * the td_get_parent_id() driver hook fills in.
+ */
+struct td_disk_id {
+ char *name;
+ int drivertype;
+};
+
+/*
+ * Basic disk geometry.
+ * NOTE(review): 'size' has type td_sector_t, so it is presumably a count
+ * of sectors of 'sector_size' bytes; the meaning of 'info' is not visible
+ * here -- confirm both.
+ */
+struct td_disk_info {
+ td_sector_t size;
+ uint64_t sector_size;
+ uint32_t info;
+};
+
+/*
+ * A single I/O request handed to a driver's td_queue_read/td_queue_write
+ * hook.  It is passed by value, both to the hooks and to the completion
+ * callback (td_callback_t).
+ */
+struct td_request {
+ int op; /* TD_OP_READ or TD_OP_WRITE */
+ char *buf;
+ td_sector_t sec;
+ int secs;
+
+ uint8_t blocked; /* blocked on a dependency */
+
+ td_image_t *image;
+
+ /*
+ * Completion callback.  Stored as void * because td_callback_t takes a
+ * td_request_t by value and is therefore declared after this struct.
+ */
+ void * /*td_callback_t*/ cb;
+ void *cb_data;
+
+ uint64_t id;
+ int sidx;
+ void *private;
+
+#ifdef MEMSHR
+ share_tuple_t memshr_hnd;
+#endif
+};
+
+/*
+ * Prototype of the callback to activate as requests complete.
+ */
+typedef void (*td_callback_t)(td_request_t, int);
+
+/*
+ * Structure describing the interface to a virtual disk implementation.
+ * See note at the top of this file describing this interface.
+ *
+ * disk_type, flags and private_data_size describe the driver; the
+ * remaining members are the hooks it implements.  The queue hooks take
+ * the request by value and return nothing.
+ */
+struct tap_disk {
+ const char *disk_type;
+ td_flag_t flags;
+ /* NOTE(review): presumably the size of the per-driver private area -- confirm */
+ int private_data_size;
+ int (*td_open) (td_driver_t *, const char *, td_flag_t);
+ int (*td_close) (td_driver_t *);
+ /* 0, TD_NO_PARENT or -errno; see the note at the top of this file */
+ int (*td_get_parent_id) (td_driver_t *, td_disk_id_t *);
+ int (*td_validate_parent) (td_driver_t *, td_driver_t *, td_flag_t);
+ void (*td_queue_read) (td_driver_t *, td_request_t);
+ void (*td_queue_write) (td_driver_t *, td_request_t);
+ void (*td_debug) (td_driver_t *);
+};
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#ifdef MEMSHR
+#include <memshr.h>
+#endif
+
+#include "tapdisk.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-server.h"
+#include "tapdisk-control.h"
+
+/*
+ * Print the command-line synopsis to stderr and terminate the process
+ * with exit status @err.
+ *
+ * The synopsis lists the options that main() passes to getopt()
+ * ("s:Dh"): -h for help, -D to stay in the foreground and -s <domid>
+ * for memory sharing.
+ */
+static void
+usage(const char *app, int err)
+{
+	fprintf(stderr, "usage: %s [-h] [-D] [-s domid]\n", app);
+	exit(err);
+}
+
+/*
+ * tapdisk2 entry point.
+ *
+ * Sequence: parse options, chdir to "/", start logging, initialise the
+ * server, daemonize (unless -D), open the control socket and print its
+ * path on stdout, redirect stdio to /dev/null when daemonized, complete
+ * server setup and run the server loop.
+ *
+ * Returns the first non-zero error encountered, otherwise the result of
+ * tapdisk_server_run().
+ */
+int
+main(int argc, char *argv[])
+{
+ char *control;
+ int c, err, nodaemon;
+
+ control = NULL;
+ nodaemon = 0;
+
+ while ((c = getopt(argc, argv, "s:Dh")) != -1) {
+ switch (c) {
+ case 'D':
+ nodaemon = 1;
+ break;
+ case 'h':
+ usage(argv[0], 0);
+ break;
+ case 's':
+#ifdef MEMSHR
+ memshr_set_domid(atoi(optarg));
+#else
+ fprintf(stderr, "MEMSHR support not compiled in.\n");
+ exit(EXIT_FAILURE);
+#endif
+ break;
+ default:
+ usage(argv[0], EINVAL);
+ }
+ }
+
+ /* no positional arguments are accepted */
+ if (optind != argc)
+ usage(argv[0], EINVAL);
+
+ /*
+ * NOTE(review): this failure path logs through DPRINTF and jumps to
+ * 'out' before tapdisk_start_logging() and tapdisk_control_open() have
+ * run -- confirm the close/stop calls at 'out' are safe in that state.
+ */
+ if (chdir("/")) {
+ DPRINTF("failed to chdir(/): %d\n", errno);
+ err = 1;
+ goto out;
+ }
+
+ tapdisk_start_logging("tapdisk2");
+
+ err = tapdisk_server_init();
+ if (err) {
+ DPRINTF("failed to initialize server: %d\n", err);
+ goto out;
+ }
+
+ if (!nodaemon) {
+ /*
+ * noclose=1: stdio stays open so the control path can still be
+ * printed below; it is redirected to /dev/null afterwards.
+ */
+ err = daemon(0, 1);
+ if (err) {
+ DPRINTF("failed to daemonize: %d\n", errno);
+ goto out;
+ }
+ }
+
+ err = tapdisk_control_open(&control);
+ if (err) {
+ DPRINTF("failed to open control socket: %d\n", err);
+ goto out;
+ }
+
+ /* report the control socket path on stdout */
+ fprintf(stdout, "%s\n", control);
+ fflush(stdout);
+
+ if (!nodaemon) {
+ int fd;
+
+ /* detach stdin/stdout/stderr now that the path has been reported */
+ fd = open("/dev/null", O_RDWR);
+ if (fd != -1) {
+ dup2(fd, STDIN_FILENO);
+ dup2(fd, STDOUT_FILENO);
+ dup2(fd, STDERR_FILENO);
+ if (fd > 2)
+ close(fd);
+ }
+ }
+
+ err = tapdisk_server_complete();
+ if (err) {
+ DPRINTF("failed to complete server: %d\n", err);
+ goto out;
+ }
+
+ err = tapdisk_server_run();
+
+out:
+ tapdisk_control_close();
+ tapdisk_stop_logging();
+ return err;
+}
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+#include "tapdisk-utils.h"
+
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stdout, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/*
+ * VDI fields known to td-util.  TD_FIELD_INVALID doubles as the number of
+ * valid fields and sizes the table below.
+ */
+typedef enum {
+ TD_FIELD_HIDDEN = 0,
+ TD_FIELD_INVALID = 1
+} td_field_t;
+
+/* Maps a field id to the name used on the command line. */
+struct vdi_field {
+ char *name;
+ td_field_t id;
+};
+
+/* Table of valid fields, indexed by td_field_t. */
+static struct vdi_field td_vdi_fields[TD_FIELD_INVALID] = {
+ { .id = TD_FIELD_HIDDEN, .name = "hidden" }
+};
+
+/*
+ * Sub-commands understood by td-util.  The commented-out entries are
+ * disabled both here and in the dispatch switch in main().
+ * TD_CMD_INVALID doubles as the number of enabled commands.
+ */
+typedef enum {
+ TD_CMD_CREATE = 0,
+ TD_CMD_SNAPSHOT,
+/* TD_CMD_COALESCE, */
+ TD_CMD_QUERY,
+/* TD_CMD_RESIZE, */
+ TD_CMD_SET,
+/* TD_CMD_REPAIR, */
+/* TD_CMD_FILL, */
+/* TD_CMD_READ, */
+ TD_CMD_INVALID,
+} td_command_t;
+
+/*
+ * Describes one sub-command: its id, its command-line name and whether a
+ * TYPE argument must follow it.
+ */
+struct command {
+ td_command_t id;
+ char *name;
+ int needs_type;
+};
+
+/* Command table; entries appear in td_command_t order. */
+struct command commands[TD_CMD_INVALID] = {
+ { .id = TD_CMD_CREATE, .name = "create", .needs_type = 1 },
+ { .id = TD_CMD_SNAPSHOT, .name = "snapshot", .needs_type = 1 },
+/* { .id = TD_CMD_COALESCE, .name = "coalesce", .needs_type = 1 }, */
+ { .id = TD_CMD_QUERY, .name = "query", .needs_type = 1 },
+/* { .id = TD_CMD_RESIZE, .name = "resize", .needs_type = 1 }, */
+ { .id = TD_CMD_SET, .name = "set", .needs_type = 1 },
+/* { .id = TD_CMD_REPAIR, .name = "repair", .needs_type = 1 }, */
+/* { .id = TD_CMD_FILL, .name = "fill", .needs_type = 1 }, */
+/* { .id = TD_CMD_READ, .name = "read", .needs_type = 1 }, */
+};
+
+/*
+ * Disk image types accepted as the TYPE argument.  TD_TYPE_INVALID
+ * doubles as the number of valid types.
+ */
+typedef enum {
+ TD_TYPE_VHD = 0,
+ TD_TYPE_AIO,
+ TD_TYPE_INVALID,
+} td_disk_t;
+
+/* Command-line names of the disk types, indexed by td_disk_t. */
+const char *td_disk_types[TD_TYPE_INVALID] = {
+ "vhd",
+ "aio",
+};
+
+/*
+ * Print the valid COMMAND names to stderr as "COMMAND := { a | b }".
+ * Expands to one statement; the caller supplies the trailing semicolon.
+ */
+#define print_commands() \
+ do { \
+ int i; \
+ fprintf(stderr, "COMMAND := { "); \
+ fprintf(stderr, "%s", commands[0].name); \
+ for (i = 1; i < TD_CMD_INVALID; i++) \
+ fprintf(stderr, " | %s", commands[i].name); \
+ fprintf(stderr, " }\n"); \
+ } while (0)
+
+/*
+ * Print the valid disk TYPE names to stderr as "TYPE := { a | b }".
+ *
+ * Expands to exactly one statement; the caller supplies the trailing
+ * semicolon.  The expansion must not end in ';' itself, otherwise
+ * "if (x) print_disk_types(); else ..." would not compile.
+ */
+#define print_disk_types() \
+	do { \
+		int i; \
+		fprintf(stderr, "TYPE := { "); \
+		fprintf(stderr, "%s", td_disk_types[0]); \
+		for (i = 1; i < TD_TYPE_INVALID; i++) \
+			fprintf(stderr, " | %s", td_disk_types[i]); \
+		fprintf(stderr, " }\n"); \
+	} while (0)
+
+/*
+ * Print the valid FIELD names to stderr as "FIELD := { a | b }".
+ * Expands to one statement; the caller supplies the trailing semicolon.
+ * With TD_FIELD_INVALID equal to 1 the loop body never runs and only the
+ * first field is printed.
+ */
+#define print_field_names() \
+ do { \
+ int i; \
+ fprintf(stderr, "FIELD := { "); \
+ fprintf(stderr, "%s", td_vdi_fields[0].name); \
+ for (i = 1; i < TD_FIELD_INVALID; i++) \
+ fprintf(stderr, " | %s", td_vdi_fields[i].name); \
+ fprintf(stderr, " }\n"); \
+ } while (0)
+
+/*
+ * Write the tool banner, the generic usage line and the lists of valid
+ * COMMAND and TYPE names to stderr, then exit with status -1.
+ */
+void
+help(void)
+{
+	fputs("Tapdisk Utilities: v1.0.0\n"
+	      "usage: td-util COMMAND [TYPE] [OPTIONS]\n", stderr);
+	print_commands();
+	print_disk_types();
+	exit(-1);
+}
+
+/*
+ * Look up a sub-command by its command-line name.
+ *
+ * Returns a pointer to the matching entry of the commands table, or NULL
+ * when @command matches none of them.
+ */
+struct command *
+get_command(char *command)
+{
+	struct command *cur;
+
+	for (cur = commands; cur < commands + TD_CMD_INVALID; cur++) {
+		if (strcmp(command, cur->name) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up a VDI field by its command-line name.
+ *
+ * Returns a pointer to the matching entry of td_vdi_fields, or NULL when
+ * @field matches none of them.
+ */
+struct vdi_field *
+get_field(char *field)
+{
+	struct vdi_field *cur;
+
+	for (cur = td_vdi_fields; cur < td_vdi_fields + TD_FIELD_INVALID; cur++) {
+		if (strcmp(field, cur->name) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Translate a TYPE name into its td_disk_t index.
+ *
+ * Returns the index of the matching td_disk_types entry,
+ * -ENAMETOOLONG when @type is 25 characters or longer, or
+ * -TD_TYPE_INVALID when no entry matches.
+ */
+int
+get_driver_type(char *type)
+{
+	int idx = 0;
+
+	if (strnlen(type, 25) >= 25)
+		return -ENAMETOOLONG;
+
+	while (idx < TD_TYPE_INVALID) {
+		if (strcmp(type, td_disk_types[idx]) == 0)
+			return idx;
+		idx++;
+	}
+
+	return -TD_TYPE_INVALID;
+}
+
+/*
+ * td-util "create": create a new disk image.
+ *
+ * @type: index into td_disk_types.
+ * @argc, @argv: sub-command arguments:
+ *               [-h] [-r] [-b] <SIZE(MB)> <FILENAME>
+ *
+ * VHD images are delegated to vhd_util_create().  Any other type is
+ * written out as a fully allocated, zero-filled file, so it requires -r
+ * (non-sparse).
+ *
+ * Returns 0 on success, the result of vhd_util_create() for VHD images,
+ * or a positive errno value on failure.
+ */
+int
+td_create(int type, int argc, char *argv[])
+{
+	const ssize_t mb = 1 << 20;
+	unsigned long long mbs, i;
+	long align;
+	void *buf;
+	char *name, *arg, *end;
+	int c, fd, err, sparse = 1, fixedsize = 0;
+
+	while ((c = getopt(argc, argv, "hrb")) != -1) {
+		switch (c) {
+		case 'r':
+			sparse = 0;
+			break;
+		case 'b':
+			fixedsize = 1;
+			break;
+		default:
+			/* getopt() returns '?'; the offending character is in optopt */
+			fprintf(stderr, "Unknown option %c\n", (char)optopt);
+			/* fall through */
+		case 'h':
+			goto usage;
+		}
+	}
+
+	if (optind != (argc - 2))
+		goto usage;
+
+	/*
+	 * Parse the size strictly: atoi() would silently turn garbage into
+	 * 0 and a negative number into an enormous unsigned size.
+	 */
+	arg = argv[optind++];
+	errno = 0;
+	mbs = strtoull(arg, &end, 10);
+	if (errno || end == arg || *end != '\0' || strchr(arg, '-')) {
+		fprintf(stderr, "Invalid size '%s'\n", arg);
+		goto usage;
+	}
+
+	name = argv[optind];
+
+	if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+		fprintf(stderr, "Device name too long\n");
+		return ENAMETOOLONG;
+	}
+
+	if (type == TD_TYPE_VHD) {
+		int cargc = 0;
+		char sbuf[32], *cargv[10];
+
+		memset(cargv, 0, sizeof(cargv));
+		snprintf(sbuf, sizeof(sbuf), "%llu", mbs);
+		cargv[cargc++] = "create";
+		cargv[cargc++] = "-n";
+		cargv[cargc++] = name;
+		cargv[cargc++] = "-s";
+		cargv[cargc++] = sbuf;
+		if (!sparse)
+			cargv[cargc++] = "-r";
+		if (fixedsize)
+			cargv[cargc++] = "-b";
+
+		return vhd_util_create(cargc, cargv);
+	}
+
+	/* generic create */
+	if (sparse) {
+		fprintf(stderr, "Cannot create sparse %s image\n",
+			td_disk_types[type]);
+		return EINVAL;
+	}
+
+	/*
+	 * The file is opened with O_DIRECT, which places alignment
+	 * requirements on the user buffer; a plain calloc() buffer does not
+	 * guarantee them, so allocate a page-aligned one.
+	 */
+	align = sysconf(_SC_PAGESIZE);
+	if (align <= 0)
+		align = 4096;
+
+	err = posix_memalign(&buf, (size_t)align, mb);
+	if (err)
+		return err;
+	memset(buf, 0, mb);
+
+	fd = open(name, O_WRONLY | O_DIRECT | O_CREAT | O_TRUNC, 0644);
+	if (fd == -1) {
+		/* save errno before free() gets a chance to change it */
+		err = errno;
+		free(buf);
+		return err;
+	}
+
+	/* write the image one zeroed megabyte at a time */
+	for (i = 0; i < mbs; i++)
+		if (write(fd, buf, mb) != mb) {
+			close(fd);
+			unlink(name);
+			free(buf);
+			return EIO;
+		}
+
+	close(fd);
+	free(buf);
+	return 0;
+
+ usage:
+	fprintf(stderr, "usage: td-util create %s [-h help] [-r reserve] "
+		"[-b file_is_fixed_size] <SIZE(MB)> <FILENAME>\n",
+		td_disk_types[type]);
+	return EINVAL;
+}
+
+/*
+ * td-util "snapshot": create a VHD snapshot of a backing image.
+ *
+ * @type: index into td_disk_types; only TD_TYPE_VHD is supported.
+ * @argc, @argv: sub-command arguments:
+ *               [-h] [-m] [-b] [-l limit] <FILENAME> <BACKING_FILENAME>
+ *
+ * Builds an argument vector and delegates to vhd_util_snapshot().
+ *
+ * Returns the result of vhd_util_snapshot(), 0 after printing help for
+ * -h, or a positive errno value for invalid arguments or a failed
+ * stat() of the backing file.
+ */
+int
+td_snapshot(int type, int argc, char *argv[])
+{
+	char *cargv[10];
+	int c, err, cargc;
+	struct stat stats;
+	char *name, *backing, *limit = NULL;
+	int fixedsize = 0, rawparent = 0;
+
+	if (type != TD_TYPE_VHD) {
+		fprintf(stderr, "Cannot create snapshot of %s image type\n",
+			td_disk_types[type]);
+		return EINVAL;
+	}
+
+	while ((c = getopt(argc, argv, "hbml:")) != -1) {
+		switch (c) {
+		case 'b':
+			fixedsize = 1;
+			break;
+		case 'm':
+			rawparent = 1;
+			break;
+		case 'l':
+			limit = optarg;
+			break;
+		case 'h':
+			err = 0;
+			goto usage;
+		default:
+			err = EINVAL;
+			goto usage;
+		}
+	}
+
+	if (optind != (argc - 2)) {
+		err = EINVAL;
+		goto usage;
+	}
+
+	name = argv[optind++];
+	backing = argv[optind++];
+
+	if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN ||
+	    strnlen(backing, MAX_NAME_LEN) == MAX_NAME_LEN) {
+		fprintf(stderr, "Device name too long\n");
+		return ENAMETOOLONG;
+	}
+
+	if (stat(backing, &stats) == -1) {
+		/* capture errno first: fprintf() may overwrite it */
+		err = errno;
+		fprintf(stderr, "File %s not found\n", backing);
+		return err;
+	}
+
+	cargc = 0;
+	memset(cargv, 0, sizeof(cargv));
+	cargv[cargc++] = "snapshot";
+	cargv[cargc++] = "-n";
+	cargv[cargc++] = name;
+	cargv[cargc++] = "-p";
+	cargv[cargc++] = backing;
+	if (fixedsize)
+		cargv[cargc++] = "-b";
+	if (rawparent)
+		cargv[cargc++] = "-m";
+	if (limit) {
+		cargv[cargc++] = "-l";
+		cargv[cargc++] = limit;
+	}
+	return vhd_util_snapshot(cargc, cargv);
+
+ usage:
+	fprintf(stderr, "usage: td-util snapshot %s [-h help] [-m parent_raw] "
+		"[-b file_is_fixed_size] [-l snapshot depth limit] "
+		"<FILENAME> <BACKING_FILENAME>\n", td_disk_types[type]);
+	return err;
+}
+
+/*
+ * td-util "coalesce": coalesce a VHD image.
+ *
+ * @type: index into td_disk_types; only TD_TYPE_VHD is supported.
+ * @argc, @argv: sub-command arguments: [-h] <FILENAME>
+ *
+ * Builds an argument vector and delegates to vhd_util_coalesce().
+ *
+ * Returns the result of vhd_util_coalesce(), or a positive errno value
+ * for invalid arguments.
+ */
+int
+td_coalesce(int type, int argc, char *argv[])
+{
+	int c, ret, cargc;
+	/* one spare slot keeps the zeroed vector NULL-terminated */
+	char *name, *cargv[4];
+
+	if (type != TD_TYPE_VHD) {
+		fprintf(stderr, "Cannot coalesce %s image type\n",
+			td_disk_types[type]);
+		return EINVAL;
+	}
+
+	while ((c = getopt(argc, argv, "h")) != -1) {
+		switch (c) {
+		default:
+			/* getopt() returns '?'; the offending character is in optopt */
+			fprintf(stderr, "Unknown option %c\n", (char)optopt);
+			/* fall through */
+		case 'h':
+			goto usage;
+		}
+	}
+
+	if (optind != (argc - 1))
+		goto usage;
+
+	name = argv[optind++];
+
+	if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+		fprintf(stderr, "Device name too long\n");
+		return ENAMETOOLONG;
+	}
+
+	cargc = 0;
+	memset(cargv, 0, sizeof(cargv));
+	cargv[cargc++] = "coalesce";
+	cargv[cargc++] = "-n";
+	cargv[cargc++] = name;
+	ret = vhd_util_coalesce(cargc, cargv);
+	if (ret)
+		printf("coalesce failed: %d\n", ret);
+
+	return ret;
+
+ usage:
+	fprintf(stderr, "usage: td-util coalesce %s [-h help] "
+		"<FILENAME>\n", td_disk_types[type]);
+	return EINVAL;
+}
+
+/*
+ * td-util "query": print properties of a disk image on stdout.
+ *
+ * @type: index into td_disk_types (TD_TYPE_VHD or TD_TYPE_AIO).
+ * @argc, @argv: sub-command arguments:
+ *               [-h] [-v] [-p] [-f] [-d] <FILENAME>
+ *   -v  virtual size in MB
+ *   -p  parent image name
+ *   -f  VDI fields
+ *   -d  chain depth (acted on for VHD images only)
+ *
+ * Returns 0 on success (and after printing help for -h); otherwise the
+ * first error encountered.  Error values may be negative; the caller in
+ * main() normalises the sign.
+ */
+int
+td_query(int type, int argc, char *argv[])
+{
+	char *name;
+	int c, size = 0, parent = 0, fields = 0, depth = 0, err = 0;
+
+	while ((c = getopt(argc, argv, "hvpfd")) != -1) {
+		switch (c) {
+		case 'v':
+			size = 1;
+			break;
+		case 'p':
+			parent = 1;
+			break;
+		case 'f':
+			fields = 1;
+			break;
+		case 'd':
+			depth = 1;
+			break;
+		case 'h':
+			err = 0;
+			goto usage;
+		default:
+			err = EINVAL;
+			goto usage;
+		}
+	}
+
+	if (optind != (argc - 1)) {
+		err = EINVAL;
+		goto usage;
+	}
+
+	name = argv[optind++];
+
+	if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+		fprintf(stderr, "Device name too long\n");
+		return ENAMETOOLONG;
+	}
+
+	if (type == TD_TYPE_VHD) {
+		vhd_context_t vhd;
+
+		err = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+		if (err) {
+			printf("failed opening %s: %d\n", name, err);
+			return err;
+		}
+
+		if (size)
+			printf("%"PRIu64"\n", vhd.footer.curr_size >> 20);
+
+		if (parent) {
+			if (vhd.footer.type != HD_TYPE_DIFF)
+				printf("%s has no parent\n", name);
+			else {
+				char *pname;
+
+				err = vhd_parent_locator_get(&vhd, &pname);
+				if (err)
+					printf("failed getting parent: %d\n",
+					       err);
+				else {
+					printf("%s\n", pname);
+					free(pname);
+				}
+			}
+		}
+
+		if (fields) {
+			int ret, hidden;
+
+			ret = vhd_hidden(&vhd, &hidden);
+			if (ret) {
+				printf("failed checking 'hidden' field: %d\n",
+				       ret);
+				/* keep the first error seen */
+				if (!err)
+					err = ret;
+			} else
+				printf("%s: %d\n",
+				       td_vdi_fields[TD_FIELD_HIDDEN].name,
+				       hidden);
+		}
+
+		if (depth) {
+			int ret, length;
+
+			ret = vhd_chain_depth(&vhd, &length);
+			if (ret)
+				printf("error checking chain depth: %d\n", ret);
+			else
+				printf("chain depth: %d\n", length);
+
+			/* keep the first error seen */
+			if (!err)
+				err = ret;
+		}
+
+		vhd_close(&vhd);
+
+	} else if (type == TD_TYPE_AIO) {
+		if (size) {
+			int fd;
+			uint64_t secs;
+			uint32_t ssize;
+
+			fd = open(name, O_RDONLY | O_LARGEFILE);
+			if (fd == -1) {
+				/* capture errno first: printf() may overwrite it */
+				err = errno;
+				printf("failed opening %s: %d\n", name, err);
+				return -err;
+			}
+
+			err = tapdisk_get_image_size(fd, &secs, &ssize);
+			close(fd);
+
+			if (err) {
+				printf("failed getting size for %s: %d\n",
+				       name, err);
+				return err;
+			}
+
+			/* secs >> 11 converts a sector count to MB, assuming 512-byte sectors */
+			printf("%"PRIu64"\n", secs >> 11);
+		}
+
+		if (parent)
+			printf("%s has no parent\n", name);
+
+		if (fields) {
+			int i;
+
+			for (i = 0; i < TD_FIELD_INVALID; i++)
+				printf("%s: 0\n", td_vdi_fields[i].name);
+		}
+	}
+
+	return err;
+
+ usage:
+	fprintf(stderr, "usage: td-util query %s [-h help] [-v virtsize] "
+		"[-p parent] [-f fields] [-d depth] <FILENAME>\n",
+		td_disk_types[type]);
+	return err;
+}
+
+/*
+ * td-util "set": set a VDI field on a VHD image.
+ *
+ * @type: index into td_disk_types; only TD_TYPE_VHD is supported.
+ * @argc, @argv: sub-command arguments: [-h] <FILENAME> <FIELD> <VALUE>
+ *
+ * Only the "hidden" field is accepted.  The arguments are repackaged
+ * and handed to vhd_util_set_field().
+ *
+ * Returns the result of vhd_util_set_field(), or EINVAL for invalid
+ * arguments.
+ */
+int
+td_set_field(int type, int argc, char *argv[])
+{
+	struct vdi_field *field;
+	char *name, *value, *cargv[7];
+	int opt;
+
+	if (type != TD_TYPE_VHD) {
+		fprintf(stderr, "Cannot set fields of %s images\n",
+			td_disk_types[type]);
+		return EINVAL;
+	}
+
+	/* every option, -h included, ends in the usage text */
+	while ((opt = getopt(argc, argv, "h")) != -1) {
+		if (opt != 'h')
+			fprintf(stderr, "Unknown option %c\n", (char)opt);
+		goto usage;
+	}
+
+	if (optind != (argc - 3))
+		goto usage;
+
+	name = argv[optind];
+	optind++;
+
+	field = get_field(argv[optind]);
+	if (field == NULL || field->id != TD_FIELD_HIDDEN) {
+		fprintf(stderr, "Invalid field %s\n", argv[optind]);
+		goto usage;
+	}
+
+	optind++;
+	value = argv[optind];
+
+	cargv[0] = "set";
+	cargv[1] = "-n";
+	cargv[2] = name;
+	cargv[3] = "-f";
+	cargv[4] = field->name;
+	cargv[5] = "-v";
+	cargv[6] = value;
+	return vhd_util_set_field(7, cargv);
+
+ usage:
+	fprintf(stderr, "usage: td-util set %s [-h help] "
+		"<FILENAME> <FIELD> <VALUE>\n", td_disk_types[type]);
+	print_field_names();
+	return EINVAL;
+}
+
+/*
+ * td-util entry point: "td-util COMMAND [TYPE] [OPTIONS]".
+ *
+ * Resolves COMMAND (and TYPE when the command needs one), builds a
+ * private argument vector whose element 0 is the command name followed
+ * by the remaining options, and dispatches to the matching td_*()
+ * handler.
+ *
+ * Returns the handler's result with a negative value turned positive.
+ */
+int
+main(int argc, char *argv[])
+{
+ char **cargv;
+ struct command *cmd;
+ int cargc, i, type = -1, ret = 0;
+
+#ifdef CORE_DUMP
+ struct rlimit rlim;
+ rlim.rlim_cur = RLIM_INFINITY;
+ rlim.rlim_max = RLIM_INFINITY;
+ if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+ fprintf(stderr, "setrlimit failed: %d\n", errno);
+#endif
+
+ /* help() does not return: it ends in exit(-1) */
+ if (argc < 2)
+ help();
+
+ cargc = argc - 1;
+ cmd = get_command(argv[1]);
+ if (!cmd) {
+ fprintf(stderr, "invalid COMMAND %s\n", argv[1]);
+ help();
+ }
+
+ if (cmd->needs_type) {
+ if (argc < 3) {
+ fprintf(stderr, "td-util %s requires a TYPE\n",
+ cmd->name);
+ print_disk_types();
+ exit(-1);
+ }
+
+ type = get_driver_type(argv[2]);
+ if (type < 0) {
+ fprintf(stderr, "invalid TYPE '%s'.\n", argv[2]);
+ print_disk_types();
+ exit(-1);
+ }
+ /* TYPE is consumed here and is not forwarded to the handler */
+ --cargc;
+ }
+
+ cargv = malloc(sizeof(char *) * cargc);
+ if (!cargv)
+ exit(ENOMEM);
+
+ /*
+ * argc - cargc is the number of leading arguments dropped (1, or 2
+ * when a TYPE was given), so the handler sees the command name
+ * followed directly by its own options.
+ */
+ cargv[0] = cmd->name;
+ for (i = 1; i < cargc; i++)
+ cargv[i] = argv[i + (argc - cargc)];
+
+ switch(cmd->id) {
+ case TD_CMD_CREATE:
+ ret = td_create(type, cargc, cargv);
+ break;
+ case TD_CMD_SNAPSHOT:
+ ret = td_snapshot(type, cargc, cargv);
+ break;
+/*
+ case TD_CMD_COALESCE:
+ ret = td_coalesce(type, cargc, cargv);
+ break;
+*/
+ case TD_CMD_QUERY:
+ ret = td_query(type, cargc, cargv);
+ break;
+/*
+ case TD_CMD_RESIZE:
+ ret = td_resize(type, cargc, cargv);
+ break;
+*/
+ case TD_CMD_SET:
+ ret = td_set_field(type, cargc, cargv);
+ break;
+/*
+ case TD_CMD_REPAIR:
+ ret = td_repair(type, cargc, cargv);
+ break;
+ case TD_CMD_FILL:
+ ret = td_fill(type, cargc, cargv);
+ break;
+ case TD_CMD_READ:
+ ret = td_read(type, cargc, cargv);
+ break;
+*/
+ default:
+ case TD_CMD_INVALID:
+ ret = EINVAL;
+ break;
+ }
+
+ free(cargv);
+
+ /* handlers may return negative error codes; always exit with a positive one */
+ return (ret >= 0 ? ret : -ret);
+}
--- /dev/null
+#!/bin/bash
+
+usage () { echo "USAGE: xmsnap <VM ID> <Backing File>"; }
+
+#
+# Check Usage
+#
+if [ -n "$1" ]
+then
+ vmid=$1
+else
+ usage
+ exit 1
+fi
+
+if [ -n "$2" ]
+then
+ target=$2
+else
+ usage
+ exit 1
+fi
+
+if [ -e "$target" ]
+then
+ echo "Creating snapshot of file $target for VM $vmid."
+else
+ usage
+ echo "File $target not found."
+ exit 1
+fi
+
+#
+# Find the snapshot name
+#
+directory=`dirname "$target"`
+target=`basename "$target"`
+
+let maxidx=0
+if [ -e $directory/${target}.snap1 ]
+then
+ for idx in $(ls $directory/${target}.snap*)
+ do
+ let idx=${idx#$directory/${target}.snap}
+ if [ "$idx" -gt "$maxidx" ]
+ then
+ let maxidx=$idx
+ fi
+ done
+fi
+
+snap=${target}.snap`expr $maxidx + 1`
+
+#
+# Pause VM
+#
+xm pause $vmid
+if [ "$?" -ne "0" ]; then
+ exit 1
+fi
+
+
+#
+# Snap and reposition the files
+#
+mv $directory/$target $directory/$snap
+if [ "$?" -ne "0" ]; then
+ exit 1
+fi
+
+qcow-create 0 $directory/$target $directory/$snap
+
+#
+# Unpause
+#
+xm unpause $vmid
+
+exit
\ No newline at end of file
--- /dev/null
+XEN_ROOT := $(CURDIR)/../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+.PHONY: all
+all:
+
+.PHONY: install
+install:
+ $(INSTALL_DIR) -p $(DESTDIR)$(includedir)
+
+
+.PHONY: clean
+clean:
+ @:
+
+.PHONY: distclean
+distclean: clean
--- /dev/null
+/* $OpenBSD: atomicio.h,v 1.6 2005/05/24 17:32:43 avsm Exp $ */
+
+/*
+ * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Ensure all of data on socket comes through. f==read || f==vwrite
+ *
+ * Arguments, in order: the I/O function f, a file descriptor, the
+ * buffer and the byte count.
+ * NOTE(review): the return value is presumably the number of bytes
+ * transferred -- confirm in atomicio.c.
+ */
+size_t atomicio(ssize_t (*)(int, void *, size_t), int, void *, size_t);
+
+/*
+ * write() cast to the function-pointer type atomicio() expects, whose
+ * buffer parameter is a plain (non-const) void *.
+ */
+#define vwrite (ssize_t (*)(int, void *, size_t))write
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _BLKTAP_2_H_
+#define _BLKTAP_2_H_
+
+#define BLKTAP2_MAX_MESSAGE_LEN 256
+
+#define BLKTAP2_RING_MESSAGE_PAUSE 1
+#define BLKTAP2_RING_MESSAGE_RESUME 2
+#define BLKTAP2_RING_MESSAGE_CLOSE 3
+
+#define BLKTAP2_IOCTL_KICK_FE 1
+#define BLKTAP2_IOCTL_ALLOC_TAP 200
+#define BLKTAP2_IOCTL_FREE_TAP 201
+#define BLKTAP2_IOCTL_CREATE_DEVICE 202
+#define BLKTAP2_IOCTL_SET_PARAMS 203
+#define BLKTAP2_IOCTL_PAUSE 204
+#define BLKTAP2_IOCTL_REOPEN 205
+#define BLKTAP2_IOCTL_RESUME 206
+
+#define BLKTAP2_SYSFS_DIR "/sys/class/blktap2"
+#define BLKTAP2_CONTROL_NAME "blktap-control"
+#define BLKTAP2_CONTROL_DIR "/var/run/"BLKTAP2_CONTROL_NAME
+#define BLKTAP2_CONTROL_SOCKET "ctl"
+#define BLKTAP2_DIRECTORY "/dev/xen/blktap-2"
+#define BLKTAP2_CONTROL_DEVICE BLKTAP2_DIRECTORY"/control"
+#define BLKTAP2_RING_DEVICE BLKTAP2_DIRECTORY"/blktap"
+#define BLKTAP2_IO_DEVICE BLKTAP2_DIRECTORY"/tapdev"
+
+/*
+ * Three unsigned identifiers describing one tap instance.
+ * NOTE(review): presumably filled in through BLKTAP2_IOCTL_ALLOC_TAP,
+ * with 'ring' and 'device' being device numbers -- confirm against the
+ * kernel driver.
+ */
+struct blktap2_handle {
+ unsigned int ring;
+ unsigned int device;
+ unsigned int minor;
+};
+
+/*
+ * Device parameters: a name of at most BLKTAP2_MAX_MESSAGE_LEN bytes, a
+ * capacity and a sector size.
+ * NOTE(review): presumably passed with BLKTAP2_IOCTL_SET_PARAMS /
+ * BLKTAP2_IOCTL_CREATE_DEVICE, with capacity counted in sectors --
+ * confirm against the kernel driver.  'unsigned long' differs in width
+ * between 32- and 64-bit ABIs.
+ */
+struct blktap2_params {
+ char name[BLKTAP2_MAX_MESSAGE_LEN];
+ unsigned long long capacity;
+ unsigned long sector_size;
+};
+
+#endif
--- /dev/null
+/* blktaplib.h
+ *
+ * Blktap library userspace code.
+ *
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __BLKTAPLIB_H__
+#define __BLKTAPLIB_H__
+
+#include <syslog.h>
+#include <sys/time.h>
+#include <xenctrl.h>
+#include <xen/io/blkif.h>
+
+#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, XC_PAGE_SIZE)
+
+/* size of the extra VMA area to map in attached pages. */
+#define BLKTAP_VMA_PAGES BLK_RING_SIZE
+
+/* blktap IOCTLs: These must correspond with the blktap driver ioctls */
+#define BLKTAP_IOCTL_KICK_FE 1
+#define BLKTAP_IOCTL_KICK_BE 2
+#define BLKTAP_IOCTL_SETMODE 3
+#define BLKTAP_IOCTL_SENDPID 4
+#define BLKTAP_IOCTL_NEWINTF 5
+#define BLKTAP_IOCTL_MINOR 6
+#define BLKTAP_IOCTL_MAJOR 7
+#define BLKTAP_QUERY_ALLOC_REQS 8
+#define BLKTAP_IOCTL_FREEINTF 9
+#define BLKTAP_IOCTL_PRINT_IDXS 100
+#define BLKTAP_IOCTL_BACKDEV_SETUP 200
+
+#define PRIO_SPECIAL_IO -9999
+
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
+#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
+#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE 0x00000002
+
+#define BLKTAP_MODE_INTERPOSE \
+	(BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+/*
+ * Report whether @arg is an accepted switching mode.
+ *
+ * Returns 1 for BLKTAP_MODE_PASSTHROUGH, BLKTAP_MODE_INTERCEPT_FE and
+ * BLKTAP_MODE_INTERPOSE, and 0 for every other value.  Note that
+ * BLKTAP_MODE_INTERCEPT_BE on its own is not accepted.
+ */
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
+{
+	switch (arg) {
+	case BLKTAP_MODE_PASSTHROUGH:
+	case BLKTAP_MODE_INTERCEPT_FE:
+	case BLKTAP_MODE_INTERPOSE:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+#define MAX_REQUESTS BLK_RING_SIZE
+
+#define BLKTAP_IOCTL_KICK 1
+#define MAX_PENDING_REQS BLK_RING_SIZE
+#define BLKTAP_DEV_DIR "/dev/xen"
+#define BLKTAP_DEV_NAME "blktap"
+#define BACKDEV_NAME "backdev"
+#define BLKTAP_DEV_MINOR 0
+#define BLKTAP_CTRL_DIR "/var/run/tap"
+
+extern int blktap_major;
+
+#define BLKTAP_RING_PAGES 1 /* Front */
+#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + MMAP_PAGES)
+
+struct blkif;
+struct blkif_info;
+
+/*
+ * Bookkeeping for one block request: a copy of the ring request plus
+ * progress, status and retry information.  blkif_t keeps MAX_REQUESTS of
+ * these in its pending_list array.
+ * NOTE(review): the field meanings below are inferred from their names --
+ * confirm against the code that fills them in.
+ */
+typedef struct {
+ blkif_request_t req;
+ int submitting; /* presumably non-zero while being submitted */
+ int secs_pending; /* presumably sectors still outstanding */
+ int16_t status;
+ int num_retries;
+ struct timeval last_try; /* presumably time of the latest attempt */
+} pending_req_t;
+
+/*
+ * Per-interface state of one block backend instance.
+ *
+ * `state` is one of DISCONNECTED, DISCONNECTING or CONNECTED.
+ * `pending_list` holds one pending_req_t per ring slot (MAX_REQUESTS is
+ * defined as BLK_RING_SIZE).
+ * NOTE(review): `hash_next` presumably chains interfaces that share a hash
+ * bucket, and `fds` presumably holds a pair of control descriptors -- confirm
+ * against the users of this struct.
+ */
+typedef struct blkif {
+ domid_t domid;
+ long int handle;
+
+ long int pdev;
+ long int readonly;
+
+ enum { DISCONNECTED, DISCONNECTING, CONNECTED } state;
+
+ struct blkif_ops *ops;
+ struct blkif *hash_next;
+
+ void *prv; /* device-specific data */
+ struct blkif_info *info; /*Image parameter passing */
+ pending_req_t pending_list[MAX_REQUESTS];
+ int devnum;
+ int fds[2];
+ int be_id;
+ char *backend_path;
+ int major;
+ int minor;
+ pid_t tappid;
+ int drivertype;
+ uint16_t cookie;
+ int err;
+} blkif_t;
+
+/*
+ * Image parameters attached to a blkif_t through its `info` pointer.
+ * NOTE(review): `storage` presumably takes a TAPDISK_STORAGE_TYPE_* value and
+ * `params` presumably is the image path/parameter string -- confirm.
+ */
+typedef struct blkif_info {
+ char *params;
+ int readonly;
+ int storage;
+} blkif_info_t;
+
+/*
+ * State of one opened tap device: its descriptor, the shared ring and the
+ * owning interface.
+ * NOTE(review): `mem` presumably is the base of the mmap()ed region and
+ * `vstart` the start of its data pages (the _vstart argument of MMAP_VADDR)
+ * -- confirm against the code that maps the device.
+ */
+typedef struct tapdev_info {
+ int fd;
+ char *mem;
+ blkif_sring_t *sring;
+ blkif_back_ring_t fe_ring;
+ unsigned long vstart;
+ blkif_t *blkif;
+} tapdev_info_t;
+
+/*
+ * Pair of 16-bit identifiers: a domain id and a bus id.
+ * NOTE(review): presumably passed to the driver with BLKTAP_IOCTL_NEWINTF --
+ * confirm.
+ */
+typedef struct domid_translate {
+ unsigned short domid;
+ unsigned short busid;
+} domid_translate_t ;
+
+/*
+ * Geometry of a disk image: its size, its sector size and an info word.
+ * NOTE(review): the unit of `size` (sectors or bytes) and the meaning of the
+ * `info` bits are not visible here -- confirm.
+ */
+typedef struct image {
+ unsigned long long size;
+ unsigned long secsize;
+ unsigned int info;
+} image_t;
+
+/*
+ * Header and payload layouts of the control messages.
+ * NOTE(review): `type` presumably carries one of the CTLMSG_* codes defined
+ * in this header and `len` the total message length -- confirm.
+ */
+typedef struct msg_hdr {
+ uint16_t type;
+ uint16_t len;
+ uint16_t drivertype;
+ uint16_t cookie;
+} msg_hdr_t;
+
+/*
+ * Image parameters.  The path is described by an offset/length pair rather
+ * than stored inline.
+ * NOTE(review): the base the offset is relative to is not visible here.
+ */
+typedef struct msg_params {
+ uint8_t readonly;
+ int path_off;
+ int path_len;
+ int storage;
+} msg_params_t;
+
+/* New-device payload: device number and domain id. */
+typedef struct msg_newdev {
+ uint8_t devnum;
+ uint16_t domid;
+} msg_newdev_t;
+
+/* Process-id payload. */
+typedef struct msg_pid {
+ pid_t pid;
+} msg_pid_t;
+
+/* Checkpoint payload: a uuid given as an offset/length pair, plus a driver type. */
+typedef struct msg_cp {
+ int cp_uuid_off;
+ int cp_uuid_len;
+ int cp_drivertype;
+} msg_cp_t;
+
+/* Lock payload: two flags and a uuid given as an offset/length pair. */
+typedef struct msg_lock {
+ int ro;
+ int enforce;
+ int uuid_off;
+ int uuid_len;
+} msg_lock_t;
+
+#define READ 0
+#define WRITE 1
+
+/*Control Messages between manager and tapdev*/
+#define CTLMSG_PARAMS 1
+#define CTLMSG_IMG 2
+#define CTLMSG_IMG_FAIL 3
+#define CTLMSG_NEWDEV 4
+#define CTLMSG_NEWDEV_RSP 5
+#define CTLMSG_NEWDEV_FAIL 6
+#define CTLMSG_CLOSE 7
+#define CTLMSG_CLOSE_RSP 8
+#define CTLMSG_PID 9
+#define CTLMSG_PID_RSP 10
+#define CTLMSG_CHECKPOINT 11
+#define CTLMSG_CHECKPOINT_RSP 12
+#define CTLMSG_LOCK 13
+#define CTLMSG_LOCK_RSP 14
+#define CTLMSG_PAUSE 15
+#define CTLMSG_PAUSE_RSP 16
+#define CTLMSG_RESUME 17
+#define CTLMSG_RESUME_RSP 18
+
+#define TAPDISK_STORAGE_TYPE_NFS 1
+#define TAPDISK_STORAGE_TYPE_EXT 2
+#define TAPDISK_STORAGE_TYPE_LVM 3
+#define TAPDISK_STORAGE_TYPE_DEFAULT TAPDISK_STORAGE_TYPE_EXT
+
+/* Arbitrary values, must match the underlying driver... */
+#define MAX_TAP_DEV 256
+
+/* Accessing attached data page mappings */
+/* Number of data pages: one per segment of every pending request. */
+#define MMAP_PAGES \
+ (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+/*
+ * Address of segment _seg of request _req in the mapping starting at _vstart.
+ * Each request owns BLKIF_MAX_SEGMENTS_PER_REQUEST consecutive pages.
+ * getpagesize() is called at every expansion, so the including file needs
+ * its declaration in scope (this header does not include <unistd.h> itself).
+ */
+#define MMAP_VADDR(_vstart,_req,_seg) \
+ ((_vstart) + \
+ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * getpagesize()) + \
+ ((_seg) * getpagesize()))
+
+/* Defines that are only used by library clients */
+
+#ifndef __COMPILING_BLKTAP_LIB
+
+/*
+ * Printable names of block operations, indexed by BLKIF_OP_* code.
+ * The array is `static` in a header, so every file that includes it without
+ * defining __COMPILING_BLKTAP_LIB gets its own copy.  Indices that have no
+ * initializer here are NULL.
+ */
+static char *blkif_op_name[] = {
+ [BLKIF_OP_READ] = "READ",
+ [BLKIF_OP_WRITE] = "WRITE",
+};
+
+#endif /* __COMPILING_BLKTAP_LIB */
+
+#endif /* __BLKTAPLIB_H__ */
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _VHD_JOURNAL_H_
+#define _VHD_JOURNAL_H_
+
+#include <inttypes.h>
+
+#include "libvhd.h"
+
+#define VHD_JOURNAL_METADATA 0x01
+#define VHD_JOURNAL_DATA 0x02
+
+#define VHD_JOURNAL_HEADER_COOKIE "vjournal"
+#define VHD_JOURNAL_ENTRY_COOKIE 0xaaaa12344321aaaa
+
+/*
+ * Header of a VHD journal file.
+ *
+ * `cookie` is 8 bytes, the length of VHD_JOURNAL_HEADER_COOKIE ("vjournal")
+ * without a terminating NUL.  The remaining fields record entry counts and
+ * file offsets for the data and metadata sections.
+ * NOTE(review): with a 16-byte vhd_uuid_t the fields add up to 512 bytes, so
+ * `pad` presumably rounds the header up to one sector -- confirm.
+ */
+typedef struct vhd_journal_header {
+ char cookie[8];
+ vhd_uuid_t uuid;
+ uint64_t vhd_footer_offset;
+ uint32_t journal_data_entries;
+ uint32_t journal_metadata_entries;
+ uint64_t journal_data_offset;
+ uint64_t journal_metadata_offset;
+ uint64_t journal_eof;
+ char pad[448];
+} vhd_journal_header_t;
+
+/*
+ * In-memory handle for a journal: the journal file itself (name, descriptor,
+ * header) together with the context of the VHD it belongs to.  This is the
+ * object every vhd_journal_*() function in this header takes.
+ */
+typedef struct vhd_journal {
+ char *jname; /* NOTE(review): presumably the journal file name */
+ int jfd;
+ int is_block; /* is jfd a block device */
+ vhd_journal_header_t header;
+ vhd_context_t vhd;
+} vhd_journal_t;
+
+int vhd_journal_create(vhd_journal_t *, const char *file, const char *jfile);
+int vhd_journal_open(vhd_journal_t *, const char *file, const char *jfile);
+int vhd_journal_add_block(vhd_journal_t *, uint32_t block, char mode);
+int vhd_journal_commit(vhd_journal_t *);
+int vhd_journal_revert(vhd_journal_t *);
+int vhd_journal_close(vhd_journal_t *);
+int vhd_journal_remove(vhd_journal_t *);
+
+#endif
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _VHD_LIB_H_
+#define _VHD_LIB_H_
+
+#include <string.h>
+#if defined(__linux__)
+#include <endian.h>
+#include <byteswap.h>
+#elif defined(__NetBSD__)
+#include <sys/endian.h>
+#include <sys/bswap.h>
+#endif
+
+#include "vhd-uuid.h"
+#include "vhd.h"
+
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+/*
+ * In-place conversion between big-endian (BE) fields and host byte order.
+ *
+ * Every macro takes a POINTER to the field and rewrites the field through
+ * it.  The _IN and _OUT variants perform the same operation; the two names
+ * only document the direction at the call site.  On little-endian hosts the
+ * operation is a byte swap, on big-endian hosts the macros expand to
+ * nothing.
+ *
+ * The big-endian branch used to define BE32_OUT twice and never defined
+ * BE16_OUT, so any use of BE16_OUT failed to build there.
+ *
+ * NOTE(review): if BYTE_ORDER is not defined by the included headers, the
+ * test below compares 0 == 0 and silently selects the little-endian branch.
+ */
+#if BYTE_ORDER == LITTLE_ENDIAN
+#if defined(__linux__)
+ #define BE16_IN(foo) ((*(foo)) = bswap_16(*(foo)))
+ #define BE32_IN(foo) ((*(foo)) = bswap_32(*(foo)))
+ #define BE64_IN(foo) ((*(foo)) = bswap_64(*(foo)))
+ #define BE16_OUT(foo) ((*(foo)) = bswap_16(*(foo)))
+ #define BE32_OUT(foo) ((*(foo)) = bswap_32(*(foo)))
+ #define BE64_OUT(foo) ((*(foo)) = bswap_64(*(foo)))
+#elif defined(__NetBSD__)
+ #define BE16_IN(foo) ((*(foo)) = bswap16(*(foo)))
+ #define BE32_IN(foo) ((*(foo)) = bswap32(*(foo)))
+ #define BE64_IN(foo) ((*(foo)) = bswap64(*(foo)))
+ #define BE16_OUT(foo) ((*(foo)) = bswap16(*(foo)))
+ #define BE32_OUT(foo) ((*(foo)) = bswap32(*(foo)))
+ #define BE64_OUT(foo) ((*(foo)) = bswap64(*(foo)))
+#endif
+#else
+ #define BE16_IN(foo)
+ #define BE32_IN(foo)
+ #define BE64_IN(foo)
+ #define BE16_OUT(foo)
+ #define BE32_OUT(foo)
+ #define BE64_OUT(foo)
+#endif
+
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+#define VHD_MAX_NAME_LEN 1024
+
+#define VHD_BLOCK_SHIFT 21
+#define VHD_BLOCK_SIZE (1ULL << VHD_BLOCK_SHIFT)
+
+#define UTF_16 "UTF-16"
+#define UTF_16LE "UTF-16LE"
+#define UTF_16BE "UTF-16BE"
+
+#define VHD_OPEN_RDONLY 0x00001
+#define VHD_OPEN_RDWR 0x00002
+#define VHD_OPEN_FAST 0x00004
+#define VHD_OPEN_STRICT 0x00008
+#define VHD_OPEN_IGNORE_DISABLED 0x00010
+
+#define VHD_FLAG_CREAT_PARENT_RAW 0x00001
+
+#define vhd_flag_set(word, flag) ((word) |= (flag))
+#define vhd_flag_clear(word, flag) ((word) &= ~(flag))
+#define vhd_flag_test(word, flag) ((word) & (flag))
+
+
+#define ENABLE_FAILURE_TESTING
+#define FAIL_REPARENT_BEGIN 0
+#define FAIL_REPARENT_LOCATOR 1
+#define FAIL_REPARENT_END 2
+#define FAIL_RESIZE_BEGIN 3
+#define FAIL_RESIZE_DATA_MOVED 4
+#define FAIL_RESIZE_METADATA_MOVED 5
+#define FAIL_RESIZE_END 6
+#define NUM_FAIL_TESTS 7
+
+#ifdef ENABLE_FAILURE_TESTING
+/*
+ * Failure injection: when TEST_FAIL[point] is non-zero, print the matching
+ * ENV_VAR_FAIL[point] name and terminate the process with exit(EINVAL).
+ * `point` is one of the FAIL_* indices defined above.
+ * The including file must provide printf, exit and EINVAL, and must expand
+ * TEST_FAIL_EXTERN_VARS (or otherwise declare the two arrays).
+ * NOTE(review): the expansion is a bare `if (...) { ... }`, so using it as
+ * the body of an `if` that has an `else` changes which `if` the `else`
+ * binds to.  Wrapping it in do { } while (0) would fix that but requires
+ * every call site to end with a semicolon -- check callers first.
+ */
+#define TEST_FAIL_AT(point) \
+ if (TEST_FAIL[point]) { \
+ printf("Failing at %s\n", ENV_VAR_FAIL[point]); exit(EINVAL); }
+/* Declarations of the two arrays TEST_FAIL_AT reads. */
+#define TEST_FAIL_EXTERN_VARS \
+ extern const char* ENV_VAR_FAIL[]; \
+ extern int TEST_FAIL[];
+#else
+/* Failure testing disabled: both macros expand to nothing. */
+#define TEST_FAIL_AT(point)
+#define TEST_FAIL_EXTERN_VARS
+#endif // ENABLE_FAILURE_TESTING
+
+
+static const char VHD_POISON_COOKIE[] = "v_poison";
+
+typedef struct hd_ftr vhd_footer_t;
+typedef struct dd_hdr vhd_header_t;
+typedef struct vhd_bat vhd_bat_t;
+typedef struct vhd_batmap vhd_batmap_t;
+typedef struct dd_batmap_hdr vhd_batmap_header_t;
+typedef struct prt_loc vhd_parent_locator_t;
+typedef struct vhd_context vhd_context_t;
+typedef uint32_t vhd_flag_creat_t;
+
+/*
+ * In-memory block allocation table: `bat` points to an array of 32-bit
+ * entries.
+ * NOTE(review): `entries` presumably is the number of elements in `bat` and
+ * `spb` the number of sectors per block -- confirm against vhd_read_bat().
+ */
+struct vhd_bat {
+ uint32_t spb;
+ uint32_t entries;
+ uint32_t *bat;
+};
+
+/*
+ * In-memory batmap: its header followed by a pointer to the map bytes.
+ * NOTE(review): the size of `map` is not recorded here; presumably it is
+ * derived from the header -- confirm.
+ */
+struct vhd_batmap {
+ vhd_batmap_header_t header;
+ char *map;
+};
+
+/*
+ * Handle for one VHD file: the file itself plus the header, footer, BAT and
+ * batmap held in memory.  This is the object taken by most vhd_*() functions
+ * declared in this header.
+ * NOTE(review): `oflags` presumably holds the VHD_OPEN_* flags given to
+ * vhd_open(), `spb` the sectors per block and `bm_secs` the sectors per
+ * block bitmap -- confirm against vhd_open().
+ */
+struct vhd_context {
+ int fd;
+ char *file;
+ int oflags;
+ int is_block;
+
+ uint32_t spb;
+ uint32_t bm_secs;
+
+ vhd_header_t header;
+ vhd_footer_t footer;
+ vhd_bat_t bat;
+ vhd_batmap_t batmap;
+};
+
+/*
+ * Convert a byte count to a sector count, rounding a partial sector up
+ * (this assumes VHD_SECTOR_SIZE == 1 << VHD_SECTOR_SHIFT).
+ * The 64-bit intermediate is narrowed to 32 bits on return.
+ */
+static inline uint32_t
+secs_round_up(uint64_t bytes)
+{
+	const uint64_t padded = bytes + (VHD_SECTOR_SIZE - 1);
+
+	return padded >> VHD_SECTOR_SHIFT;
+}
+
+/*
+ * Same as secs_round_up(), except that a result of zero sectors is
+ * reported as one.
+ */
+static inline uint32_t
+secs_round_up_no_zero(uint64_t bytes)
+{
+	uint32_t secs = secs_round_up(bytes);
+
+	if (secs == 0)
+		return 1;
+
+	return secs;
+}
+
+/* Convert a sector count to bytes by shifting left by VHD_SECTOR_SHIFT. */
+static inline uint64_t
+vhd_sectors_to_bytes(uint64_t sectors)
+{
+	uint64_t bytes = sectors;
+
+	bytes <<= VHD_SECTOR_SHIFT;
+	return bytes;
+}
+
+/*
+ * Round a byte count up to a whole number of sectors, with a minimum of
+ * one sector, and return the padded size in bytes.
+ */
+static inline uint64_t
+vhd_bytes_padded(uint64_t bytes)
+{
+	uint32_t secs = secs_round_up_no_zero(bytes);
+
+	return vhd_sectors_to_bytes(secs);
+}
+
+/*
+ * Return 1 when the footer type of @ctx is HD_TYPE_DYNAMIC or HD_TYPE_DIFF,
+ * 0 otherwise.
+ */
+static inline int
+vhd_type_dynamic(vhd_context_t *ctx)
+{
+	if (ctx->footer.type == HD_TYPE_DYNAMIC)
+		return 1;
+
+	return ctx->footer.type == HD_TYPE_DIFF;
+}
+
+/*
+ * Return 1 when the creator-application field of the footer starts with
+ * "tap", 0 otherwise.
+ */
+static inline int
+vhd_creator_tapdisk(vhd_context_t *ctx)
+{
+	static const char prefix[] = "tap";
+
+	return strncmp(ctx->footer.crtr_app, prefix, sizeof(prefix) - 1) == 0;
+}
+
+/*
+ * Return 1 when the footer cookie of @ctx equals VHD_POISON_COOKIE over the
+ * full width of the cookie field, 0 otherwise.
+ */
+static inline int
+vhd_disabled(vhd_context_t *ctx)
+{
+	int differs = memcmp(ctx->footer.cookie, VHD_POISON_COOKIE,
+			     sizeof(ctx->footer.cookie));
+
+	return differs == 0;
+}
+
+/*
+ * Size in bytes of the data area described by a parent locator.
+ *
+ * MICROSOFT_COMPAT: data_space *should* be a sector count, but some files
+ * store a byte count instead.  Values below 512 are treated as sectors and
+ * converted to bytes; larger values are taken as bytes and must be a
+ * multiple of 512, otherwise 0 is returned.
+ */
+static inline size_t
+vhd_parent_locator_size(vhd_parent_locator_t *loc)
+{
+	if (loc->data_space < 512)
+		return vhd_sectors_to_bytes(loc->data_space);
+
+	if (loc->data_space % 512 != 0)
+		return 0;
+
+	return loc->data_space;
+}
+
+/*
+ * Report whether the parent uuid stored in the header of @ctx is the nil
+ * uuid; the result of vhd_uuid_is_nil() is passed through unchanged.
+ */
+static inline int
+vhd_parent_raw(vhd_context_t *ctx)
+{
+	int nil = vhd_uuid_is_nil(&ctx->header.prt_uuid);
+
+	return nil;
+}
+
+void libvhd_set_log_level(int);
+
+int vhd_test_file_fixed(const char *, int *);
+
+uint32_t vhd_time(time_t time);
+size_t vhd_time_to_string(uint32_t timestamp, char *target);
+uint32_t vhd_chs(uint64_t size);
+
+uint32_t vhd_checksum_footer(vhd_footer_t *);
+uint32_t vhd_checksum_header(vhd_header_t *);
+uint32_t vhd_checksum_batmap(vhd_batmap_t *);
+
+void vhd_footer_in(vhd_footer_t *);
+void vhd_footer_out(vhd_footer_t *);
+void vhd_header_in(vhd_header_t *);
+void vhd_header_out(vhd_header_t *);
+void vhd_bat_in(vhd_bat_t *);
+void vhd_bat_out(vhd_bat_t *);
+void vhd_batmap_header_in(vhd_batmap_t *);
+void vhd_batmap_header_out(vhd_batmap_t *);
+
+int vhd_validate_footer(vhd_footer_t *footer);
+int vhd_validate_header(vhd_header_t *header);
+int vhd_validate_batmap_header(vhd_batmap_t *batmap);
+int vhd_validate_batmap(vhd_batmap_t *batmap);
+int vhd_validate_platform_code(uint32_t code);
+
+int vhd_open(vhd_context_t *, const char *file, int flags);
+void vhd_close(vhd_context_t *);
+int vhd_create(const char *name, uint64_t bytes, int type, vhd_flag_creat_t);
+/* vhd_snapshot: the bytes parameter is optional and can be 0 if the snapshot
+ * is to have the same size as the (first non-empty) parent */
+int vhd_snapshot(const char *snapshot, uint64_t bytes, const char *parent,
+ vhd_flag_creat_t);
+
+int vhd_hidden(vhd_context_t *, int *);
+int vhd_chain_depth(vhd_context_t *, int *);
+
+off_t vhd_position(vhd_context_t *);
+int vhd_seek(vhd_context_t *, off_t, int);
+int vhd_read(vhd_context_t *, void *, size_t);
+int vhd_write(vhd_context_t *, void *, size_t);
+
+int vhd_offset(vhd_context_t *, uint32_t, uint32_t *);
+
+int vhd_end_of_headers(vhd_context_t *ctx, off_t *off);
+int vhd_end_of_data(vhd_context_t *ctx, off_t *off);
+int vhd_batmap_header_offset(vhd_context_t *ctx, off_t *off);
+
+int vhd_get_header(vhd_context_t *);
+int vhd_get_footer(vhd_context_t *);
+int vhd_get_bat(vhd_context_t *);
+int vhd_get_batmap(vhd_context_t *);
+
+void vhd_put_header(vhd_context_t *);
+void vhd_put_footer(vhd_context_t *);
+void vhd_put_bat(vhd_context_t *);
+void vhd_put_batmap(vhd_context_t *);
+
+int vhd_has_batmap(vhd_context_t *);
+int vhd_batmap_test(vhd_context_t *, vhd_batmap_t *, uint32_t);
+void vhd_batmap_set(vhd_context_t *, vhd_batmap_t *, uint32_t);
+void vhd_batmap_clear(vhd_context_t *, vhd_batmap_t *, uint32_t);
+
+int vhd_get_phys_size(vhd_context_t *, off_t *);
+int vhd_set_phys_size(vhd_context_t *, off_t);
+
+int vhd_bitmap_test(vhd_context_t *, char *, uint32_t);
+void vhd_bitmap_set(vhd_context_t *, char *, uint32_t);
+void vhd_bitmap_clear(vhd_context_t *, char *, uint32_t);
+
+int vhd_parent_locator_count(vhd_context_t *);
+int vhd_parent_locator_get(vhd_context_t *, char **);
+int vhd_parent_locator_read(vhd_context_t *, vhd_parent_locator_t *, char **);
+int vhd_find_parent(vhd_context_t *, const char *, char **);
+int vhd_parent_locator_write_at(vhd_context_t *, const char *,
+ off_t, uint32_t, size_t,
+ vhd_parent_locator_t *);
+
+int vhd_header_decode_parent(vhd_context_t *, vhd_header_t *, char **);
+int vhd_change_parent(vhd_context_t *, char *parent_path, int raw);
+
+int vhd_read_footer(vhd_context_t *, vhd_footer_t *);
+int vhd_read_footer_at(vhd_context_t *, vhd_footer_t *, off_t);
+int vhd_read_footer_strict(vhd_context_t *, vhd_footer_t *);
+int vhd_read_header(vhd_context_t *, vhd_header_t *);
+int vhd_read_header_at(vhd_context_t *, vhd_header_t *, off_t);
+int vhd_read_bat(vhd_context_t *, vhd_bat_t *);
+int vhd_read_batmap(vhd_context_t *, vhd_batmap_t *);
+int vhd_read_bitmap(vhd_context_t *, uint32_t block, char **bufp);
+int vhd_read_block(vhd_context_t *, uint32_t block, char **bufp);
+
+int vhd_write_footer(vhd_context_t *, vhd_footer_t *);
+int vhd_write_footer_at(vhd_context_t *, vhd_footer_t *, off_t);
+int vhd_write_header(vhd_context_t *, vhd_header_t *);
+int vhd_write_header_at(vhd_context_t *, vhd_header_t *, off_t);
+int vhd_write_bat(vhd_context_t *, vhd_bat_t *);
+int vhd_write_batmap(vhd_context_t *, vhd_batmap_t *);
+int vhd_write_bitmap(vhd_context_t *, uint32_t block, char *bitmap);
+int vhd_write_block(vhd_context_t *, uint32_t block, char *data);
+
+int vhd_io_read(vhd_context_t *, char *, uint64_t, uint32_t);
+int vhd_io_write(vhd_context_t *, char *, uint64_t, uint32_t);
+
+#endif
--- /dev/null
+/*
+ * list.h
+ *
+ * This is a subset of linux's list.h intended to be used in user-space.
+ * XXX The namespace conflicts with NetBSD's <sys/queue.h>
+ *
+ */
+
+#ifndef __LIST_H__
+#define __LIST_H__
+
+/* Values list_del() stores in the links of an entry it has removed. */
+#define LIST_POISON1 ((void *) 0x00100100)
+#define LIST_POISON2 ((void *) 0x00200200)
+
+/*
+ * Link pair embedded in every list element and also used as the list head.
+ * An empty list is a head whose next and prev both point at itself.
+ */
+struct list_head {
+	struct list_head *next, *prev;
+};
+
+/* XXX workaround for conflicts. The list API should use its own
+ * namespace prefix, i.e. BLK_
+ *
+ * Drop any definition of these two names made by an earlier header so that
+ * the definitions below take effect without a redefinition diagnostic.
+ * (The LIST_HEAD test used to be #ifndef, which never removed anything.)
+ */
+#ifdef LIST_HEAD_INIT
+#undef LIST_HEAD_INIT
+#endif
+#ifdef LIST_HEAD
+#undef LIST_HEAD
+#endif
+
+/* Initializer for an empty list whose head variable is `name`. */
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+/* Define a list head variable `name`, initialized to the empty list. */
+#define LIST_HEAD(name) \
+	struct list_head name = LIST_HEAD_INIT(name)
+
+/* Make @list an empty list by pointing both links at itself. */
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+	list->next = list;
+	list->prev = list;
+}
+
+/* Link @new between the two adjacent entries @prev and @next. */
+static inline void __list_add(struct list_head *new,
+			      struct list_head *prev,
+			      struct list_head *next)
+{
+	next->prev = new;
+	new->next = next;
+	new->prev = prev;
+	prev->next = new;
+}
+
+/* Insert @new right after @head, i.e. at the front of the list. */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head, head->next);
+}
+
+/* Insert @new right before @head, i.e. at the end of the list. */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head->prev, head);
+}
+
+/* Unlink whatever sits between @prev and @next by joining the two. */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+	next->prev = prev;
+	prev->next = next;
+}
+
+/*
+ * Remove @entry from its list and overwrite its links with the poison
+ * values, so the entry cannot be mistaken for a list member afterwards.
+ */
+static inline void list_del(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	entry->next = LIST_POISON1;
+	entry->prev = LIST_POISON2;
+}
+
+/* Remove @entry from its list and leave it as an empty list of its own. */
+static inline void list_del_init(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	INIT_LIST_HEAD(entry);
+}
+
+/* Non-zero when the list headed by @head has no entries. */
+static inline int list_empty(const struct list_head *head)
+{
+	return head->next == head;
+}
+
+/* Non-zero when @list is the last entry of the list headed by @head. */
+static inline int list_is_last(const struct list_head *list,
+			       const struct list_head *head)
+{
+	return list->next == head;
+}
+
+/*
+ * Link every entry of the non-empty list @list between @prev and @next.
+ * The links of @list itself are not written.
+ */
+static inline void __list_splice(const struct list_head *list,
+				 struct list_head *prev,
+				 struct list_head *next)
+{
+	struct list_head *first = list->next;
+	struct list_head *last = list->prev;
+
+	first->prev = prev;
+	prev->next = first;
+
+	last->next = next;
+	next->prev = last;
+}
+
+/*
+ * Move the entries of @list to the front of @head.  Nothing happens when
+ * @list is empty.  @list is not reinitialized: its links still point at
+ * the moved entries.
+ */
+static inline void list_splice(const struct list_head *list,
+			       struct list_head *head)
+{
+	if (!list_empty(list))
+		__list_splice(list, head, head->next);
+}
+
+/*
+ * Convert a pointer to the embedded list_head `member` into a pointer to
+ * the enclosing object of type `type`.
+ */
+#define list_entry(ptr, type, member) \
+	((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+/*
+ * Visit every element of the list at `head`; `pos` is the element pointer
+ * used as the cursor.  The body must not remove `pos` from the list.
+ * __typeof__ is used instead of typeof so the macro also works when the
+ * compiler is in a strict ISO mode.
+ */
+#define list_for_each_entry(pos, head, member) \
+	for (pos = list_entry((head)->next, __typeof__(*pos), member); \
+	     &pos->member != (head); \
+	     pos = list_entry(pos->member.next, __typeof__(*pos), member))
+
+/*
+ * Like list_for_each_entry(), but the successor is saved in `n` before the
+ * body runs, so the body may remove `pos` from the list.
+ */
+#define list_for_each_entry_safe(pos, n, head, member) \
+	for (pos = list_entry((head)->next, __typeof__(*pos), member), \
+	     n = list_entry(pos->member.next, __typeof__(*pos), member); \
+	     &pos->member != (head); \
+	     pos = n, n = list_entry(n->member.next, __typeof__(*n), member))
+
+#endif /* __LIST_H__ */
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LVM_UTIL_H_
+#define _LVM_UTIL_H_
+
+#include <inttypes.h>
+
+#define MAX_NAME_SIZE 256
+
+#define LVM_SEG_TYPE_LINEAR 1
+#define LVM_SEG_TYPE_UNKNOWN 2
+
+/*
+ * One segment of a logical volume.
+ * NOTE(review): `type` presumably holds one of the LVM_SEG_TYPE_* values
+ * defined above, and `pe_start`/`pe_size` presumably are counted in physical
+ * extents -- confirm against lvm_scan_vg().
+ */
+struct lv_segment {
+ uint8_t type;
+ char device[MAX_NAME_SIZE];
+ uint64_t pe_start;
+ uint64_t pe_size;
+};
+
+/*
+ * A logical volume: name, size and segment count.  Only the first segment
+ * is stored, inline, in `first_segment`.
+ * NOTE(review): the unit of `size` is not visible here -- confirm.
+ */
+struct lv {
+ char name[MAX_NAME_SIZE];
+ uint64_t size;
+ uint32_t segments;
+ struct lv_segment first_segment;
+};
+
+/*
+ * A physical volume: its name and a start value.
+ * NOTE(review): `start` presumably is the offset of the first extent on the
+ * device; its unit is not visible here -- confirm.
+ */
+struct pv {
+ char name[MAX_NAME_SIZE];
+ uint64_t start;
+};
+
+/*
+ * A volume group: name, extent size, and two counted arrays holding its
+ * physical volumes (`pvs`, `pv_cnt` entries) and logical volumes (`lvs`,
+ * `lv_cnt` entries).
+ * NOTE(review): presumably filled in by lvm_scan_vg() and released with
+ * lvm_free_vg(), both declared below -- confirm ownership of the arrays.
+ */
+struct vg {
+ char name[MAX_NAME_SIZE];
+ uint64_t extent_size;
+
+ int pv_cnt;
+ struct pv *pvs;
+
+ int lv_cnt;
+ struct lv *lvs;
+};
+
+int lvm_scan_vg(const char *vg_name, struct vg *vg);
+void lvm_free_vg(struct vg *vg);
+
+#endif
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _RELATIVE_PATH_H_
+#define _RELATIVE_PATH_H_
+
+#include <syslog.h>
+
+#define DELIMITER '/'
+#define MAX_NAME_LEN 1000
+
+#define EPRINTF(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a)
+
+/*
+ * returns a relative path from @src to @dest
+ * result should be freed
+ */
+char *relative_path_to(char *src, char *dest, int *err);
+
+#endif
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_MESSAGE_H_
+#define _TAPDISK_MESSAGE_H_
+
+#include <inttypes.h>
+#include <sys/types.h>
+
+#define TAPDISK_MESSAGE_MAX_PATH_LENGTH 256
+#define TAPDISK_MESSAGE_STRING_LENGTH 256
+
+#define TAPDISK_MESSAGE_MAX_MINORS \
+ ((TAPDISK_MESSAGE_MAX_PATH_LENGTH / sizeof(int)) - 1)
+
+#define TAPDISK_MESSAGE_FLAG_SHARED 0x01
+#define TAPDISK_MESSAGE_FLAG_RDONLY 0x02
+#define TAPDISK_MESSAGE_FLAG_ADD_CACHE 0x04
+#define TAPDISK_MESSAGE_FLAG_VHD_INDEX 0x08
+#define TAPDISK_MESSAGE_FLAG_LOG_DIRTY 0x10
+
+typedef struct tapdisk_message tapdisk_message_t;
+typedef uint8_t tapdisk_message_flag_t;
+typedef struct tapdisk_message_image tapdisk_message_image_t;
+typedef struct tapdisk_message_params tapdisk_message_params_t;
+typedef struct tapdisk_message_string tapdisk_message_string_t;
+typedef struct tapdisk_message_response tapdisk_message_response_t;
+typedef struct tapdisk_message_minors tapdisk_message_minors_t;
+typedef struct tapdisk_message_list tapdisk_message_list_t;
+
+/*
+ * Parameters payload: flags, storage type, device number, domain id and a
+ * path stored inline in a TAPDISK_MESSAGE_MAX_PATH_LENGTH byte buffer.
+ * NOTE(review): `flags` presumably is a mask of TAPDISK_MESSAGE_FLAG_*
+ * bits, and `path_len` presumably is the used length of `path`; whether it
+ * counts a terminating NUL is not visible here -- confirm.
+ */
+struct tapdisk_message_params {
+ tapdisk_message_flag_t flags;
+
+ uint8_t storage;
+ uint32_t devnum;
+ uint32_t domid;
+ uint16_t path_len;
+ char path[TAPDISK_MESSAGE_MAX_PATH_LENGTH];
+};
+
+/*
+ * Image geometry payload: number of sectors, sector size and an info word.
+ * NOTE(review): the meaning of the `info` bits is not visible here.
+ */
+struct tapdisk_message_image {
+ uint64_t sectors;
+ uint32_t sector_size;
+ uint32_t info;
+};
+
+/* Text payload: a fixed buffer of TAPDISK_MESSAGE_STRING_LENGTH bytes. */
+struct tapdisk_message_string {
+ char text[TAPDISK_MESSAGE_STRING_LENGTH];
+};
+
+/*
+ * Response payload: an error code plus a message in a fixed buffer of
+ * TAPDISK_MESSAGE_STRING_LENGTH bytes.
+ * NOTE(review): the sign convention of `error` is not visible here.
+ */
+struct tapdisk_message_response {
+ int error;
+ char message[TAPDISK_MESSAGE_STRING_LENGTH];
+};
+
+/*
+ * Minor-number list payload.  TAPDISK_MESSAGE_MAX_MINORS is defined as
+ * (TAPDISK_MESSAGE_MAX_PATH_LENGTH / sizeof(int)) - 1, so `count` plus
+ * `list` occupy exactly TAPDISK_MESSAGE_MAX_PATH_LENGTH bytes.
+ * NOTE(review): `count` presumably is the number of valid entries in `list`.
+ */
+struct tapdisk_message_minors {
+ int count;
+ int list[TAPDISK_MESSAGE_MAX_MINORS];
+};
+
+/*
+ * List-entry payload: a count, a minor number, a state and a path stored
+ * inline in a TAPDISK_MESSAGE_MAX_PATH_LENGTH byte buffer.
+ * NOTE(review): what `count` counts and the encoding of `state` are not
+ * visible here -- confirm against the sender.
+ */
+struct tapdisk_message_list {
+ int count;
+ int minor;
+ int state;
+ char path[TAPDISK_MESSAGE_MAX_PATH_LENGTH];
+};
+
+/*
+ * A complete message: a 16-bit type, a 16-bit cookie and a union of all
+ * payload layouts.  The struct is always as large as its biggest payload.
+ * NOTE(review): `type` presumably holds an enum tapdisk_message_id value
+ * that selects the active member of `u` -- confirm.
+ */
+struct tapdisk_message {
+ uint16_t type;
+ uint16_t cookie;
+
+ union {
+ pid_t tapdisk_pid;
+ tapdisk_message_image_t image;
+ tapdisk_message_params_t params;
+ tapdisk_message_string_t string;
+ tapdisk_message_minors_t minors;
+ tapdisk_message_response_t response;
+ tapdisk_message_list_t list;
+ } u;
+};
+
+/*
+ * Message type codes.  Numbering starts at 1 and the values are consecutive
+ * in the order listed, so 0 is not a valid id.
+ */
+enum tapdisk_message_id {
+	TAPDISK_MESSAGE_ERROR = 1,
+	TAPDISK_MESSAGE_RUNTIME_ERROR,
+	TAPDISK_MESSAGE_PID,
+	TAPDISK_MESSAGE_PID_RSP,
+	TAPDISK_MESSAGE_ATTACH,
+	TAPDISK_MESSAGE_ATTACH_RSP,
+	TAPDISK_MESSAGE_OPEN,
+	TAPDISK_MESSAGE_OPEN_RSP,
+	TAPDISK_MESSAGE_PAUSE,
+	TAPDISK_MESSAGE_PAUSE_RSP,
+	TAPDISK_MESSAGE_RESUME,
+	TAPDISK_MESSAGE_RESUME_RSP,
+	TAPDISK_MESSAGE_CLOSE,
+	TAPDISK_MESSAGE_CLOSE_RSP,
+	TAPDISK_MESSAGE_DETACH,
+	TAPDISK_MESSAGE_DETACH_RSP,
+	TAPDISK_MESSAGE_LIST_MINORS,
+	TAPDISK_MESSAGE_LIST_MINORS_RSP,
+	TAPDISK_MESSAGE_LIST,
+	TAPDISK_MESSAGE_LIST_RSP,
+	TAPDISK_MESSAGE_FORCE_SHUTDOWN,
+	TAPDISK_MESSAGE_EXIT,
+};
+
+/*
+ * Human-readable name of a message id.
+ *
+ * Returns a pointer to a string literal, which must be neither modified nor
+ * freed.  Values that are not a tapdisk_message_id yield "unknown".
+ * The cases follow the order of the enum so a missing one is easy to spot;
+ * TAPDISK_MESSAGE_RUNTIME_ERROR had no case and was reported as "unknown".
+ */
+static inline char *
+tapdisk_message_name(enum tapdisk_message_id id)
+{
+	switch (id) {
+	case TAPDISK_MESSAGE_ERROR:
+		return "error";
+
+	case TAPDISK_MESSAGE_RUNTIME_ERROR:
+		return "runtime error";
+
+	case TAPDISK_MESSAGE_PID:
+		return "pid";
+
+	case TAPDISK_MESSAGE_PID_RSP:
+		return "pid response";
+
+	case TAPDISK_MESSAGE_ATTACH:
+		return "attach";
+
+	case TAPDISK_MESSAGE_ATTACH_RSP:
+		return "attach response";
+
+	case TAPDISK_MESSAGE_OPEN:
+		return "open";
+
+	case TAPDISK_MESSAGE_OPEN_RSP:
+		return "open response";
+
+	case TAPDISK_MESSAGE_PAUSE:
+		return "pause";
+
+	case TAPDISK_MESSAGE_PAUSE_RSP:
+		return "pause response";
+
+	case TAPDISK_MESSAGE_RESUME:
+		return "resume";
+
+	case TAPDISK_MESSAGE_RESUME_RSP:
+		return "resume response";
+
+	case TAPDISK_MESSAGE_CLOSE:
+		return "close";
+
+	case TAPDISK_MESSAGE_CLOSE_RSP:
+		return "close response";
+
+	case TAPDISK_MESSAGE_DETACH:
+		return "detach";
+
+	case TAPDISK_MESSAGE_DETACH_RSP:
+		return "detach response";
+
+	case TAPDISK_MESSAGE_LIST_MINORS:
+		return "list minors";
+
+	case TAPDISK_MESSAGE_LIST_MINORS_RSP:
+		return "list minors response";
+
+	case TAPDISK_MESSAGE_LIST:
+		return "list";
+
+	case TAPDISK_MESSAGE_LIST_RSP:
+		return "list response";
+
+	case TAPDISK_MESSAGE_FORCE_SHUTDOWN:
+		return "force shutdown";
+
+	case TAPDISK_MESSAGE_EXIT:
+		return "exit";
+
+	default:
+		return "unknown";
+	}
+}
+
+#endif
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _VHD_UTIL_H_
+#define _VHD_UTIL_H_
+
+/*
+ * Entry points of the individual vhd-util sub-commands.  Each one takes an
+ * argc/argv pair and returns an int; the meaning of the return value is not
+ * defined by this header.
+ */
+int vhd_util_create(int argc, char **argv);
+int vhd_util_snapshot(int argc, char **argv);
+int vhd_util_query(int argc, char **argv);
+int vhd_util_read(int argc, char **argv);
+int vhd_util_set_field(int argc, char **argv);
+int vhd_util_repair(int argc, char **argv);
+int vhd_util_fill(int argc, char **argv);
+int vhd_util_resize(int argc, char **argv);
+int vhd_util_coalesce(int argc, char **argv);
+int vhd_util_modify(int argc, char **argv);
+int vhd_util_scan(int argc, char **argv);
+int vhd_util_check(int argc, char **argv);
+int vhd_util_revert(int argc, char **argv);
+
+#endif
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __BLKTAP2_VHD_UUID_H__
+#define __BLKTAP2_VHD_UUID_H__
+
+/*
+ * vhd_uuid_t: platform-neutral UUID type.
+ * On Linux the uuid_t from <uuid/uuid.h> is wrapped in a struct; on NetBSD
+ * the uuid_t from <uuid.h> is used directly.  Any other OS is rejected at
+ * compile time.
+ */
+#if defined(__linux__)
+
+#include <uuid/uuid.h>
+typedef struct {
+ uuid_t uuid;
+} vhd_uuid_t;
+
+#elif defined(__NetBSD__)
+
+#include <uuid.h>
+
+typedef uuid_t vhd_uuid_t;
+
+#else
+
+#error "Please update vhd-uuid.h for your OS"
+
+#endif
+
+/*
+ * Helper operations on vhd_uuid_t.  The implementations live outside this
+ * header; the comments below only restate what the signatures show.
+ */
+
+/* Test @uuid for the nil value; returns an int (presumably non-zero when nil -- confirm). */
+int vhd_uuid_is_nil(vhd_uuid_t *uuid);
+
+/* Store a newly generated UUID in @uuid. */
+void vhd_uuid_generate(vhd_uuid_t *uuid);
+
+/* Write the textual form of @uuid into @out, a buffer of @size bytes. */
+void vhd_uuid_to_string(vhd_uuid_t *uuid, char *out, size_t size);
+
+/* Parse the textual UUID @in into @uuid; no error status is returned. */
+void vhd_uuid_from_string(vhd_uuid_t *uuid, const char *in);
+
+/* Copy @src into @dst. */
+void vhd_uuid_copy(vhd_uuid_t *dst, vhd_uuid_t *src);
+
+/* Clear @uuid (presumably to the nil value -- confirm). */
+void vhd_uuid_clear(vhd_uuid_t *uuid);
+
+/* Compare two UUIDs; returns an int (ordering convention not visible here). */
+int vhd_uuid_compare(vhd_uuid_t *uuid1, vhd_uuid_t *uuid2);
+
+#endif /* __BLKTAP2_VHD_UUID_H__ */
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __VHD_H__
+#define __VHD_H__
+
+#include <inttypes.h>
+
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+#define DEBUG 1
+
+/* ---------------------------------------------------------------------- */
+/* General definitions. */
+/* ---------------------------------------------------------------------- */
+
+#define VHD_SECTOR_SIZE 512
+#define VHD_SECTOR_SHIFT 9
+
+/* ---------------------------------------------------------------------- */
+/* This is the generic disk footer, used by all disks. */
+/* ---------------------------------------------------------------------- */
+
+/*
+ * In-memory image of the VHD footer.
+ * The field sizes sum to 512 bytes if vhd_uuid_t is 16 bytes wide and the
+ * compiler inserts no padding (assumption -- confirm).
+ * NOTE(review): vhd_uuid_t is not declared by anything this header includes
+ * (only <inttypes.h>); presumably every includer pulls in vhd-uuid.h first
+ * -- confirm, or include it here.
+ */
+struct hd_ftr {
+ char cookie[8]; /* Identifies original creator of the disk */
+ u32 features; /* Feature Support -- see below */
+ u32 ff_version; /* (major,minor) version of disk file */
+ u64 data_offset; /* Abs. offset from SOF to next structure */
+ u32 timestamp; /* Creation time. secs since 1/1/2000GMT */
+ char crtr_app[4]; /* Creator application */
+ u32 crtr_ver; /* Creator version (major,minor) */
+ u32 crtr_os; /* Creator host OS */
+ u64 orig_size; /* Size at creation (bytes) */
+ u64 curr_size; /* Current size of disk (bytes) */
+ u32 geometry; /* Disk geometry */
+ u32 type; /* Disk type */
+ u32 checksum; /* 1's comp sum of this struct. */
+ vhd_uuid_t uuid; /* Unique disk ID, used for naming parents */
+ char saved; /* one-bit -- is this disk/VM in a saved state? */
+ char hidden; /* tapdisk-specific field: is this vdi hidden? */
+ char reserved[426]; /* padding */
+};
+
+/* VHD cookie string. */
+static const char HD_COOKIE[9] = "conectix";
+
+/* Feature fields in hd_ftr */
+#define HD_NO_FEATURES 0x00000000
+#define HD_TEMPORARY 0x00000001 /* disk can be deleted on shutdown */
+#define HD_RESERVED 0x00000002 /* NOTE: must always be set */
+
+/* Version field in hd_ftr */
+#define HD_FF_VERSION 0x00010000
+
+/* Known creator OS type fields in hd_ftr.crtr_os */
+#define HD_CR_OS_WINDOWS 0x5769326B /* (Wi2k) */
+#define HD_CR_OS_MACINTOSH 0x4D616320 /* (Mac ) */
+
+/*
+ * version 0.1: little endian bitmaps
+ * version 1.1: big endian bitmaps; batmap
+ * version 1.2: libvhd
+ * version 1.3: batmap version bump to 1.2
+ */
+#define VHD_VERSION(major, minor) (((major) << 16) | ((minor) & 0x0000FFFF))
+#define VHD_CURRENT_VERSION VHD_VERSION(1, 3)
+
+/* Disk geometry accessor macros. */
+/* Geometry is a triple of (cylinders (2 bytes), heads (1 byte), and
+ * sectors-per-track (1 byte))
+ */
+#define GEOM_GET_CYLS(_g) (((_g) >> 16) & 0xffff)
+#define GEOM_GET_HEADS(_g) (((_g) >> 8) & 0xff)
+#define GEOM_GET_SPT(_g) ((_g) & 0xff)
+
+#define GEOM_ENCODE(_c, _h, _s) (((_c) << 16) | ((_h) << 8) | (_s))
+
+/* type field in hd_ftr */
+#define HD_TYPE_NONE 0
+#define HD_TYPE_FIXED 2 /* fixed-allocation disk */
+#define HD_TYPE_DYNAMIC 3 /* dynamic disk */
+#define HD_TYPE_DIFF 4 /* differencing disk */
+
+/*
+ * String table for hd.type, indexed by the HD_TYPE_* value.
+ * HD_TYPE_MAX is the highest valid index; check a type against it before
+ * indexing the table.
+ */
+static const char *HD_TYPE_STR[7] = {
+ "None", /* 0 */
+ "Reserved (deprecated)", /* 1 */
+ "Fixed hard disk", /* 2 */
+ "Dynamic hard disk", /* 3 */
+ "Differencing hard disk", /* 4 */
+ "Reserved (deprecated)", /* 5 */
+ "Reserved (deprecated)" /* 6 */
+};
+
+#define HD_TYPE_MAX 6
+
+/*
+ * Parent locator entry.  struct dd_hdr carries an array of eight of these
+ * (loc[8]); the field sizes sum to 24 bytes.
+ */
+struct prt_loc {
+ u32 code; /* Platform code -- see defines below. */
+ u32 data_space; /* Number of 512-byte sectors to store locator */
+ u32 data_len; /* Actual length of parent locator in bytes */
+ u32 res; /* Must be zero */
+ u64 data_offset; /* Absolute offset of locator data (bytes) */
+};
+
+/* Platform Codes */
+#define PLAT_CODE_NONE 0x0
+#define PLAT_CODE_WI2R 0x57693272 /* deprecated */
+#define PLAT_CODE_WI2K 0x5769326B /* deprecated */
+#define PLAT_CODE_W2RU 0x57327275 /* Windows relative path (UTF-16) */
+#define PLAT_CODE_W2KU 0x57326B75 /* Windows absolute path (UTF-16) */
+#define PLAT_CODE_MAC 0x4D616320 /* MacOS alias stored as a blob. */
+#define PLAT_CODE_MACX 0x4D616358 /* File URL (UTF-8), see RFC 2396. */
+
+/* ---------------------------------------------------------------------- */
+/* This is the dynamic disk header. */
+/* ---------------------------------------------------------------------- */
+
+/*
+ * In-memory image of the dynamic (sparse) disk header.
+ * The field sizes sum to 1024 bytes if vhd_uuid_t is 16 bytes wide and no
+ * padding is inserted (assumption -- confirm).
+ */
+struct dd_hdr {
+ char cookie[8]; /* Should contain "cxsparse" */
+ u64 data_offset; /* Byte offset of next record. (Unused) 0xffs */
+ u64 table_offset; /* Absolute offset to the BAT. */
+ u32 hdr_ver; /* Version of the dd_hdr (major,minor) */
+ u32 max_bat_size; /* Maximum number of entries in the BAT */
+ u32 block_size; /* Block size in bytes. Must be power of 2. */
+ u32 checksum; /* Header checksum. 1's comp of all fields. */
+ vhd_uuid_t prt_uuid; /* ID of the parent disk. */
+ u32 prt_ts; /* Modification time of the parent disk */
+ u32 res1; /* Reserved. */
+ char prt_name[512]; /* Parent unicode name. */
+ struct prt_loc loc[8]; /* Parent locator entries. */
+ char res2[256]; /* Reserved. */
+};
+
+/* VHD cookie string. */
+static const char DD_COOKIE[9] = "cxsparse";
+
+/* Version field in dd_hdr (hdr_ver) */
+#define DD_VERSION 0x00010000
+
+/* Default blocksize is 2 meg. */
+#define DD_BLOCKSIZE_DEFAULT 0x00200000
+
+#define DD_BLK_UNUSED 0xFFFFFFFF
+
+/*
+ * Header describing the batmap: where it lives in the file, how many
+ * sectors it occupies, its format version and its checksum.
+ */
+struct dd_batmap_hdr {
+ char cookie[8]; /* should contain "tdbatmap" */
+ u64 batmap_offset; /* byte offset to batmap */
+ u32 batmap_size; /* batmap size in sectors */
+ u32 batmap_version; /* version of batmap */
+ u32 checksum; /* batmap checksum -- 1's complement of batmap */
+};
+
+static const char VHD_BATMAP_COOKIE[9] = "tdbatmap";
+
+/*
+ * version 1.1: signed char checksum
+ */
+#define VHD_BATMAP_VERSION(major, minor) (((major) << 16) | ((minor) & 0x0000FFFF))
+#define VHD_BATMAP_CURRENT_VERSION VHD_BATMAP_VERSION(1, 2)
+
+/* Layout of a dynamic disk:
+ *
+ * +-------------------------------------------------+
+ * | Mirror image of HD footer (hd_ftr) (512 bytes) |
+ * +-------------------------------------------------+
+ * | Sparse drive header (dd_hdr) (1024 bytes) |
+ * +-------------------------------------------------+
+ * | BAT (Block allocation table) |
+ * | - Array of absolute sector offsets into the |
+ * | file (u32). |
+ * | - Rounded up to a sector boundary. |
+ * | - Unused entries are marked as 0xFFFFFFFF |
+ * | - max entries in dd_hdr->max_bat_size |
+ * +-------------------------------------------------+
+ * | Data Block 0 |
+ * | Bitmap (padded to 512 byte sector boundary) |
+ * | - each bit indicates whether the associated |
+ * | sector within this block is used. |
+ * | Data |
+ * | - power-of-two multiple of sectors. |
+ * | - default 2MB (4096 * 512) |
+ * | - Any entries with zero in bitmap should be |
+ * | zero on disk |
+ * +-------------------------------------------------+
+ * | Data Block 1 |
+ * +-------------------------------------------------+
+ * | ... |
+ * +-------------------------------------------------+
+ * | Data Block n |
+ * +-------------------------------------------------+
+ * | HD Footer (511 bytes) |
+ * +-------------------------------------------------+
+ */
+
+#endif
--- /dev/null
+XEN_ROOT = $(CURDIR)/../../..
+BLKTAP_ROOT := ..
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Set LVM_UTIL_TEST=y to also build the standalone lvm-util test program.
+ifeq ($(LVM_UTIL_TEST),y)
+TEST := lvm-util
+endif
+
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+CFLAGS += -I../include
+CFLAGS += -D_GNU_SOURCE
+
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS += -fPIC
+endif
+
+LVM-OBJS := lvm-util.o
+
+all: build
+
+build: $(TEST) $(LVM-OBJS)
+
+# Nothing is installed from this directory; the object is linked into libvhd.
+install: all
+
+# Standalone test binary: recompiles lvm-util.c with -DLVM_UTIL, which
+# enables its main().  $(CFLAGS) must be passed so the compile sees the same
+# -I../include and -D_GNU_SOURCE (needed for asprintf) as the object build.
+lvm-util: lvm-util.o
+	$(CC) $(CFLAGS) -DLVM_UTIL $(LDFLAGS) -o lvm-util lvm-util.c
+
+# Also remove the optional test binary.
+clean:
+	rm -rf *.o *.opic *~ $(DEPS) $(IBIN) lvm-util
+
+distclean: clean
+
+.PHONY: all build clean distclean install lvm-util
+
+-include $(DEPS)
--- /dev/null
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lvm-util.h"
+
+#define _NAME "%255s"
+static char line[1024];
+
+/*
+ * Read the rest of the current line from @scan (at most 1023 characters,
+ * newline excluded) into the file-scope `line` buffer, which is zeroed first.
+ *
+ * Returns 0 when a non-empty run of characters was stored, 1 otherwise.
+ */
+static inline int
+lvm_read_line(FILE *scan)
+{
+	int matched;
+
+	memset(line, 0, sizeof(line));
+	matched = fscanf(scan, "%1023[^\n]", line);
+
+	return matched != 1;
+}
+
+/*
+ * Consume the newline character(s) that follow the current position in
+ * @scan.  The characters are stored into the shared `line` buffer, so its
+ * previous contents are overwritten.
+ *
+ * Returns 0 when at least one newline was consumed, 1 otherwise.
+ */
+static inline int
+lvm_next_line(FILE *scan)
+{
+	int matched = fscanf(scan, "%1023[\n]", line);
+
+	return matched != 1;
+}
+
+/*
+ * Copy the NUL-terminated string @src into @dst, provided it is shorter
+ * than @size characters.
+ *
+ * Returns 0 on success, -ENAMETOOLONG (leaving @dst untouched) when no
+ * terminator is found within the first @size bytes of @src.
+ */
+static int
+lvm_copy_name(char *dst, const char *src, size_t size)
+{
+	size_t len = strnlen(src, size);
+
+	if (len != size) {
+		strcpy(dst, src);
+		return 0;
+	}
+
+	return -ENAMETOOLONG;
+}
+
+/*
+ * Record one physical volume (@name, first-extent offset @start) in @vg.
+ *
+ * The vg->pvs array is allocated on the first call with room for @pvs
+ * entries, and that count is remembered in vg->pv_cnt.  Every later call
+ * must report the same @pvs; otherwise the scan below could walk past the
+ * end of the array.
+ *
+ * Returns 0 on success, or:
+ *   -EINVAL        @pvs is not positive or differs from the first call
+ *   -ENOMEM        allocation failed, or every slot is already in use
+ *   -EEXIST        a PV with this name was already recorded
+ *   -ENAMETOOLONG  @name does not fit in pv->name
+ */
+static int
+lvm_parse_pv(struct vg *vg, const char *name, int pvs, uint64_t start)
+{
+	int i, err;
+	struct pv *pv;
+
+	if (pvs <= 0)
+		return -EINVAL;
+
+	if (!vg->pvs) {
+		vg->pvs = calloc(pvs, sizeof(struct pv));
+		if (!vg->pvs)
+			return -ENOMEM;
+		vg->pv_cnt = pvs;	/* number of slots actually allocated */
+	} else if (pvs != vg->pv_cnt)
+		return -EINVAL;
+
+	/* find the first unused slot, rejecting duplicates on the way */
+	pv = NULL;
+	for (i = 0; i < pvs; i++) {
+		pv = vg->pvs + i;
+
+		if (!pv->name[0])
+			break;
+
+		if (!strcmp(pv->name, name))
+			return -EEXIST;
+	}
+
+	if (i == pvs)
+		return -ENOMEM;
+
+	err = lvm_copy_name(pv->name, name, sizeof(pv->name) - 1);
+	if (err)
+		return err;
+
+	pv->start = start;
+	return 0;
+}
+
+/*
+ * Fill @vg with the summary of volume group @vgname as reported by
+ * /usr/sbin/vgs: name, extent size, LV/PV counts and one struct pv per
+ * output row.  vg->lvs is allocated (zeroed) but not populated here.
+ *
+ * Returns 0 on success or a negative errno value; on failure @vg is
+ * released with lvm_free_vg().
+ *
+ * NOTE(review): @vgname is interpolated into a shell command line passed to
+ * popen(); callers must not pass untrusted names -- confirm.
+ */
+static int
+lvm_open_vg(const char *vgname, struct vg *vg)
+{
+	FILE *scan;
+	int i, err, pvs, lvs;
+	char *cmd, pvname[256];
+	uint64_t size, pv_start;
+
+	/* keep these defined even if vgs prints no rows at all */
+	pvs  = 0;
+	lvs  = 0;
+	size = 0;
+
+	memset(vg, 0, sizeof(*vg));
+
+	err = asprintf(&cmd, "/usr/sbin/vgs %s --noheadings --nosuffix --units=b "
+		       "--options=vg_name,vg_extent_size,lv_count,pv_count,"
+		       "pv_name,pe_start --unbuffered 2> /dev/null", vgname);
+	if (err == -1)
+		return -ENOMEM;
+
+	errno = 0;
+	scan  = popen(cmd, "r");
+	if (!scan) {
+		/* always a negative errno, like every other failure path */
+		err = (errno ? -errno : -ENOMEM);
+		goto out;
+	}
+
+	/* one row per physical volume */
+	for (;;) {
+		if (lvm_read_line(scan))
+			break;
+
+		err = -EINVAL;
+		if (sscanf(line, _NAME" %"SCNu64" %d %d "_NAME" %"SCNu64,
+			   vg->name, &size, &lvs, &pvs, pvname, &pv_start) != 6)
+			goto out;
+
+		if (strcmp(vg->name, vgname))
+			goto out;
+
+		err = lvm_parse_pv(vg, pvname, pvs, pv_start);
+		if (err)
+			goto out;
+
+		if (lvm_next_line(scan))
+			break;
+	}
+
+	err = -EINVAL;
+	if (strcmp(vg->name, vgname))
+		goto out;
+
+	/* every announced PV must have been seen */
+	for (i = 0; i < pvs; i++)
+		if (!vg->pvs[i].name[0])
+			goto out;
+
+	err = -ENOMEM;
+	vg->lvs = calloc(lvs, sizeof(struct lv));
+	if (!vg->lvs)
+		goto out;
+
+	err             = 0;
+	vg->lv_cnt      = lvs;
+	vg->pv_cnt      = pvs;
+	vg->extent_size = size;
+
+out:
+	if (scan)
+		pclose(scan);
+	if (err)
+		lvm_free_vg(vg);
+	free(cmd);
+	return err;
+}
+
+/*
+ * Parse an lvs "devices" column of the form "<device>(<extent>)" into @seg.
+ *
+ * The separators ',', '(' and ')' in @devices are overwritten with spaces
+ * (the buffer is modified in place), then the first device name and extent
+ * number are read.  The device must match one of vg->pvs[]; the segment's
+ * pe_start becomes extent * vg->extent_size + that PV's start.
+ *
+ * Returns 0 on success, -EINVAL when the column cannot be parsed or the
+ * device is not a known PV.
+ */
+static int
+lvm_parse_lv_devices(struct vg *vg, struct lv_segment *seg, char *devices)
+{
+	char *c;
+	int idx;
+	uint64_t extent, base;
+
+	for (c = devices; *c != '\0'; c++) {
+		if (*c == ',' || *c == '(' || *c == ')')
+			*c = ' ';
+	}
+
+	if (sscanf(devices, _NAME" %"SCNu64, seg->device, &extent) != 2)
+		return -EINVAL;
+
+	/* all-ones doubles as the "no matching PV" marker */
+	base = -1;
+	for (idx = 0; idx < vg->pv_cnt; idx++) {
+		if (strcmp(vg->pvs[idx].name, seg->device) != 0)
+			continue;
+
+		base = vg->pvs[idx].start;
+		break;
+	}
+
+	if (base == -1)
+		return -EINVAL;
+
+	seg->pe_start = (extent * vg->extent_size) + base;
+	return 0;
+}
+
+/*
+ * Populate vg->lvs[] from the output of /usr/sbin/lvs for vg->name.
+ *
+ * Each row describes one segment; only rows whose seg_start is 0 create an
+ * entry (name, size, segment count and first segment).  Scanning stops once
+ * vg->lv_cnt entries are filled; if the output ends early, vg->lv_cnt is
+ * reduced to the number of entries actually read.
+ *
+ * Returns 0 on success or a negative errno value.  vg->lvs is not freed
+ * here on failure.
+ *
+ * NOTE(review): vg->name is interpolated into a shell command line passed
+ * to popen() -- confirm it can never hold untrusted text.
+ */
+static int
+lvm_scan_lvs(struct vg *vg)
+{
+	char *cmd;
+	FILE *scan;
+	int i, err;
+
+	err = asprintf(&cmd, "/usr/sbin/lvs %s --noheadings --nosuffix --units=b "
+		       "--options=lv_name,lv_size,segtype,seg_count,seg_start,"
+		       "seg_size,devices --unbuffered 2> /dev/null", vg->name);
+	if (err == -1)
+		return -ENOMEM;
+
+	errno = 0;
+	scan  = popen(cmd, "r");
+	if (!scan) {
+		err = (errno ? -errno : -ENOMEM);
+		goto out;
+	}
+
+	for (i = 0;;) {
+		unsigned int segs;	/* unsigned to match the %u conversion */
+		struct lv *lv;
+		struct lv_segment seg;
+		uint64_t size, seg_start;
+		char type[32], name[256], devices[1024];
+
+		if (i >= vg->lv_cnt)
+			break;
+
+		if (lvm_read_line(scan)) {
+			/* fewer rows than announced: trim the count */
+			vg->lv_cnt = i;
+			break;
+		}
+
+		err = -EINVAL;
+		lv  = vg->lvs + i;
+
+		if (sscanf(line, _NAME" %"SCNu64" %31s %u %"SCNu64" %"SCNu64" %1023s",
+			   name, &size, type, &segs, &seg_start,
+			   &seg.pe_size, devices) != 7)
+			goto out;
+
+		/* rows for later segments of an LV are skipped */
+		if (seg_start)
+			goto next;
+
+		if (!strcmp(type, "linear"))
+			seg.type = LVM_SEG_TYPE_LINEAR;
+		else
+			seg.type = LVM_SEG_TYPE_UNKNOWN;
+
+		if (lvm_parse_lv_devices(vg, &seg, devices))
+			goto out;
+
+		i++;
+		lv->size          = size;
+		lv->segments      = segs;
+		lv->first_segment = seg;
+
+		err = lvm_copy_name(lv->name, name, sizeof(lv->name) - 1);
+		if (err)
+			goto out;
+		err = -EINVAL;
+
+	next:
+		if (lvm_next_line(scan))
+			goto out;
+	}
+
+	err = 0;
+
+out:
+	if (scan)
+		pclose(scan);
+	free(cmd);
+	return err;
+}
+
+/*
+ * Release the PV and LV arrays owned by @vg and reset the whole structure
+ * to zero.  @vg itself is not freed.
+ */
+void
+lvm_free_vg(struct vg *vg)
+{
+	free(vg->pvs);
+	free(vg->lvs);
+
+	memset(vg, 0, sizeof(struct vg));
+}
+
+/*
+ * Scan volume group @vg_name into @vg: first the VG/PV summary, then the
+ * logical volumes.
+ *
+ * Returns 0 on success; otherwise the error from the failing step, with
+ * @vg left zeroed.  On success the caller releases @vg with lvm_free_vg().
+ */
+int
+lvm_scan_vg(const char *vg_name, struct vg *vg)
+{
+	int err;
+
+	memset(vg, 0, sizeof(*vg));
+
+	err = lvm_open_vg(vg_name, vg);
+	if (!err) {
+		err = lvm_scan_lvs(vg);
+		if (err)
+			lvm_free_vg(vg);
+	}
+
+	return err;
+}
+
+#ifdef LVM_UTIL
+/*
+ * Print the command-line synopsis on stdout and terminate the process with
+ * exit status EINVAL.  Never returns.
+ */
+static int
+usage(void)
+{
+	fputs("usage: lvm-util <vgname>\n", stdout);
+	exit(EINVAL);
+}
+
+/*
+ * Test driver (built only with -DLVM_UTIL): scan the volume group named by
+ * the single command-line argument and dump its PVs and LVs to stdout.
+ *
+ * Exit status is 0 on success, otherwise the absolute value of the scan
+ * error.
+ */
+int
+main(int argc, char **argv)
+{
+	struct vg vg;
+	int n, rc;
+
+	if (argc != 2)
+		usage();
+
+	rc = lvm_scan_vg(argv[1], &vg);
+	if (rc) {
+		printf("scan failed: %d\n", rc);
+		return (rc < 0 ? -rc : rc);
+	}
+
+	printf("vg %s: extent_size: %"PRIu64", pvs: %d, lvs: %d\n",
+	       vg.name, vg.extent_size, vg.pv_cnt, vg.lv_cnt);
+
+	for (n = 0; n < vg.pv_cnt; n++) {
+		struct pv *pv = &vg.pvs[n];
+
+		printf("pv %s: start %"PRIu64"\n", pv->name, pv->start);
+	}
+
+	for (n = 0; n < vg.lv_cnt; n++) {
+		struct lv *lv = &vg.lvs[n];
+		struct lv_segment *seg = &lv->first_segment;
+
+		printf("lv %s: size: %"PRIu64", segments: %u, type: %u, "
+		       "dev: %s, pe_start: %"PRIu64", pe_size: %"PRIu64"\n",
+		       lv->name, lv->size, lv->segments, seg->type,
+		       seg->device, seg->pe_start, seg->pe_size);
+	}
+
+	lvm_free_vg(&vg);
+	return 0;
+}
+#endif
--- /dev/null
+XEN_ROOT=$(CURDIR)/../../..
+BLKTAP_ROOT := ..
+include $(XEN_ROOT)/tools/Rules.mk
+
+SUBDIRS-y :=
+SUBDIRS-y += lib
+
+# Programs built in this directory and installed into $(INST_DIR).
+IBIN = vhd-util vhd-update
+INST_DIR = $(sbindir)
+
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+CFLAGS += -I../include
+CFLAGS += -D_GNU_SOURCE
+
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS += -fPIC
+endif
+
+# Set VHD_STATIC=y to add -static to the compiler flags.
+ifeq ($(VHD_STATIC),y)
+CFLAGS += -static
+endif
+
+LIBS := -Llib -lvhd
+
+all: subdirs-all build
+
+build: $(IBIN)
+
+# Library artifacts produced by the lib/ subdirectory; building them is
+# delegated to subdirs-all.  The static archive is named libvhd.a there.
+LIBS_DEPENDS := lib/libvhd.so lib/libvhd.a
+$(LIBS_DEPENDS):subdirs-all
+
+vhd-util: vhd-util.o $(LIBS_DEPENDS)
+	$(CC) $(LDFLAGS) -o vhd-util vhd-util.o $(LIBS) $(APPEND_LDFLAGS)
+
+vhd-update: vhd-update.o $(LIBS_DEPENDS)
+	$(CC) $(LDFLAGS) -o vhd-update vhd-update.o $(LIBS) $(APPEND_LDFLAGS)
+
+install: all
+	$(MAKE) subdirs-install
+	$(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+	$(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INST_DIR)
+
+clean: subdirs-clean
+	rm -rf *.o *~ $(DEPS) $(IBIN)
+
+distclean: clean
+
+.PHONY: all build clean distclean install vhd-util vhd-update
+
+-include $(DEPS)
--- /dev/null
+# Builds libvhd as a static archive (libvhd.a) and a versioned shared
+# object, from the sources listed in LIB-SRCS plus the lvm-util object.
+XEN_ROOT=$(CURDIR)/../../../..
+BLKTAP_ROOT := ../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Shared object is libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR); the soname
+# carries only the major part.
+LIBVHD-MAJOR = 1.0
+LIBVHD-MINOR = 0
+LIBVHD-SONAME = libvhd.so.$(LIBVHD-MAJOR)
+
+# Object taken from the sibling lvm/ directory.
+LVM-UTIL-OBJ := $(BLKTAP_ROOT)/lvm/lvm-util.o
+
+# NOTE(review): LIBVHD-BUILD is not referenced anywhere else in this Makefile.
+LIBVHD-BUILD := libvhd.a
+
+INST-DIR = $(libdir)
+
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+CFLAGS += -I../../include
+CFLAGS += -D_GNU_SOURCE
+CFLAGS += -fPIC
+
+ifeq ($(CONFIG_Linux),y)
+LIBS := -luuid
+endif
+
+ifeq ($(CONFIG_LIBICONV),y)
+LIBS += -liconv
+endif
+
+LIB-SRCS := libvhd.c
+LIB-SRCS += libvhd-journal.c
+LIB-SRCS += vhd-util-coalesce.c
+LIB-SRCS += vhd-util-create.c
+LIB-SRCS += vhd-util-fill.c
+LIB-SRCS += vhd-util-modify.c
+LIB-SRCS += vhd-util-query.c
+LIB-SRCS += vhd-util-read.c
+LIB-SRCS += vhd-util-repair.c
+LIB-SRCS += vhd-util-resize.c
+LIB-SRCS += vhd-util-revert.c
+LIB-SRCS += vhd-util-set-field.c
+LIB-SRCS += vhd-util-snapshot.c
+LIB-SRCS += vhd-util-scan.c
+LIB-SRCS += vhd-util-check.c
+LIB-SRCS += vhd-util-uuid.c
+LIB-SRCS += relative-path.c
+LIB-SRCS += atomicio.c
+
+LIB-OBJS = $(patsubst %.c,%.o,$(LIB-SRCS))
+LIB-OBJS += $(LVM-UTIL-OBJ)
+
+# .opic counterparts of every object; these go into the shared library.
+LIB-PICOBJS = $(patsubst %.o,%.opic,$(LIB-OBJS))
+
+LIBVHD = libvhd.a libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR)
+
+all: build
+
+build: libvhd.a libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR)
+
+libvhd.a: $(LIB-OBJS)
+ $(AR) rc $@ $^
+
+# Link the shared object, then create the soname and libvhd.so symlinks
+# in the build directory.
+libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR): $(LIB-PICOBJS)
+ $(CC) -Wl,$(SONAME_LDFLAG),$(LIBVHD-SONAME) $(SHLIB_LDFLAGS) \
+ $(LDFLAGS) -o libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $^ $(LIBS)
+ ln -sf libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) libvhd.so.$(LIBVHD-MAJOR)
+ ln -sf libvhd.so.$(LIBVHD-MAJOR) libvhd.so
+
+# Install both libraries and recreate the same symlink chain under
+# $(DESTDIR)$(INST-DIR).
+install: all
+ $(INSTALL_DIR) -p $(DESTDIR)$(INST-DIR)
+ $(INSTALL_DATA) libvhd.a $(DESTDIR)$(INST-DIR)
+ $(INSTALL_PROG) libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $(DESTDIR)$(INST-DIR)
+ ln -sf libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $(DESTDIR)$(INST-DIR)/libvhd.so.$(LIBVHD-MAJOR)
+ ln -sf libvhd.so.$(LIBVHD-MAJOR) $(DESTDIR)$(INST-DIR)/libvhd.so
+
+clean:
+ rm -rf *.a *.so* *.o *.opic *~ $(DEPS) $(LIBVHD)
+
+distclean: clean
+
+# NOTE(review): "libvhd" is listed as phony but no rule with that name is
+# defined in this Makefile.
+.PHONY: all build clean distclean install libvhd
+
+-include $(DEPS)
--- /dev/null
+/*
+ * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved.
+ * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include "atomicio.h"
+
+/*
+ * ensure all of data on socket comes through. f==read || f==vwrite
+ *
+ * Calls @f repeatedly on @fd until @n bytes of the buffer @_s have been
+ * transferred, retrying when @f fails with EINTR or EAGAIN.
+ *
+ * Returns @n on complete transfer.  Returns 0 (errno left as set by @f) if
+ * @f fails with any other error.  If @f returns 0, errno is set to EPIPE
+ * and the number of bytes transferred so far is returned.
+ *
+ * Written as a prototype-style definition: the old identifier-list (K&R)
+ * form is obsolescent and no longer accepted by C23 compilers.
+ */
+size_t
+atomicio(ssize_t (*f) (int, void *, size_t), int fd, void *_s, size_t n)
+{
+	char *s = _s;
+	size_t pos = 0;
+	ssize_t res;
+
+	while (n > pos) {
+		res = (f) (fd, s + pos, n - pos);
+		switch (res) {
+		case -1:
+			if (errno == EINTR || errno == EAGAIN)
+				continue;
+			return 0;
+		case 0:
+			errno = EPIPE;
+			return pos;
+		default:
+			pos += (size_t)res;
+		}
+	}
+	return (pos);
+}
+
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "atomicio.h"
+#include "libvhd-journal.h"
+
+#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_P 1
+#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_C 2
+#define VHD_JOURNAL_ENTRY_TYPE_HEADER 3
+#define VHD_JOURNAL_ENTRY_TYPE_LOCATOR 4
+#define VHD_JOURNAL_ENTRY_TYPE_BAT 5
+#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_H 6
+#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_M 7
+#define VHD_JOURNAL_ENTRY_TYPE_DATA 8
+
+/*
+ * Descriptor for one journal record.  It is read and written as a raw
+ * struct (see vhd_journal_read_entry); the integer fields are converted by
+ * vhd_journal_entry_in()/vhd_journal_entry_out().
+ *
+ * NOTE(review): on ABIs that align uint64_t to 8 bytes this struct likely
+ * carries trailing padding after `checksum`, and the checksum code sums
+ * sizeof(vhd_journal_entry_t) bytes -- confirm the padding is always zeroed.
+ */
+typedef struct vhd_journal_entry {
+ uint64_t cookie; /* must equal VHD_JOURNAL_ENTRY_COOKIE */
+ uint32_t type; /* one of VHD_JOURNAL_ENTRY_TYPE_* */
+ uint32_t size; /* non-zero multiple of VHD_SECTOR_SIZE */
+ uint64_t offset; /* presumably the VHD file offset of the data -- confirm */
+ uint32_t checksum; /* see vhd_journal_checksum_entry() */
+} vhd_journal_entry_t;
+
+/*
+ * Reposition the journal file descriptor (lseek semantics for @offset and
+ * @whence).  Returns 0 on success, -errno on failure.
+ */
+static inline int
+vhd_journal_seek(vhd_journal_t *j, off_t offset, int whence)
+{
+	if (lseek(j->jfd, offset, whence) == (off_t)-1)
+		return -errno;
+
+	return 0;
+}
+
+/*
+ * Report the current offset of the journal file descriptor, or (off_t)-1
+ * with errno set if lseek() fails.
+ */
+static inline off_t
+vhd_journal_position(vhd_journal_t *j)
+{
+	off_t cur;
+
+	cur = lseek(j->jfd, 0, SEEK_CUR);
+	return cur;
+}
+
+/*
+ * Read exactly @size bytes from the journal at its current offset into
+ * @buf.  Returns 0 on success; on a short or failed read returns -errno,
+ * or -EIO when errno was left at 0.
+ */
+static inline int
+vhd_journal_read(vhd_journal_t *j, void *buf, size_t size)
+{
+	size_t done;
+
+	errno = 0;
+	done  = atomicio(read, j->jfd, buf, size);
+
+	if (done == size)
+		return 0;
+
+	return (errno ? -errno : -EIO);
+}
+
+/*
+ * Write exactly @size bytes from @buf to the journal at its current
+ * offset.  Returns 0 on success; on a short or failed write returns -errno,
+ * or -EIO when errno was left at 0.
+ */
+static inline int
+vhd_journal_write(vhd_journal_t *j, void *buf, size_t size)
+{
+	size_t done;
+
+	errno = 0;
+	done  = atomicio(vwrite, j->jfd, buf, size);
+
+	if (done == size)
+		return 0;
+
+	return (errno ? -errno : -EIO);
+}
+
+/*
+ * Set the length of the journal file to @length bytes.
+ * Returns 0 on success, -errno on failure.
+ */
+static inline int
+vhd_journal_truncate(vhd_journal_t *j, off_t length)
+{
+	if (ftruncate(j->jfd, length) == -1)
+		return -errno;
+
+	return 0;
+}
+
+/*
+ * Flush the journal file's data to stable storage with fdatasync().
+ * Returns 0 on success, -errno on failure.
+ */
+static inline int
+vhd_journal_sync(vhd_journal_t *j)
+{
+	if (fdatasync(j->jfd) != 0)
+		return -errno;
+
+	return 0;
+}
+
+/*
+ * Convert the listed journal-header fields in place after the header has
+ * been read from disk, using the BE*_IN macros (defined elsewhere;
+ * presumably big-endian to host order -- confirm).
+ *
+ * NOTE(review): header->journal_eof, which vhd_journal_validate_header()
+ * relies on, is not converted here -- confirm whether that is intentional.
+ */
+static inline void
+vhd_journal_header_in(vhd_journal_header_t *header)
+{
+ BE64_IN(&header->vhd_footer_offset);
+ BE32_IN(&header->journal_data_entries);
+ BE32_IN(&header->journal_metadata_entries);
+ BE64_IN(&header->journal_data_offset);
+ BE64_IN(&header->journal_metadata_offset);
+}
+
+/*
+ * Convert the listed journal-header fields in place before the header is
+ * written to disk, using the BE*_OUT macros (defined elsewhere; presumably
+ * host to big-endian order -- confirm).  Covers the same fields as
+ * vhd_journal_header_in().
+ *
+ * NOTE(review): header->journal_eof is not converted here -- confirm
+ * whether that is intentional.
+ */
+static inline void
+vhd_journal_header_out(vhd_journal_header_t *header)
+{
+ BE64_OUT(&header->vhd_footer_offset);
+ BE32_OUT(&header->journal_data_entries);
+ BE32_OUT(&header->journal_metadata_entries);
+ BE64_OUT(&header->journal_data_offset);
+ BE64_OUT(&header->journal_metadata_offset);
+}
+
+/*
+ * Sanity-check a journal header held in host byte order.
+ *
+ * Checks that @header carries the journal cookie, that the journal file
+ * can be positioned at header->journal_eof, and that neither the data nor
+ * the metadata offset lies beyond journal_eof.  All checks are made on
+ * @header itself, so a header that is not (yet) stored in j->header is
+ * validated correctly as well.
+ *
+ * Side effect: the journal file offset is left at header->journal_eof.
+ *
+ * Returns 0 if the header looks valid, -EINVAL on a bad cookie or offset,
+ * or -errno if seeking fails.
+ */
+static int
+vhd_journal_validate_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+	int err;
+	off_t eof;
+
+	if (memcmp(header->cookie,
+		   VHD_JOURNAL_HEADER_COOKIE, sizeof(header->cookie)))
+		return -EINVAL;
+
+	err = vhd_journal_seek(j, header->journal_eof, SEEK_SET);
+	if (err)
+		return err;
+
+	eof = vhd_journal_position(j);
+	if (eof == (off_t)-1)
+		return -errno;
+
+	if (header->journal_data_offset > header->journal_eof)
+		return -EINVAL;
+
+	if (header->journal_metadata_offset > header->journal_eof)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Load the journal header from offset 0 of the journal file into @header,
+ * convert it with vhd_journal_header_in() and validate it.
+ *
+ * Returns 0 on success, or the error from the seek, the read or the
+ * validation step.
+ */
+static int
+vhd_journal_read_journal_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+	int err;
+
+	err = vhd_journal_seek(j, 0, SEEK_SET);
+	if (!err)
+		err = vhd_journal_read(j, header,
+				       sizeof(vhd_journal_header_t));
+	if (err)
+		return err;
+
+	vhd_journal_header_in(header);
+
+	return vhd_journal_validate_header(j, header);
+}
+
+/*
+ * Validate @header and write it at offset 0 of the journal file.
+ *
+ * The conversion done by vhd_journal_header_out() is applied to a private
+ * byte-for-byte copy, so @header itself is left untouched.
+ *
+ * Returns 0 on success, or the error from validation, seek or write.
+ */
+static int
+vhd_journal_write_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+	vhd_journal_header_t out;
+	int err;
+
+	memcpy(&out, header, sizeof(out));
+
+	err = vhd_journal_validate_header(j, &out);
+	if (err)
+		return err;
+
+	vhd_journal_header_out(&out);
+
+	err = vhd_journal_seek(j, 0, SEEK_SET);
+	if (err)
+		return err;
+
+	return vhd_journal_write(j, &out, sizeof(out));
+}
+
+/*
+ * Initialise j->header for a new journal and write it to the journal file.
+ *
+ * The header is zeroed, then filled with the VHD's footer uuid, the journal
+ * cookie, the footer offset (end of the VHD file minus the footer size) and
+ * a journal_eof just past the header itself.
+ *
+ * Returns 0 on success, or the error from the VHD seek/position/footer
+ * calls or from vhd_journal_write_header().
+ */
+static int
+vhd_journal_add_journal_header(vhd_journal_t *j)
+{
+ int err;
+ off_t off;
+ vhd_context_t *vhd;
+
+ vhd = &j->vhd;
+ memset(&j->header, 0, sizeof(vhd_journal_header_t));
+
+ /* find the size of the VHD file */
+ err = vhd_seek(vhd, 0, SEEK_END);
+ if (err)
+ return err;
+
+ off = vhd_position(vhd);
+ if (off == (off_t)-1)
+ return -errno;
+
+ err = vhd_get_footer(vhd);
+ if (err)
+ return err;
+
+ vhd_uuid_copy(&j->header.uuid, &vhd->footer.uuid);
+ memcpy(j->header.cookie,
+ VHD_JOURNAL_HEADER_COOKIE, sizeof(j->header.cookie));
+ /* the footer occupies the last sizeof(vhd_footer_t) bytes of the file */
+ j->header.vhd_footer_offset = off - sizeof(vhd_footer_t);
+ /* an empty journal ends right after its header */
+ j->header.journal_eof = sizeof(vhd_journal_header_t);
+
+ return vhd_journal_write_header(j, &j->header);
+}
+
+static void /* apply the BE32_IN/BE64_IN conversions, in place, to a journal entry just read */
+vhd_journal_entry_in(vhd_journal_entry_t *entry)
+{
+ BE32_IN(&entry->type);
+ BE32_IN(&entry->size);
+ BE64_IN(&entry->offset);
+ BE64_IN(&entry->cookie);
+ BE32_IN(&entry->checksum);
+}
+
+static void /* apply the BE32_OUT/BE64_OUT conversions, in place, to a journal entry about to be written */
+vhd_journal_entry_out(vhd_journal_entry_t *entry)
+{
+ BE32_OUT(&entry->type);
+ BE32_OUT(&entry->size);
+ BE64_OUT(&entry->offset);
+ BE64_OUT(&entry->cookie);
+ BE32_OUT(&entry->checksum);
+}
+
+/*
+ * Compute the checksum of a journal entry and its payload.
+ *
+ * @entry: entry header; its checksum field is treated as zero while
+ *         summing and is restored before returning
+ * @buf:   payload bytes
+ * @size:  number of payload bytes in @buf
+ *
+ * The checksum is the one's complement of the sum of every byte of the
+ * entry structure followed by every byte of the payload.
+ *
+ * The byte index is a size_t so that it matches the type of @size and of
+ * sizeof(), avoiding a signed/unsigned comparison in the loop bounds.
+ */
+static uint32_t
+vhd_journal_checksum_entry(vhd_journal_entry_t *entry, char *buf, size_t size)
+{
+	size_t i;
+	uint32_t sum, saved;
+	const unsigned char *bytes;
+
+	saved           = entry->checksum;
+	entry->checksum = 0;
+
+	sum   = 0;
+	bytes = (const unsigned char *)entry;
+	for (i = 0; i < sizeof(vhd_journal_entry_t); i++)
+		sum += bytes[i];
+
+	bytes = (const unsigned char *)buf;
+	for (i = 0; i < size; i++)
+		sum += bytes[i];
+
+	entry->checksum = saved;
+	return ~sum;
+}
+
+/*
+ * Check the fixed fields of a journal entry.
+ *
+ * An entry is rejected with -EINVAL when its size is zero, when its size
+ * is not a multiple of VHD_SECTOR_SIZE, or when its cookie is not
+ * VHD_JOURNAL_ENTRY_COOKIE.  Returns 0 otherwise.
+ */
+static int
+vhd_journal_validate_entry(vhd_journal_entry_t *entry)
+{
+	if (entry->size == 0 ||
+	    (entry->size & (VHD_SECTOR_SIZE - 1)) ||
+	    entry->cookie != VHD_JOURNAL_ENTRY_COOKIE)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Read one entry header from the current journal position into @entry,
+ * convert it with vhd_journal_entry_in() and validate it.
+ *
+ * Returns 0 on success, the read error, or the validation result.
+ */
+static int
+vhd_journal_read_entry(vhd_journal_t *j, vhd_journal_entry_t *entry)
+{
+	int rc = vhd_journal_read(j, entry, sizeof(vhd_journal_entry_t));
+
+	if (rc)
+		return rc;
+
+	vhd_journal_entry_in(entry);
+
+	return vhd_journal_validate_entry(entry);
+}
+
+/*
+ * Validate @entry and write it at the current journal position.
+ *
+ * vhd_journal_entry_out() is applied to a local copy, leaving the caller's
+ * @entry unchanged.  Returns 0 on success, otherwise the validation or
+ * write error.
+ */
+static int
+vhd_journal_write_entry(vhd_journal_t *j, vhd_journal_entry_t *entry)
+{
+	int rc;
+	vhd_journal_entry_t disk;
+
+	rc = vhd_journal_validate_entry(entry);
+	if (rc)
+		return rc;
+
+	memcpy(&disk, entry, sizeof(vhd_journal_entry_t));
+	vhd_journal_entry_out(&disk);
+
+	return vhd_journal_write(j, &disk, sizeof(vhd_journal_entry_t));
+}
+
+/*
+ * Verify that the checksum stored in @entry matches the checksum computed
+ * over the entry and its entry->size bytes of payload in @buf.
+ *
+ * Returns 0 on a match and -EINVAL on a mismatch.
+ */
+static int
+vhd_journal_validate_entry_data(vhd_journal_entry_t *entry, char *buf)
+{
+	uint32_t computed;
+
+	computed = vhd_journal_checksum_entry(entry, buf, entry->size);
+
+	return (computed == entry->checksum) ? 0 : -EINVAL;
+}
+
+/*
+ * Append one entry (header plus payload) to the journal.
+ *
+ * @j:      journal to append to
+ * @offset: offset in the VHD that the payload was taken from
+ * @buf:    payload
+ * @size:   payload size in bytes
+ * @type:   one of the VHD_JOURNAL_ENTRY_TYPE_* values
+ *
+ * The entry is written at j->header.journal_eof.  Afterwards the matching
+ * entry counter (data or metadata) is incremented, the matching offset is
+ * set if this was the first entry of its kind, journal_eof is advanced
+ * and the journal header is rewritten.  If rewriting the header fails, the
+ * in-memory header changes are rolled back.
+ *
+ * On any failure after the initial seek, a journal that is not a block
+ * device is truncated back to j->header.journal_eof.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int
+vhd_journal_update(vhd_journal_t *j, off_t offset,
+		   char *buf, size_t size, uint32_t type)
+{
+	int err;
+	uint64_t *off, off_bak;
+	uint32_t *entries;
+	vhd_journal_entry_t entry;
+
+	/*
+	 * Zero the whole structure first: the checksum routine reads every
+	 * byte of it (including the checksum field and any padding), and
+	 * the same bytes are later written to the journal.
+	 */
+	memset(&entry, 0, sizeof(entry));
+
+	entry.type     = type;
+	entry.size     = size;
+	entry.offset   = offset;
+	entry.cookie   = VHD_JOURNAL_ENTRY_COOKIE;
+	entry.checksum = vhd_journal_checksum_entry(&entry, buf, size);
+
+	err = vhd_journal_seek(j, j->header.journal_eof, SEEK_SET);
+	if (err)
+		return err;
+
+	err = vhd_journal_write_entry(j, &entry);
+	if (err)
+		goto fail;
+
+	err = vhd_journal_write(j, buf, size);
+	if (err)
+		goto fail;
+
+	if (type == VHD_JOURNAL_ENTRY_TYPE_DATA) {
+		off     = &j->header.journal_data_offset;
+		entries = &j->header.journal_data_entries;
+	} else {
+		off     = &j->header.journal_metadata_offset;
+		entries = &j->header.journal_metadata_entries;
+	}
+
+	/* the first entry of a kind records where that kind starts */
+	off_bak = *off;
+	if (!(*entries)++)
+		*off = j->header.journal_eof;
+	j->header.journal_eof += (size + sizeof(vhd_journal_entry_t));
+
+	err = vhd_journal_write_header(j, &j->header);
+	if (err) {
+		/* undo the in-memory bookkeeping done above */
+		if (!--(*entries))
+			*off = off_bak;
+		j->header.journal_eof -= (size + sizeof(vhd_journal_entry_t));
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	if (!j->is_block)
+		vhd_journal_truncate(j, j->header.journal_eof);
+	return err;
+}
+
+/*
+ * Journal the VHD footer.
+ *
+ * The footer located one footer-size before the end of the VHD file is
+ * journalled as VHD_JOURNAL_ENTRY_TYPE_FOOTER_P.  For dynamic disks the
+ * footer stored at offset 0 is journalled as well, as
+ * VHD_JOURNAL_ENTRY_TYPE_FOOTER_C.  Each footer is passed through
+ * vhd_footer_out() before being handed to the journal.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int
+vhd_journal_add_footer(vhd_journal_t *j)
+{
+	int rc;
+	off_t end, primary;
+	vhd_footer_t ftr;
+	vhd_context_t *ctx = &j->vhd;
+
+	rc = vhd_seek(ctx, 0, SEEK_END);
+	if (rc)
+		return rc;
+
+	end = vhd_position(ctx);
+	if (end == (off_t)-1)
+		return -errno;
+
+	primary = end - sizeof(vhd_footer_t);
+
+	rc = vhd_read_footer_at(ctx, &ftr, primary);
+	if (rc)
+		return rc;
+
+	vhd_footer_out(&ftr);
+	rc = vhd_journal_update(j, primary, (char *)&ftr,
+				sizeof(vhd_footer_t),
+				VHD_JOURNAL_ENTRY_TYPE_FOOTER_P);
+	if (rc)
+		return rc;
+
+	/* only dynamic disks carry a second footer at the start of the file */
+	if (!vhd_type_dynamic(ctx))
+		return 0;
+
+	rc = vhd_read_footer_at(ctx, &ftr, 0);
+	if (rc)
+		return rc;
+
+	vhd_footer_out(&ftr);
+
+	return vhd_journal_update(j, 0, (char *)&ftr,
+				  sizeof(vhd_footer_t),
+				  VHD_JOURNAL_ENTRY_TYPE_FOOTER_C);
+}
+
+/*
+ * Journal the VHD header.
+ *
+ * The header is read from the VHD, passed through vhd_header_out() and
+ * recorded as a VHD_JOURNAL_ENTRY_TYPE_HEADER entry whose offset is the
+ * footer's data_offset.
+ *
+ * Returns 0 on success or the error from reading or journalling.
+ */
+static int
+vhd_journal_add_header(vhd_journal_t *j)
+{
+	int rc;
+	vhd_header_t hdr;
+	vhd_context_t *ctx = &j->vhd;
+
+	rc = vhd_read_header(ctx, &hdr);
+	if (rc)
+		return rc;
+
+	vhd_header_out(&hdr);
+
+	return vhd_journal_update(j, ctx->footer.data_offset,
+				  (char *)&hdr, sizeof(vhd_header_t),
+				  VHD_JOURNAL_ENTRY_TYPE_HEADER);
+}
+
+/*
+ * Journal the data of every parent locator in use.
+ *
+ * Each locator slot of the VHD header is checked with
+ * vhd_validate_platform_code(); slots with PLAT_CODE_NONE are skipped.
+ * For the others, the locator data is read from the VHD into a
+ * sector-aligned buffer and recorded as a VHD_JOURNAL_ENTRY_TYPE_LOCATOR
+ * entry.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int
+vhd_journal_add_locators(vhd_journal_t *j)
+{
+	int idx, count, rc;
+	vhd_context_t *ctx = &j->vhd;
+
+	rc = vhd_get_header(ctx);
+	if (rc)
+		return rc;
+
+	count = sizeof(ctx->header.loc) / sizeof(vhd_parent_locator_t);
+
+	for (idx = 0; idx < count; idx++) {
+		char *data;
+		off_t pos;
+		size_t len;
+		vhd_parent_locator_t *loc = ctx->header.loc + idx;
+
+		rc = vhd_validate_platform_code(loc->code);
+		if (rc)
+			return rc;
+
+		if (loc->code == PLAT_CODE_NONE)
+			continue;
+
+		pos = loc->data_offset;
+		len = vhd_parent_locator_size(loc);
+
+		rc = posix_memalign((void **)&data, VHD_SECTOR_SIZE, len);
+		if (rc)
+			return -rc;
+
+		rc = vhd_seek(ctx, pos, SEEK_SET);
+		if (!rc)
+			rc = vhd_read(ctx, data, len);
+		if (!rc)
+			rc = vhd_journal_update(j, pos, data, len,
+						VHD_JOURNAL_ENTRY_TYPE_LOCATOR);
+
+		/* the buffer is released whether or not the steps succeeded */
+		free(data);
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
+
+/*
+ * Journal the block allocation table.
+ *
+ * The BAT is read from the VHD, passed through vhd_bat_out() and recorded
+ * as a VHD_JOURNAL_ENTRY_TYPE_BAT entry at the header's table_offset.  The
+ * payload size is the padded size of bat.entries 32-bit entries.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int
+vhd_journal_add_bat(vhd_journal_t *j)
+{
+	int rc;
+	off_t pos;
+	size_t len;
+	vhd_bat_t table;
+	vhd_context_t *ctx = &j->vhd;
+
+	rc = vhd_get_header(ctx);
+	if (rc)
+		return rc;
+
+	rc = vhd_read_bat(ctx, &table);
+	if (rc)
+		return rc;
+
+	pos = ctx->header.table_offset;
+	len = vhd_bytes_padded(table.entries * sizeof(uint32_t));
+
+	vhd_bat_out(&table);
+	rc = vhd_journal_update(j, pos, (char *)table.bat, len,
+				VHD_JOURNAL_ENTRY_TYPE_BAT);
+
+	/* the table read above is only needed for the journal write */
+	free(table.bat);
+
+	return rc;
+}
+
+/*
+ * Journal the batmap header and the batmap itself.
+ *
+ * The header is recorded first as VHD_JOURNAL_ENTRY_TYPE_BATMAP_H at the
+ * offset given by vhd_batmap_header_offset().  It is passed through
+ * vhd_batmap_header_out() for that write and converted back with
+ * vhd_batmap_header_in() before its batmap_offset/batmap_size fields are
+ * used to record the map as VHD_JOURNAL_ENTRY_TYPE_BATMAP_M.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int
+vhd_journal_add_batmap(vhd_journal_t *j)
+{
+	int rc;
+	off_t pos;
+	size_t len;
+	vhd_batmap_t bm;
+	vhd_context_t *ctx = &j->vhd;
+
+	rc = vhd_batmap_header_offset(ctx, &pos);
+	if (rc)
+		return rc;
+
+	rc = vhd_read_batmap(ctx, &bm);
+	if (rc)
+		return rc;
+
+	len = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+	vhd_batmap_header_out(&bm);
+	rc = vhd_journal_update(j, pos, (char *)&bm.header, len,
+				VHD_JOURNAL_ENTRY_TYPE_BATMAP_H);
+	if (!rc) {
+		vhd_batmap_header_in(&bm);
+		pos = bm.header.batmap_offset;
+		len = vhd_sectors_to_bytes(bm.header.batmap_size);
+
+		rc = vhd_journal_update(j, pos, bm.map, len,
+					VHD_JOURNAL_ENTRY_TYPE_BATMAP_M);
+	}
+
+	free(bm.map);
+	return rc;
+}
+
+/*
+ * Journal all VHD metadata: the footer and, for dynamic disks, the header,
+ * the parent locators, the BAT and (when present) the batmap.
+ *
+ * Once the metadata has been recorded, journal_data_offset is set to the
+ * current end of the journal and the journal header is rewritten.  This is
+ * done for fixed disks too: vhd_journal_restore_metadata() requires
+ * journal_data_offset to equal the journal position reached after reading
+ * the metadata back, so leaving it at zero would make every later restore
+ * of a fixed disk fail with -EINVAL.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int
+vhd_journal_add_metadata(vhd_journal_t *j)
+{
+	int err;
+	vhd_context_t *vhd;
+
+	vhd = &j->vhd;
+
+	err = vhd_journal_add_footer(j);
+	if (err)
+		return err;
+
+	if (vhd_type_dynamic(vhd)) {
+		err = vhd_journal_add_header(j);
+		if (err)
+			return err;
+
+		err = vhd_journal_add_locators(j);
+		if (err)
+			return err;
+
+		err = vhd_journal_add_bat(j);
+		if (err)
+			return err;
+
+		if (vhd_has_batmap(vhd)) {
+			err = vhd_journal_add_batmap(j);
+			if (err)
+				return err;
+		}
+	}
+
+	/* data entries, if any, start right after the metadata */
+	j->header.journal_data_offset = j->header.journal_eof;
+	return vhd_journal_write_header(j, &j->header);
+}
+
+/*
+ * Read a footer entry of the given @type from the current journal
+ * position into @footer.
+ *
+ * The entry must have the expected type and a payload of exactly
+ * sizeof(vhd_footer_t) bytes, otherwise -EINVAL is returned.  The footer
+ * is converted with vhd_footer_in() and the result of
+ * vhd_validate_footer() is returned.
+ */
+static int
+__vhd_journal_read_footer(vhd_journal_t *j,
+			  vhd_footer_t *footer, uint32_t type)
+{
+	int rc;
+	vhd_journal_entry_t ent;
+
+	rc = vhd_journal_read_entry(j, &ent);
+	if (rc)
+		return rc;
+
+	if (ent.type != type || ent.size != sizeof(vhd_footer_t))
+		return -EINVAL;
+
+	rc = vhd_journal_read(j, footer, ent.size);
+	if (rc)
+		return rc;
+
+	vhd_footer_in(footer);
+
+	return vhd_validate_footer(footer);
+}
+
+/* Read the VHD_JOURNAL_ENTRY_TYPE_FOOTER_P footer entry from the journal. */
+static int
+vhd_journal_read_footer(vhd_journal_t *j, vhd_footer_t *footer)
+{
+	const uint32_t kind = VHD_JOURNAL_ENTRY_TYPE_FOOTER_P;
+
+	return __vhd_journal_read_footer(j, footer, kind);
+}
+
+/* Read the VHD_JOURNAL_ENTRY_TYPE_FOOTER_C footer entry from the journal. */
+static int
+vhd_journal_read_footer_copy(vhd_journal_t *j, vhd_footer_t *footer)
+{
+	const uint32_t kind = VHD_JOURNAL_ENTRY_TYPE_FOOTER_C;
+
+	return __vhd_journal_read_footer(j, footer, kind);
+}
+
+/*
+ * Read the VHD header entry from the current journal position into
+ * @header.
+ *
+ * The entry must be of type VHD_JOURNAL_ENTRY_TYPE_HEADER with a payload
+ * of exactly sizeof(vhd_header_t) bytes, otherwise -EINVAL is returned.
+ * The header is converted with vhd_header_in() and the result of
+ * vhd_validate_header() is returned.
+ */
+static int
+vhd_journal_read_header(vhd_journal_t *j, vhd_header_t *header)
+{
+	int rc;
+	vhd_journal_entry_t ent;
+
+	rc = vhd_journal_read_entry(j, &ent);
+	if (rc)
+		return rc;
+
+	if (ent.type != VHD_JOURNAL_ENTRY_TYPE_HEADER ||
+	    ent.size != sizeof(vhd_header_t))
+		return -EINVAL;
+
+	rc = vhd_journal_read(j, header, ent.size);
+	if (rc)
+		return rc;
+
+	vhd_header_in(header);
+
+	return vhd_validate_header(header);
+}
+
+/*
+ * Read consecutive VHD_JOURNAL_ENTRY_TYPE_LOCATOR entries from the
+ * current journal position.
+ *
+ * @j:        journal to read from
+ * @locators: receives a calloc'ed array of sector-aligned buffers, one
+ *            per locator entry read; the caller frees each buffer and
+ *            the array itself
+ * @locs:     receives the number of buffers stored in @locators
+ *
+ * Reading stops at the first entry that is not a locator; the journal is
+ * then repositioned to the start of that entry.  More locator entries
+ * than there are locator slots in the VHD header is an error (-EINVAL).
+ *
+ * On failure everything allocated here is released (including a buffer
+ * whose payload read failed), @locators is NULL and @locs is 0.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int
+vhd_journal_read_locators(vhd_journal_t *j, char ***locators, int *locs)
+{
+	int err, n, i, _locs;
+	char **_locators, *buf;
+	off_t pos;
+	vhd_journal_entry_t entry;
+
+	_locs     = 0;
+	*locs     = 0;
+	*locators = NULL;
+	buf       = NULL;
+
+	n = sizeof(j->vhd.header.loc) / sizeof(vhd_parent_locator_t);
+	_locators = calloc(n, sizeof(char *));
+	if (!_locators)
+		return -ENOMEM;
+
+	for (;;) {
+		buf = NULL;
+
+		/* remember where this entry starts so we can rewind to it */
+		pos = vhd_journal_position(j);
+		if (pos == (off_t)-1) {
+			err = -errno;
+			goto fail;
+		}
+
+		err = vhd_journal_read_entry(j, &entry);
+		if (err)
+			goto fail;
+
+		if (entry.type != VHD_JOURNAL_ENTRY_TYPE_LOCATOR) {
+			err = vhd_journal_seek(j, pos, SEEK_SET);
+			if (err)
+				goto fail;
+			break;
+		}
+
+		if (_locs >= n) {
+			err = -EINVAL;
+			goto fail;
+		}
+
+		err = posix_memalign((void **)&buf,
+				     VHD_SECTOR_SIZE, entry.size);
+		if (err) {
+			err = -err;
+			buf = NULL;
+			goto fail;
+		}
+
+		err = vhd_journal_read(j, buf, entry.size);
+		if (err)
+			goto fail;
+
+		_locators[_locs++] = buf;
+	}
+
+	*locs     = _locs;
+	*locators = _locators;
+
+	return 0;
+
+fail:
+	/* buf is only non-NULL here if it was not yet stored in the array */
+	free(buf);
+	for (i = 0; i < _locs; i++)
+		free(_locators[i]);
+	free(_locators);
+	return err;
+}
+
+/*
+ * Read the BAT entry from the current journal position into @bat.
+ *
+ * The entry must be of type VHD_JOURNAL_ENTRY_TYPE_BAT, its size must be
+ * the padded size of max_bat_size 32-bit entries and its offset must be
+ * the header's table_offset; otherwise -EINVAL is returned.
+ *
+ * On success bat->bat is a newly allocated sector-aligned table (owned by
+ * the caller), bat->spb and bat->entries are filled in from the VHD header
+ * and the table has been passed through vhd_bat_in().  On a read failure
+ * the table is freed and bat->bat is set to NULL.
+ */
+static int
+vhd_journal_read_bat(vhd_journal_t *j, vhd_bat_t *bat)
+{
+	int rc;
+	size_t expect;
+	vhd_journal_entry_t ent;
+	vhd_context_t *ctx = &j->vhd;
+
+	expect = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+
+	rc = vhd_journal_read_entry(j, &ent);
+	if (rc)
+		return rc;
+
+	if (ent.type != VHD_JOURNAL_ENTRY_TYPE_BAT ||
+	    ent.size != expect ||
+	    ent.offset != ctx->header.table_offset)
+		return -EINVAL;
+
+	rc = posix_memalign((void **)&bat->bat, VHD_SECTOR_SIZE, expect);
+	if (rc)
+		return -rc;
+
+	rc = vhd_journal_read(j, bat->bat, ent.size);
+	if (rc) {
+		free(bat->bat);
+		bat->bat = NULL;
+		return rc;
+	}
+
+	bat->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+	bat->entries = ctx->header.max_bat_size;
+	vhd_bat_in(bat);
+
+	return 0;
+}
+
+/*
+ * Read the batmap header entry from the current journal position into
+ * batmap->header.
+ *
+ * The entry must be of type VHD_JOURNAL_ENTRY_TYPE_BATMAP_H and its size
+ * must be the padded size of struct dd_batmap_hdr, otherwise -EINVAL is
+ * returned.  The payload is read into a temporary sector-aligned buffer,
+ * copied into batmap->header, and the temporary buffer is released.  The
+ * header is then converted with vhd_batmap_header_in() and the result of
+ * vhd_validate_batmap_header() is returned.
+ *
+ * An allocation failure is reported as a negative errno value, like every
+ * other error from this function.
+ */
+static int
+vhd_journal_read_batmap_header(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+	int err;
+	char *buf;
+	size_t size;
+	vhd_journal_entry_t entry;
+
+	size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+	err = vhd_journal_read_entry(j, &entry);
+	if (err)
+		return err;
+
+	if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_H)
+		return -EINVAL;
+
+	if (entry.size != size)
+		return -EINVAL;
+
+	/* posix_memalign() returns a positive errno value on failure */
+	err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+	if (err)
+		return -err;
+
+	err = vhd_journal_read(j, buf, entry.size);
+	if (err) {
+		free(buf);
+		return err;
+	}
+
+	memcpy(&batmap->header, buf, sizeof(batmap->header));
+	free(buf);
+
+	vhd_batmap_header_in(batmap);
+	return vhd_validate_batmap_header(batmap);
+}
+
+/*
+ * Read the batmap entry from the current journal position into
+ * batmap->map.
+ *
+ * The entry must be of type VHD_JOURNAL_ENTRY_TYPE_BATMAP_M and its size
+ * and offset must match batmap->header.batmap_size (in sectors) and
+ * batmap->header.batmap_offset; otherwise -EINVAL is returned.
+ *
+ * On success batmap->map is a newly allocated sector-aligned buffer owned
+ * by the caller.  On a read failure it is freed and set to NULL.
+ */
+static int
+vhd_journal_read_batmap_map(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+	int rc;
+	vhd_journal_entry_t ent;
+
+	rc = vhd_journal_read_entry(j, &ent);
+	if (rc)
+		return rc;
+
+	if (ent.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_M ||
+	    ent.size != vhd_sectors_to_bytes(batmap->header.batmap_size) ||
+	    ent.offset != batmap->header.batmap_offset)
+		return -EINVAL;
+
+	rc = posix_memalign((void **)&batmap->map,
+			    VHD_SECTOR_SIZE, ent.size);
+	if (rc)
+		return -rc;
+
+	rc = vhd_journal_read(j, batmap->map, ent.size);
+	if (!rc)
+		return 0;
+
+	free(batmap->map);
+	batmap->map = NULL;
+	return rc;
+}
+
+/*
+ * Read the batmap header and the batmap from the journal, then validate
+ * the batmap.
+ *
+ * If validation fails, the map read from the journal is freed and
+ * batmap->map is set to NULL.  Returns 0 on success or the first error.
+ */
+static int
+vhd_journal_read_batmap(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+	int rc;
+
+	rc = vhd_journal_read_batmap_header(j, batmap);
+	if (rc)
+		return rc;
+
+	rc = vhd_journal_read_batmap_map(j, batmap);
+	if (rc)
+		return rc;
+
+	rc = vhd_validate_batmap(batmap);
+	if (!rc)
+		return 0;
+
+	free(batmap->map);
+	batmap->map = NULL;
+	return rc;
+}
+
+/* Write @footer back to the VHD at the footer offset recorded in the journal header. */
+static int
+vhd_journal_restore_footer(vhd_journal_t *j, vhd_footer_t *footer)
+{
+	vhd_context_t *ctx = &j->vhd;
+
+	return vhd_write_footer_at(ctx, footer, j->header.vhd_footer_offset);
+}
+
+/* Write @footer back to the VHD at offset 0, where the footer copy lives. */
+static int
+vhd_journal_restore_footer_copy(vhd_journal_t *j, vhd_footer_t *footer)
+{
+	vhd_context_t *ctx = &j->vhd;
+
+	return vhd_write_footer_at(ctx, footer, 0);
+}
+
+/*
+ * Write @header back to the VHD at the offset given by the data_offset
+ * field of the footer currently held in the VHD context.
+ */
+static int
+vhd_journal_restore_header(vhd_journal_t *j, vhd_header_t *header)
+{
+	vhd_context_t *ctx = &j->vhd;
+	off_t where = ctx->footer.data_offset;
+
+	return vhd_write_header_at(ctx, header, where);
+}
+
+/*
+ * Write journalled parent locator data back to the VHD.
+ *
+ * The locator slots of the VHD header are walked in order; each slot whose
+ * code is not PLAT_CODE_NONE consumes the next buffer from @locators and
+ * has it written at the slot's data_offset.  The walk stops once @locs
+ * buffers have been consumed or all slots have been visited.
+ *
+ * Returns 0 on success or the first seek/write error.
+ */
+static int
+vhd_journal_restore_locators(vhd_journal_t *j, char **locators, int locs)
+{
+	int slot, slots, used, rc;
+	size_t len;
+	vhd_parent_locator_t *loc;
+	vhd_context_t *ctx = &j->vhd;
+
+	used  = 0;
+	slots = sizeof(ctx->header.loc) / sizeof(vhd_parent_locator_t);
+
+	for (slot = 0; slot < slots && used < locs; slot++) {
+		loc = ctx->header.loc + slot;
+
+		if (loc->code != PLAT_CODE_NONE) {
+			rc = vhd_seek(ctx, loc->data_offset, SEEK_SET);
+			if (rc)
+				return rc;
+
+			len = vhd_parent_locator_size(loc);
+			rc  = vhd_write(ctx, locators[used++], len);
+			if (rc)
+				return rc;
+		}
+	}
+
+	return 0;
+}
+
+/* Write @bat back to the VHD through vhd_write_bat(). */
+static int
+vhd_journal_restore_bat(vhd_journal_t *j, vhd_bat_t *bat)
+{
+	vhd_context_t *ctx = &j->vhd;
+
+	return vhd_write_bat(ctx, bat);
+}
+
+/* Write @batmap back to the VHD through vhd_write_batmap(). */
+static int
+vhd_journal_restore_batmap(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+	vhd_context_t *ctx = &j->vhd;
+
+	return vhd_write_batmap(ctx, batmap);
+}
+
+/*
+ * Read the metadata recorded in the journal and write it back to the VHD.
+ *
+ * Phase 1 reads, starting right after the journal header: the footer and,
+ * for dynamic disks, the footer copy, the header, the parent locators (if
+ * the header uses any), the BAT and (when present) the batmap.  The
+ * footer, header, BAT and batmap are read straight into the VHD context.
+ *
+ * Phase 2 checks that the journal position reached equals
+ * journal_data_offset and then writes everything back to the VHD in the
+ * same order.
+ *
+ * Finally, if everything succeeded and the VHD is not a block device, the
+ * VHD file is truncated to end right after the restored footer.
+ *
+ * The locator buffers are released on every path out of phase 2.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int
+vhd_journal_restore_metadata(vhd_journal_t *j)
+{
+	off_t off;
+	char **locators;
+	vhd_footer_t copy;
+	vhd_context_t *vhd;
+	int i, locs, hlocs, err;
+
+	vhd      = &j->vhd;
+	locs     = 0;
+	hlocs    = 0;
+	locators = NULL;
+
+	err = vhd_journal_seek(j, sizeof(vhd_journal_header_t), SEEK_SET);
+	if (err)
+		return err;
+
+	err = vhd_journal_read_footer(j, &vhd->footer);
+	if (err)
+		return err;
+
+	if (!vhd_type_dynamic(vhd))
+		goto restore;
+
+	err = vhd_journal_read_footer_copy(j, &copy);
+	if (err)
+		return err;
+
+	err = vhd_journal_read_header(j, &vhd->header);
+	if (err)
+		return err;
+
+	/* count the locator slots in use; that many entries must follow */
+	for (hlocs = 0, i = 0; i < vhd_parent_locator_count(vhd); i++) {
+		err = vhd_validate_platform_code(vhd->header.loc[i].code);
+		if (err)
+			return err;
+
+		if (vhd->header.loc[i].code != PLAT_CODE_NONE)
+			hlocs++;
+	}
+
+	if (hlocs) {
+		err = vhd_journal_read_locators(j, &locators, &locs);
+		if (err)
+			return err;
+
+		if (hlocs != locs) {
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	err = vhd_journal_read_bat(j, &vhd->bat);
+	if (err)
+		goto out;
+
+	if (vhd_has_batmap(vhd)) {
+		err = vhd_journal_read_batmap(j, &vhd->batmap);
+		if (err)
+			goto out;
+	}
+
+restore:
+	off = vhd_journal_position(j);
+	if (off == (off_t)-1) {
+		err = -errno;
+		goto out;
+	}
+
+	/* the metadata must end exactly where the data entries begin */
+	if (j->header.journal_data_offset != off) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = vhd_journal_restore_footer(j, &vhd->footer);
+	if (err)
+		goto out;
+
+	if (!vhd_type_dynamic(vhd))
+		goto out;
+
+	err = vhd_journal_restore_footer_copy(j, &copy);
+	if (err)
+		goto out;
+
+	err = vhd_journal_restore_header(j, &vhd->header);
+	if (err)
+		goto out;
+
+	if (locs) {
+		err = vhd_journal_restore_locators(j, locators, locs);
+		if (err)
+			goto out;
+	}
+
+	err = vhd_journal_restore_bat(j, &vhd->bat);
+	if (err)
+		goto out;
+
+	if (vhd_has_batmap(vhd)) {
+		err = vhd_journal_restore_batmap(j, &vhd->batmap);
+		if (err)
+			goto out;
+	}
+
+	err = 0;
+
+out:
+	if (locators) {
+		for (i = 0; i < locs; i++)
+			free(locators[i]);
+		free(locators);
+	}
+
+	/* ftruncate() reports failure as -1 with the reason in errno */
+	if (!err && !vhd->is_block &&
+	    ftruncate(vhd->fd,
+		      j->header.vhd_footer_offset + sizeof(vhd_footer_t)))
+		err = -errno;
+
+	return err;
+}
+
+/*
+ * Mark the VHD as disabled: replace the footer cookie with
+ * VHD_POISON_COOKIE, recompute the footer checksum and write the footer.
+ *
+ * Returns 0 on success or the error from loading or writing the footer.
+ */
+static int
+vhd_journal_disable_vhd(vhd_journal_t *j)
+{
+	int rc;
+	vhd_context_t *ctx = &j->vhd;
+
+	rc = vhd_get_footer(ctx);
+	if (rc)
+		return rc;
+
+	memcpy(&ctx->footer.cookie,
+	       VHD_POISON_COOKIE, sizeof(ctx->footer.cookie));
+	ctx->footer.checksum = vhd_checksum_footer(&ctx->footer);
+
+	return vhd_write_footer(ctx, &ctx->footer);
+}
+
+/*
+ * Re-enable a VHD that vhd_disabled() reports as disabled: restore the
+ * HD_COOKIE footer cookie, recompute the footer checksum and write the
+ * footer.  A VHD that is not disabled is left untouched.
+ *
+ * Returns 0 on success or the error from loading or writing the footer.
+ */
+static int
+vhd_journal_enable_vhd(vhd_journal_t *j)
+{
+	int rc;
+	vhd_context_t *ctx = &j->vhd;
+
+	rc = vhd_get_footer(ctx);
+	if (rc)
+		return rc;
+
+	if (vhd_disabled(ctx)) {
+		memcpy(&ctx->footer.cookie,
+		       HD_COOKIE, sizeof(ctx->footer.cookie));
+		ctx->footer.checksum = vhd_checksum_footer(&ctx->footer);
+
+		rc = vhd_write_footer(ctx, &ctx->footer);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * Release a journal handle: close the journal descriptor when j->jfd is
+ * non-zero, close the VHD context and free the journal file name.
+ *
+ * Always returns 0.
+ */
+int
+vhd_journal_close(vhd_journal_t *j)
+{
+	if (j->jfd != 0)
+		close(j->jfd);
+
+	vhd_close(&j->vhd);
+	free(j->jname);
+
+	return 0;
+}
+
+/*
+ * Re-enable the VHD and dispose of the journal.
+ *
+ * If re-enabling the VHD fails, that error is returned and nothing is
+ * released.  Otherwise the journal descriptor is closed when j->jfd is
+ * non-zero (and the journal file unlinked unless it is a block device),
+ * the VHD context is closed and the journal file name is freed.
+ *
+ * Returns 0 on success.
+ */
+int
+vhd_journal_remove(vhd_journal_t *j)
+{
+	int rc = vhd_journal_enable_vhd(j);
+
+	if (rc)
+		return rc;
+
+	if (j->jfd != 0) {
+		close(j->jfd);
+		if (!j->is_block)
+			unlink(j->jname);
+	}
+
+	vhd_close(&j->vhd);
+	free(j->jname);
+
+	return 0;
+}
+
+int /* open an existing journal jfile for VHD file, restoring the journalled metadata; 0 on success, negative error otherwise */
+vhd_journal_open(vhd_journal_t *j, const char *file, const char *jfile)
+{
+ int err;
+ vhd_context_t *vhd;
+
+ memset(j, 0, sizeof(vhd_journal_t));
+
+ j->jfd = -1;
+ vhd = &j->vhd;
+
+ j->jname = strdup(jfile);
+ if (j->jname == NULL)
+ return -ENOMEM;
+
+ j->jfd = open(j->jname, O_LARGEFILE | O_RDWR); /* the journal must already exist: no O_CREAT here */
+ if (j->jfd == -1) {
+ err = -errno;
+ goto fail;
+ }
+
+ err = vhd_test_file_fixed(j->jname, &j->is_block); /* fills j->is_block, which later decides whether the journal may be truncated/unlinked */
+ if (err)
+ goto fail;
+
+ vhd->fd = open(file, O_LARGEFILE | O_RDWR | O_DIRECT); /* raw descriptor only; vhd_open() is deferred until the metadata has been restored */
+ if (vhd->fd == -1) {
+ err = -errno;
+ goto fail;
+ }
+
+ err = vhd_test_file_fixed(file, &vhd->is_block);
+ if (err)
+ goto fail;
+
+ err = vhd_journal_read_journal_header(j, &j->header);
+ if (err)
+ goto fail;
+
+ err = vhd_journal_restore_metadata(j); /* writes the journalled footer/header/locators/BAT/batmap back to the VHD */
+ if (err)
+ goto fail;
+
+ close(vhd->fd); /* drop the raw descriptor and the tables loaded by the restore before reopening properly */
+ free(vhd->bat.bat);
+ free(vhd->batmap.map);
+
+ err = vhd_open(vhd, file, VHD_OPEN_RDWR);
+ if (err)
+ goto fail;
+
+ err = vhd_get_bat(vhd);
+ if (err)
+ goto fail;
+
+ if (vhd_has_batmap(vhd)) {
+ err = vhd_get_batmap(vhd);
+ if (err)
+ goto fail;
+ }
+
+ err = vhd_journal_disable_vhd(j); /* rewrite the footer with the poison cookie while the journal is open */
+ if (err)
+ goto fail;
+
+ return 0;
+
+fail:
+ vhd_journal_close(j); /* NOTE(review): also runs vhd_close() on a context that may only hold the raw fd - confirm vhd_close() tolerates that */
+ return err;
+}
+
+/*
+ * Create a new journal @jfile for the VHD @file.
+ *
+ * @j:     journal handle to initialise (zeroed first)
+ * @file:  path of the VHD to protect
+ * @jfile: path of the journal; must not exist unless it is a block device
+ *
+ * The journal file is created (or, for a block device, opened), the VHD
+ * is opened in strict read/write mode, its BAT and batmap are loaded, the
+ * journal header and all VHD metadata are written to the journal, the VHD
+ * is marked disabled and the journal is synced.
+ *
+ * Failures before the VHD has been opened close and (for regular files)
+ * unlink the journal, free the name and zero @j.  Later failures are
+ * handled by vhd_journal_remove().
+ *
+ * Returns 0 on success or a negative error; -EEXIST if @jfile already
+ * exists and is not a block device.
+ */
+int
+vhd_journal_create(vhd_journal_t *j, const char *file, const char *jfile)
+{
+	int err;
+
+	memset(j, 0, sizeof(vhd_journal_t));
+	j->jfd = -1;
+
+	j->jname = strdup(jfile);
+	if (j->jname == NULL) {
+		err = -ENOMEM;
+		goto fail1;
+	}
+
+	if (access(j->jname, F_OK) == 0) {
+		err = vhd_test_file_fixed(j->jname, &j->is_block);
+		if (err)
+			goto fail1;
+
+		/* never clobber an existing regular file */
+		if (!j->is_block) {
+			err = -EEXIST;
+			goto fail1;
+		}
+	}
+
+	if (j->is_block)
+		j->jfd = open(j->jname, O_LARGEFILE | O_RDWR, 0644);
+	else
+		j->jfd = open(j->jname,
+			      O_CREAT | O_TRUNC | O_LARGEFILE | O_RDWR, 0644);
+	if (j->jfd == -1) {
+		err = -errno;
+		goto fail1;
+	}
+
+	err = vhd_open(&j->vhd, file, VHD_OPEN_RDWR | VHD_OPEN_STRICT);
+	if (err)
+		goto fail1;
+
+	err = vhd_get_bat(&j->vhd);
+	if (err)
+		goto fail2;
+
+	if (vhd_has_batmap(&j->vhd)) {
+		err = vhd_get_batmap(&j->vhd);
+		if (err)
+			goto fail2;
+	}
+
+	err = vhd_journal_add_journal_header(j);
+	if (err)
+		goto fail2;
+
+	err = vhd_journal_add_metadata(j);
+	if (err)
+		goto fail2;
+
+	err = vhd_journal_disable_vhd(j);
+	if (err)
+		goto fail2;
+
+	err = vhd_journal_sync(j);
+	if (err)
+		goto fail2;
+
+	return 0;
+
+fail1:
+	if (j->jfd != -1) {
+		close(j->jfd);
+		if (!j->is_block)
+			unlink(j->jname);
+	}
+	free(j->jname);
+	memset(j, 0, sizeof(vhd_journal_t));
+
+	return err;
+
+fail2:
+	vhd_journal_remove(j);
+	return err;
+}
+
+/*
+ * Journal the current contents of one block of a dynamic VHD.
+ *
+ * @j:     journal to append to
+ * @block: BAT index of the block
+ * @mode:  VHD_JOURNAL_METADATA to record the block's bitmap,
+ *         VHD_JOURNAL_DATA to record the block's data, or both
+ *
+ * Both parts are recorded as VHD_JOURNAL_ENTRY_TYPE_DATA entries; the
+ * data part is located vhd->bm_secs sectors after the bitmap.  A block
+ * whose BAT entry is DD_BLK_UNUSED has nothing to record and yields 0.
+ *
+ * Returns -EINVAL for a non-dynamic VHD, -ERANGE if @block is outside the
+ * BAT, another negative error on I/O failure, or the result of syncing
+ * the journal.
+ */
+int
+vhd_journal_add_block(vhd_journal_t *j, uint32_t block, char mode)
+{
+	int rc;
+	char *data = NULL;
+	off_t pos;
+	size_t len;
+	uint64_t sector;
+	vhd_context_t *ctx = &j->vhd;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	rc = vhd_get_bat(ctx);
+	if (rc)
+		return rc;
+
+	if (block >= ctx->bat.entries)
+		return -ERANGE;
+
+	sector = ctx->bat.bat[block];
+	if (sector == DD_BLK_UNUSED)
+		return 0;
+
+	pos = vhd_sectors_to_bytes(sector);
+
+	if (mode & VHD_JOURNAL_METADATA) {
+		len = vhd_sectors_to_bytes(ctx->bm_secs);
+
+		rc = vhd_read_bitmap(ctx, block, &data);
+		if (rc)
+			return rc;
+
+		rc = vhd_journal_update(j, pos, data, len,
+					VHD_JOURNAL_ENTRY_TYPE_DATA);
+		free(data);
+		if (rc)
+			return rc;
+	}
+
+	if (mode & VHD_JOURNAL_DATA) {
+		/* block data follows the block's bitmap */
+		pos += vhd_sectors_to_bytes(ctx->bm_secs);
+		len  = vhd_sectors_to_bytes(ctx->spb);
+
+		rc = vhd_read_block(ctx, block, &data);
+		if (rc)
+			return rc;
+
+		rc = vhd_journal_update(j, pos, data, len,
+					VHD_JOURNAL_ENTRY_TYPE_DATA);
+		free(data);
+		if (rc)
+			return rc;
+	}
+
+	return vhd_journal_sync(j);
+}
+
+/*
+ * Commit: the transaction completed successfully, so the undo log is no
+ * longer needed.
+ *
+ * The entry counters and offsets in the journal header are reset and the
+ * header is rewritten.  A journal that is not a block device is then
+ * truncated down to just its header.
+ *
+ * Returns 0 on success, the header write error, or -errno if the
+ * truncation fails.
+ */
+int
+vhd_journal_commit(vhd_journal_t *j)
+{
+	int rc;
+
+	j->header.journal_data_entries     = 0;
+	j->header.journal_metadata_entries = 0;
+	j->header.journal_data_offset      = 0;
+	j->header.journal_metadata_offset  = 0;
+
+	rc = vhd_journal_write_header(j, &j->header);
+	if (rc)
+		return rc;
+
+	/* a block device cannot be shrunk; nothing more to do */
+	if (j->is_block)
+		return 0;
+
+	if (vhd_journal_truncate(j, sizeof(vhd_journal_header_t)))
+		return -errno;
+
+	return 0;
+}
+
+/*
+ * Revert: the transaction failed, so undo its changes using the journal.
+ *
+ * The VHD context is closed and the file reopened as a raw descriptor,
+ * the journalled metadata is written back, and the VHD is reopened
+ * through vhd_open().  Every data entry in the journal is then read,
+ * checked against its checksum and written back to the VHD at the offset
+ * it was taken from.  Finally a VHD that is not a block device is
+ * truncated to end right after its footer, and the journal is synced.
+ *
+ * Returns 0 on success or a negative error.  errno is captured before
+ * any cleanup call that might overwrite it.
+ */
+int
+vhd_journal_revert(vhd_journal_t *j)
+{
+	int err;
+	uint32_t i;
+	char *buf, *file;
+	vhd_context_t *vhd;
+	vhd_journal_entry_t entry;
+
+	err = 0;
+	vhd = &j->vhd;
+	buf = NULL;
+
+	/* vhd_close() below invalidates vhd->file, so keep our own copy */
+	file = strdup(vhd->file);
+	if (!file)
+		return -ENOMEM;
+
+	vhd_close(&j->vhd);
+	j->vhd.fd = open(file, O_RDWR | O_DIRECT | O_LARGEFILE);
+	if (j->vhd.fd == -1) {
+		err = -errno;
+		free(file);
+		return err;
+	}
+
+	err = vhd_test_file_fixed(file, &vhd->is_block);
+	if (err) {
+		free(file);
+		return err;
+	}
+
+	err = vhd_journal_restore_metadata(j);
+	if (err) {
+		free(file);
+		return err;
+	}
+
+	close(vhd->fd);
+	free(vhd->bat.bat);
+	free(vhd->batmap.map);
+
+	err = vhd_open(vhd, file, VHD_OPEN_RDWR);
+	free(file);
+	if (err)
+		return err;
+
+	err = vhd_journal_seek(j, j->header.journal_data_offset, SEEK_SET);
+	if (err)
+		return err;
+
+	for (i = 0; i < j->header.journal_data_entries; i++) {
+		err = vhd_journal_read_entry(j, &entry);
+		if (err)
+			goto end;
+
+		err = posix_memalign((void **)&buf,
+				     VHD_SECTOR_SIZE, entry.size);
+		if (err) {
+			err = -err;
+			buf = NULL;
+			goto end;
+		}
+
+		err = vhd_journal_read(j, buf, entry.size);
+		if (err)
+			goto end;
+
+		/* never write back a payload that fails its checksum */
+		err = vhd_journal_validate_entry_data(&entry, buf);
+		if (err)
+			goto end;
+
+		err = vhd_seek(vhd, entry.offset, SEEK_SET);
+		if (err)
+			goto end;
+
+		err = vhd_write(vhd, buf, entry.size);
+		if (err)
+			goto end;
+
+		err = 0;
+
+	end:
+		free(buf);
+		buf = NULL;
+		if (err)
+			break;
+	}
+
+	if (err)
+		return err;
+
+	if (!vhd->is_block) {
+		err = ftruncate(vhd->fd, j->header.vhd_footer_offset +
+				sizeof(vhd_footer_t));
+		if (err)
+			return -errno;
+	}
+
+	return vhd_journal_sync(j);
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <libgen.h>
+#include <iconv.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <langinfo.h>
+
+#include "libvhd.h"
+#include "relative-path.h"
+
+/* VHD uses an epoch of 12:00AM, Jan 1, 2000. This is the Unix timestamp for
+ * the start of the VHD epoch. */
+#define VHD_EPOCH_START 946684800
+
+static int libvhd_dbg = 0; /* non-zero enables VHDLOG output; set by libvhd_set_log_level() */
+
+/*
+ * Turn libvhd debug logging on when @level is non-zero.
+ * A zero @level leaves the current setting unchanged.
+ */
+void
+libvhd_set_log_level(int level)
+{
+	if (level == 0)
+		return;
+
+	libvhd_dbg = 1;
+}
+
+#define VHDLOG(_f, _a...) /* printf-style debug log, prefixed with "libvhd::<function>: " */ \
+ do { \
+ if (libvhd_dbg) /* silent unless logging was enabled */ \
+ syslog(LOG_INFO, "libvhd::%s: "_f, \
+ __func__, ##_a); \
+ } while (0)
+
+#define BIT_MASK 0x80 /* top bit of a byte: test_bit/set_bit/clear_bit number bits from the most significant end */
+
+#ifdef ENABLE_FAILURE_TESTING
+const char* ENV_VAR_FAIL[NUM_FAIL_TESTS] = { /* one name per failure test; presumably environment variable names, judging by the identifier - confirm against the users of this table */
+ "VHD_UTIL_TEST_FAIL_REPARENT_BEGIN",
+ "VHD_UTIL_TEST_FAIL_REPARENT_LOCATOR",
+ "VHD_UTIL_TEST_FAIL_REPARENT_END",
+ "VHD_UTIL_TEST_FAIL_RESIZE_BEGIN",
+ "VHD_UTIL_TEST_FAIL_RESIZE_DATA_MOVED",
+ "VHD_UTIL_TEST_FAIL_RESIZE_METADATA_MOVED",
+ "VHD_UTIL_TEST_FAIL_RESIZE_END"
+};
+int TEST_FAIL[NUM_FAIL_TESTS]; /* per-test values, same size as ENV_VAR_FAIL; NUM_FAIL_TESTS is defined outside this file section */
+#endif // ENABLE_FAILURE_TESTING
+
+/*
+ * Return non-zero if bit @nr of the bitmap at @addr is set.
+ *
+ * Bits are numbered from the most significant bit of each byte: bit 0 is
+ * the 0x80 bit of addr[0].  The byte is converted to unsigned char before
+ * shifting so that a byte with its top bit set is never left-shifted as a
+ * negative value on platforms where char is signed.
+ */
+static inline int
+test_bit (volatile char *addr, int nr)
+{
+	return (((unsigned char)addr[nr >> 3] << (nr & 7)) & BIT_MASK) != 0;
+}
+
+/*
+ * Set bit @nr of the bitmap at @addr.  Bits are numbered from the most
+ * significant bit of each byte (bit 0 is the 0x80 bit of addr[0]).
+ */
+static inline void
+set_bit (volatile char *addr, int nr)
+{
+	volatile char *byte = addr + (nr >> 3);
+
+	*byte |= (BIT_MASK >> (nr & 7));
+}
+
+/*
+ * Clear bit @nr of the bitmap at @addr.  Bits are numbered from the most
+ * significant bit of each byte (bit 0 is the 0x80 bit of addr[0]).
+ */
+static inline void
+clear_bit (volatile char *addr, int nr)
+{
+	volatile char *byte = addr + (nr >> 3);
+
+	*byte &= ~(BIT_MASK >> (nr & 7));
+}
+
+/*
+ * Return bit @nr of the bitmap at @addr using the older layout, in which
+ * the bitmap is an array of 32-bit words and bits are numbered from the
+ * least significant bit of each word.
+ */
+static inline int
+old_test_bit(volatile char *addr, int nr)
+{
+	uint32_t *words = (uint32_t *)addr;
+
+	return (words[nr >> 5] >> (nr & 31)) & 1;
+}
+
+/*
+ * Set bit @nr of the bitmap at @addr using the older layout, in which the
+ * bitmap is an array of 32-bit words and bits are numbered from the least
+ * significant bit of each word.
+ *
+ * The mask is built from an unsigned 32-bit one so that selecting bit 31
+ * does not shift into the sign bit of an int.
+ */
+static inline void
+old_set_bit(volatile char *addr, int nr)
+{
+	((uint32_t *)addr)[nr >> 5] |= ((uint32_t)1 << (nr & 31));
+}
+
+/*
+ * Clear bit @nr of the bitmap at @addr using the older layout, in which
+ * the bitmap is an array of 32-bit words and bits are numbered from the
+ * least significant bit of each word.
+ *
+ * The mask is built from an unsigned 32-bit one so that selecting bit 31
+ * does not shift into the sign bit of an int.
+ */
+static inline void
+old_clear_bit(volatile char *addr, int nr)
+{
+	((uint32_t *)addr)[nr >> 5] &= ~((uint32_t)1 << (nr & 31));
+}
+
+void /* apply the BE32_IN/BE64_IN conversions, in place, to the multi-byte fields of a footer just read */
+vhd_footer_in(vhd_footer_t *footer)
+{
+ BE32_IN(&footer->features);
+ BE32_IN(&footer->ff_version);
+ BE64_IN(&footer->data_offset);
+ BE32_IN(&footer->timestamp);
+ BE32_IN(&footer->crtr_ver);
+ BE32_IN(&footer->crtr_os);
+ BE64_IN(&footer->orig_size);
+ BE64_IN(&footer->curr_size);
+ BE32_IN(&footer->geometry);
+ BE32_IN(&footer->type);
+ BE32_IN(&footer->checksum);
+}
+
+void /* apply the BE32_OUT/BE64_OUT conversions, in place, to the multi-byte fields of a footer about to be written */
+vhd_footer_out(vhd_footer_t *footer)
+{
+ BE32_OUT(&footer->features);
+ BE32_OUT(&footer->ff_version);
+ BE64_OUT(&footer->data_offset);
+ BE32_OUT(&footer->timestamp);
+ BE32_OUT(&footer->crtr_ver);
+ BE32_OUT(&footer->crtr_os);
+ BE64_OUT(&footer->orig_size);
+ BE64_OUT(&footer->curr_size);
+ BE32_OUT(&footer->geometry);
+ BE32_OUT(&footer->type);
+ BE32_OUT(&footer->checksum);
+}
+
+/*
+ * Apply the BE32_IN/BE64_IN conversions, in place, to the multi-byte
+ * fields of a VHD header just read, including the code, data_space,
+ * data_len and data_offset fields of every parent locator slot.
+ */
+void
+vhd_header_in(vhd_header_t *header)
+{
+	int idx;
+	const int slots = sizeof(header->loc) / sizeof(vhd_parent_locator_t);
+
+	BE64_IN(&header->data_offset);
+	BE64_IN(&header->table_offset);
+	BE32_IN(&header->hdr_ver);
+	BE32_IN(&header->max_bat_size);
+	BE32_IN(&header->block_size);
+	BE32_IN(&header->checksum);
+	BE32_IN(&header->prt_ts);
+
+	idx = 0;
+	while (idx < slots) {
+		BE32_IN(&header->loc[idx].code);
+		BE32_IN(&header->loc[idx].data_space);
+		BE32_IN(&header->loc[idx].data_len);
+		BE64_IN(&header->loc[idx].data_offset);
+		idx++;
+	}
+}
+
+/*
+ * Apply the BE32_OUT/BE64_OUT conversions, in place, to the multi-byte
+ * fields of a VHD header about to be written, including the code,
+ * data_space, data_len and data_offset fields of every parent locator
+ * slot.
+ */
+void
+vhd_header_out(vhd_header_t *header)
+{
+	int idx;
+	const int slots = sizeof(header->loc) / sizeof(vhd_parent_locator_t);
+
+	BE64_OUT(&header->data_offset);
+	BE64_OUT(&header->table_offset);
+	BE32_OUT(&header->hdr_ver);
+	BE32_OUT(&header->max_bat_size);
+	BE32_OUT(&header->block_size);
+	BE32_OUT(&header->checksum);
+	BE32_OUT(&header->prt_ts);
+
+	idx = 0;
+	while (idx < slots) {
+		BE32_OUT(&header->loc[idx].code);
+		BE32_OUT(&header->loc[idx].data_space);
+		BE32_OUT(&header->loc[idx].data_len);
+		BE64_OUT(&header->loc[idx].data_offset);
+		idx++;
+	}
+}
+
+void /* apply the BE32_IN/BE64_IN conversions, in place, to the batmap header fields just read */
+vhd_batmap_header_in(vhd_batmap_t *batmap)
+{
+ BE64_IN(&batmap->header.batmap_offset);
+ BE32_IN(&batmap->header.batmap_size);
+ BE32_IN(&batmap->header.batmap_version);
+ BE32_IN(&batmap->header.checksum);
+}
+
+void /* apply the BE32_OUT/BE64_OUT conversions, in place, to the batmap header fields about to be written */
+vhd_batmap_header_out(vhd_batmap_t *batmap)
+{
+ BE64_OUT(&batmap->header.batmap_offset);
+ BE32_OUT(&batmap->header.batmap_size);
+ BE32_OUT(&batmap->header.batmap_version);
+ BE32_OUT(&batmap->header.checksum);
+}
+
+/* Apply BE32_IN, in place, to each of the bat->entries entries of the BAT. */
+void
+vhd_bat_in(vhd_bat_t *bat)
+{
+	int idx = 0;
+
+	while (idx < bat->entries) {
+		BE32_IN(&bat->bat[idx]);
+		idx++;
+	}
+}
+
+/* Apply BE32_OUT, in place, to each of the bat->entries entries of the BAT. */
+void
+vhd_bat_out(vhd_bat_t *bat)
+{
+	int idx = 0;
+
+	while (idx < bat->entries) {
+		BE32_OUT(&bat->bat[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Compute the footer checksum: the one's complement of the sum of every
+ * byte of the footer, with the checksum field itself counted as zero.
+ * The footer's stored checksum is restored before returning.
+ */
+uint32_t
+vhd_checksum_footer(vhd_footer_t *footer)
+{
+	uint32_t sum, saved;
+	const unsigned char *cur, *end;
+
+	saved            = footer->checksum;
+	footer->checksum = 0;
+
+	sum = 0;
+	cur = (const unsigned char *)footer;
+	end = cur + sizeof(vhd_footer_t);
+	while (cur < end)
+		sum += (uint32_t)*cur++;
+
+	footer->checksum = saved;
+	return ~sum;
+}
+
+/*
+ * Validate a VHD footer.
+ *
+ * The cookie must be either HD_COOKIE or VHD_POISON_COOKIE and the stored
+ * checksum must match the computed one.  As a compatibility exception, a
+ * checksum mismatch is accepted for hidden VHDs created by "tap" versions
+ * 0.1 and 1.1 if the checksum matches once the hidden flag is ignored.
+ *
+ * Returns 0 if the footer is valid and -EINVAL otherwise.
+ */
+int
+vhd_validate_footer(vhd_footer_t *footer)
+{
+	int csize;
+	uint32_t checksum;
+
+	csize = sizeof(footer->cookie);
+	if (memcmp(footer->cookie, HD_COOKIE, csize) != 0 &&
+	    memcmp(footer->cookie, VHD_POISON_COOKIE, csize) != 0) {
+		/*
+		 * The cookie is not NUL-terminated; copy exactly its own
+		 * bytes and terminate the copy before logging it.
+		 */
+		char buf[sizeof(footer->cookie) + 1];
+		memcpy(buf, footer->cookie, sizeof(footer->cookie));
+		buf[sizeof(footer->cookie)] = '\0';
+		VHDLOG("invalid footer cookie: %s\n", buf);
+		return -EINVAL;
+	}
+
+	checksum = vhd_checksum_footer(footer);
+	if (checksum != footer->checksum) {
+		/*
+		 * early td-util did not re-calculate
+		 * checksum when marking vhds 'hidden'
+		 */
+		if (footer->hidden &&
+		    !strncmp(footer->crtr_app, "tap", 3) &&
+		    (footer->crtr_ver == VHD_VERSION(0, 1) ||
+		     footer->crtr_ver == VHD_VERSION(1, 1))) {
+			char tmp = footer->hidden;
+			footer->hidden = 0;
+			checksum = vhd_checksum_footer(footer);
+			footer->hidden = tmp;
+
+			if (checksum == footer->checksum)
+				return 0;
+		}
+
+		VHDLOG("invalid footer checksum: "
+		       "footer = 0x%08x, calculated = 0x%08x\n",
+		       footer->checksum, checksum);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Compute the header checksum: the one's complement of the sum of every
+ * byte of the header, with the checksum field itself counted as zero.
+ * The header's stored checksum is restored before returning.
+ */
+uint32_t
+vhd_checksum_header(vhd_header_t *header)
+{
+	uint32_t sum, saved;
+	const unsigned char *cur, *end;
+
+	saved            = header->checksum;
+	header->checksum = 0;
+
+	sum = 0;
+	cur = (const unsigned char *)header;
+	end = cur + sizeof(vhd_header_t);
+	while (cur < end)
+		sum += (uint32_t)*cur++;
+
+	header->checksum = saved;
+	return ~sum;
+}
+
+/*
+ * Validate a VHD header.
+ *
+ * Checks, in order: the DD_COOKIE cookie, the header version
+ * (0x00010000), the data_offset (must be all ones), the platform code of
+ * every parent locator slot, and the checksum.
+ *
+ * Returns 0 if the header is valid and -EINVAL otherwise.
+ */
+int
+vhd_validate_header(vhd_header_t *header)
+{
+	int i, n;
+	uint32_t checksum;
+
+	if (memcmp(header->cookie, DD_COOKIE, 8) != 0) {
+		/*
+		 * The cookie is not NUL-terminated; copy exactly the 8
+		 * bytes compared above and terminate the copy before
+		 * logging it.
+		 */
+		char buf[9];
+		memcpy(buf, header->cookie, 8);
+		buf[8] = '\0';
+		VHDLOG("invalid header cookie: %s\n", buf);
+		return -EINVAL;
+	}
+
+	if (header->hdr_ver != 0x00010000) {
+		VHDLOG("invalid header version 0x%08x\n", header->hdr_ver);
+		return -EINVAL;
+	}
+
+	if (header->data_offset != 0xFFFFFFFFFFFFFFFF) {
+		VHDLOG("invalid header data_offset 0x%016"PRIx64"\n",
+		       header->data_offset);
+		return -EINVAL;
+	}
+
+	n = sizeof(header->loc) / sizeof(vhd_parent_locator_t);
+	for (i = 0; i < n; i++)
+		if (vhd_validate_platform_code(header->loc[i].code))
+			return -EINVAL;
+
+	checksum = vhd_checksum_header(header);
+	if (checksum != header->checksum) {
+		VHDLOG("invalid header checksum: "
+		       "header = 0x%08x, calculated = 0x%08x\n",
+		       header->checksum, checksum);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * A BAT is considered usable once its table has been loaded:
+ * 0 if bat->bat is set, -EINVAL if it is NULL.
+ */
+static inline int
+vhd_validate_bat(vhd_bat_t *bat)
+{
+	return bat->bat ? 0 : -EINVAL;
+}
+
+/*
+ * Checksum of the batmap bitmap: one's complement of the sum of its
+ * bytes over header.batmap_size sectors.
+ *
+ * Batmap version 1.1 adds each byte as a plain 'char' (so the value is
+ * sign-extended wherever 'char' is signed); every other version adds
+ * it as an unsigned byte.  NOTE(review): the 1.1 behaviour is
+ * presumably kept to match checksums already on disk -- confirm.
+ */
+uint32_t
+vhd_checksum_batmap(vhd_batmap_t *batmap)
+{
+	int idx, len;
+	char *bytes = batmap->map;
+	uint32_t sum = 0;
+
+	len = vhd_sectors_to_bytes(batmap->header.batmap_size);
+
+	if (batmap->header.batmap_version == VHD_BATMAP_VERSION(1, 1)) {
+		for (idx = 0; idx < len; idx++)
+			sum += (uint32_t)bytes[idx];
+	} else {
+		for (idx = 0; idx < len; idx++)
+			sum += (uint32_t)(unsigned char)bytes[idx];
+	}
+
+	return ~sum;
+}
+
+/*
+ * Check a batmap header: the cookie must match VHD_BATMAP_COOKIE and
+ * the version must not be newer than VHD_BATMAP_CURRENT_VERSION.
+ * Returns 0 when both hold, -EINVAL otherwise.
+ */
+int
+vhd_validate_batmap_header(vhd_batmap_t *batmap)
+{
+	if (memcmp(batmap->header.cookie, VHD_BATMAP_COOKIE, 8) != 0 ||
+	    batmap->header.batmap_version > VHD_BATMAP_CURRENT_VERSION)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Check a loaded batmap: the bitmap must be present and its computed
+ * checksum must equal the one recorded in the batmap header.
+ * Returns 0 on success, -EINVAL otherwise.
+ */
+int
+vhd_validate_batmap(vhd_batmap_t *batmap)
+{
+	if (!batmap->map)
+		return -EINVAL;
+
+	return (vhd_checksum_batmap(batmap) == batmap->header.checksum ?
+		0 : -EINVAL);
+}
+
+/*
+ * Compute the file offset of the batmap header: it starts right after
+ * the sector-padded BAT, i.e. at header.table_offset plus the padded
+ * size of max_bat_size 32-bit entries.  Always returns 0.
+ */
+int
+vhd_batmap_header_offset(vhd_context_t *ctx, off_t *_off)
+{
+	size_t bat_bytes;
+	off_t pos;
+
+	*_off = 0;
+
+	bat_bytes = ctx->header.max_bat_size * sizeof(uint32_t);
+	pos       = ctx->header.table_offset;
+	pos      += vhd_bytes_padded(bat_bytes);
+
+	*_off = pos;
+	return 0;
+}
+
+/*
+ * Check that 'code' is one of the parent locator platform codes this
+ * library knows about.  Returns 0 for a known code; logs and returns
+ * -EINVAL for anything else.
+ */
+int
+vhd_validate_platform_code(uint32_t code)
+{
+	int rc = 0;
+
+	switch (code) {
+	case PLAT_CODE_NONE:
+	case PLAT_CODE_WI2R:
+	case PLAT_CODE_WI2K:
+	case PLAT_CODE_W2RU:
+	case PLAT_CODE_W2KU:
+	case PLAT_CODE_MAC:
+	case PLAT_CODE_MACX:
+		break;
+	default:
+		VHDLOG("invalid parent locator code %u\n", code);
+		rc = -EINVAL;
+		break;
+	}
+
+	return rc;
+}
+
+/*
+ * Number of parent locator slots in the dynamic-disk header.
+ */
+int
+vhd_parent_locator_count(vhd_context_t *ctx)
+{
+	size_t slots = sizeof(ctx->header.loc) / sizeof(ctx->header.loc[0]);
+
+	return slots;
+}
+
+/*
+ * Report the 'hidden' flag of a VHD through *hidden.
+ *
+ * For dynamic images created by tapdisk with creator version 0.1 or
+ * 1.1, the flag is taken from the footer copy stored at offset 0 of
+ * the file; for every other image it is taken from the footer already
+ * held in ctx.
+ *
+ * Returns 0 on success or the error from reading the footer copy
+ * (in which case *hidden is left at 0).
+ */
+int
+vhd_hidden(vhd_context_t *ctx, int *hidden)
+{
+	int err;
+
+	*hidden = 0;
+
+	if (vhd_type_dynamic(ctx) && vhd_creator_tapdisk(ctx) &&
+	    (ctx->footer.crtr_ver == VHD_VERSION(0, 1) ||
+	     ctx->footer.crtr_ver == VHD_VERSION(1, 1))) {
+		vhd_footer_t copy;
+
+		/* footer copy lives at the very start of the file */
+		err = vhd_read_footer_at(ctx, &copy, 0);
+		if (err) {
+			VHDLOG("error reading backup footer of %s: %d\n",
+			       ctx->file, err);
+			return err;
+		}
+		*hidden = copy.hidden;
+	} else
+		*hidden = ctx->footer.hidden;
+
+	return 0;
+}
+
+/*
+ * Count the images in the chain that starts at ctx.
+ *
+ * Starting with ctx itself, each image counts as one.  While the
+ * current image is a differencing disk (HD_TYPE_DIFF), its parent is
+ * located with vhd_parent_locator_get() and opened read-only; a raw
+ * parent is counted as one more level without being opened.
+ *
+ * Contexts opened here are closed here; ctx itself is never closed.
+ * On success *depth receives the count and 0 is returned; otherwise
+ * *depth stays 0 and the error from the locator lookup or vhd_open()
+ * is returned.
+ */
+int
+vhd_chain_depth(vhd_context_t *ctx, int *depth)
+{
+ char *file;
+ int err, cnt;
+ vhd_context_t vhd, *cur;
+
+ err = 0;
+ cnt = 0;
+ *depth = 0;
+ file = NULL;
+ cur = ctx;
+
+ for (;;) {
+ cnt++;
+
+ /* anything but a differencing disk terminates the chain */
+ if (cur->footer.type != HD_TYPE_DIFF)
+ break;
+
+ /* a raw parent is the last level; count it and stop */
+ if (vhd_parent_raw(cur)) {
+ cnt++;
+ break;
+ }
+
+ err = vhd_parent_locator_get(cur, &file);
+ if (err) {
+ file = NULL;
+ break;
+ }
+
+ /* 'vhd' is reused for every level: close it before reopening */
+ if (cur != ctx) {
+ vhd_close(cur);
+ cur = NULL;
+ }
+
+ err = vhd_open(&vhd, file, VHD_OPEN_RDONLY);
+ if (err)
+ break;
+
+ cur = &vhd;
+ free(file);
+ file = NULL;
+ }
+
+ /* release whatever is still held when the loop was left */
+ free(file);
+ if (cur && cur != ctx)
+ vhd_close(cur);
+
+ if (!err)
+ *depth = cnt;
+
+ return err;
+}
+
+/*
+ * Test the batmap bit for 'block'.  Returns 0 when the image has no
+ * batmap, the map is not loaded, or 'block' lies beyond the number of
+ * bits the map holds (batmap_size sectors * bits per sector).
+ */
+int
+vhd_batmap_test(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block)
+{
+	if (vhd_has_batmap(ctx) && batmap->map &&
+	    block < (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3)))
+		return test_bit(batmap->map, block);
+
+	return 0;
+}
+
+/*
+ * Set the batmap bit for 'block'.  Does nothing when the image has no
+ * batmap, the map is not loaded, or 'block' lies beyond the number of
+ * bits the map holds.
+ */
+void
+vhd_batmap_set(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block)
+{
+	if (vhd_has_batmap(ctx) && batmap->map &&
+	    block < (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3)))
+		set_bit(batmap->map, block);
+}
+
+/*
+ * Clear the batmap bit for 'block'.  Does nothing when the image has
+ * no batmap, the map is not loaded, or 'block' lies beyond the number
+ * of bits the map holds.
+ */
+void
+vhd_batmap_clear(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block)
+{
+	if (vhd_has_batmap(ctx) && batmap->map &&
+	    block < (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3)))
+		clear_bit(batmap->map, block);
+}
+
+/*
+ * Test bit 'block' of a block bitmap.  Images created by tapdisk with
+ * creator version 0x00000001 go through old_test_bit(); all others use
+ * test_bit().
+ */
+int
+vhd_bitmap_test(vhd_context_t *ctx, char *map, uint32_t block)
+{
+	int legacy;
+
+	legacy = (vhd_creator_tapdisk(ctx) &&
+		  ctx->footer.crtr_ver == 0x00000001);
+
+	return (legacy ? old_test_bit(map, block) : test_bit(map, block));
+}
+
+/*
+ * Set bit 'block' of a block bitmap.  Images created by tapdisk with
+ * creator version 0x00000001 go through old_set_bit(); all others use
+ * set_bit().
+ */
+void
+vhd_bitmap_set(vhd_context_t *ctx, char *map, uint32_t block)
+{
+	/*
+	 * a void function must not 'return <expression>' in ISO C,
+	 * so call the helper and fall off the end instead.
+	 */
+	if (vhd_creator_tapdisk(ctx) &&
+	    ctx->footer.crtr_ver == 0x00000001)
+		old_set_bit(map, block);
+	else
+		set_bit(map, block);
+}
+
+/*
+ * Clear bit 'block' of a block bitmap.  Images created by tapdisk with
+ * creator version 0x00000001 go through old_clear_bit(); all others
+ * use clear_bit().
+ */
+void
+vhd_bitmap_clear(vhd_context_t *ctx, char *map, uint32_t block)
+{
+	/*
+	 * a void function must not 'return <expression>' in ISO C,
+	 * so call the helper and fall off the end instead.
+	 */
+	if (vhd_creator_tapdisk(ctx) &&
+	    ctx->footer.crtr_ver == 0x00000001)
+		old_clear_bit(map, block);
+	else
+		clear_bit(map, block);
+}
+
+/*
+ * returns absolute offset of the first
+ * byte of the file which is not vhd metadata
+ *
+ * The result is the largest end offset among: the dynamic-disk header,
+ * the sector-padded BAT, the batmap header and bitmap (when the image
+ * has a batmap), and every parent locator whose code is not
+ * PLAT_CODE_NONE.
+ *
+ * For images that are not dynamic, *end is set to 0 and 0 is returned.
+ * Returns 0 on success or the error from loading the batmap.
+ */
+int
+vhd_end_of_headers(vhd_context_t *ctx, off_t *end)
+{
+ int err, i, n;
+ uint32_t bat_bytes;
+ off_t eom, bat_end;
+ vhd_parent_locator_t *loc;
+
+ *end = 0;
+
+ if (!vhd_type_dynamic(ctx))
+ return 0;
+
+ /* the header starts at footer.data_offset */
+ eom = ctx->footer.data_offset + sizeof(vhd_header_t);
+
+ bat_bytes = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+ bat_end = ctx->header.table_offset + bat_bytes;
+
+ eom = MAX(eom, bat_end);
+
+ if (vhd_has_batmap(ctx)) {
+ off_t hdr_end, hdr_secs, map_end, map_secs;
+
+ /* make sure ctx->batmap is loaded before reading its header */
+ err = vhd_get_batmap(ctx);
+ if (err)
+ return err;
+
+ hdr_secs = secs_round_up_no_zero(sizeof(vhd_batmap_header_t));
+ err = vhd_batmap_header_offset(ctx, &hdr_end);
+ if (err)
+ return err;
+
+ hdr_end += vhd_sectors_to_bytes(hdr_secs);
+ eom = MAX(eom, hdr_end);
+
+ map_secs = ctx->batmap.header.batmap_size;
+ map_end = (ctx->batmap.header.batmap_offset +
+ vhd_sectors_to_bytes(map_secs));
+ eom = MAX(eom, map_end);
+ }
+
+ /* parent locators */
+ n = sizeof(ctx->header.loc) / sizeof(vhd_parent_locator_t);
+
+ for (i = 0; i < n; i++) {
+ off_t loc_end;
+
+ loc = &ctx->header.loc[i];
+ if (loc->code == PLAT_CODE_NONE)
+ continue;
+
+ loc_end = loc->data_offset + vhd_parent_locator_size(loc);
+ eom = MAX(eom, loc_end);
+ }
+
+ *end = eom;
+ return 0;
+}
+
+/*
+ * Compute the absolute offset just past the last byte of image data,
+ * storing it in *end.
+ *
+ * For images that are not dynamic this is the current end of file
+ * minus the size of one footer.  For dynamic images it is the larger
+ * of the end of the metadata (vhd_end_of_headers()) and the end of
+ * the furthest allocated block found in the BAT.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_end_of_data(vhd_context_t *ctx, off_t *end)
+{
+ int i, err;
+ off_t max;
+ uint64_t blk;
+
+ if (!vhd_type_dynamic(ctx)) {
+ err = vhd_seek(ctx, 0, SEEK_END);
+ if (err)
+ return err;
+
+ max = vhd_position(ctx);
+ if (max == (off_t)-1)
+ return -errno;
+
+ *end = max - sizeof(vhd_footer_t);
+ return 0;
+ }
+
+ err = vhd_end_of_headers(ctx, &max);
+ if (err)
+ return err;
+
+ err = vhd_get_bat(ctx);
+ if (err)
+ return err;
+
+ /* from here on 'max' is counted in sectors, like the BAT entries */
+ max >>= VHD_SECTOR_SHIFT;
+
+ for (i = 0; i < ctx->bat.entries; i++) {
+ blk = ctx->bat.bat[i];
+
+ if (blk != DD_BLK_UNUSED) {
+ /* end of this block: start + data sectors + bitmap sectors */
+ blk += ctx->spb + ctx->bm_secs;
+ max = MAX(blk, max);
+ }
+ }
+
+ *end = vhd_sectors_to_bytes(max);
+ return 0;
+}
+
+/*
+ * Convert a time_t value into a VHD timestamp: the number of seconds
+ * elapsed since VHD_EPOCH_START, truncated to 32 bits.
+ */
+uint32_t inline
+vhd_time(time_t time)
+{
+	time_t since_epoch = time - VHD_EPOCH_START;
+
+	return (uint32_t)since_epoch;
+}
+
+/*
+ * Stringify the VHD timestamp for printing.
+ * As with ctime_r, target must be >=26 bytes.
+ *
+ * The trailing newline produced by ctime_r is stripped.  Returns the
+ * length of the resulting string; if the time cannot be converted,
+ * target is set to the empty string and 0 is returned.
+ */
+size_t
+vhd_time_to_string(uint32_t timestamp, char *target)
+{
+	char *cr;
+	time_t unix_timestamp;
+
+	unix_timestamp = (time_t)timestamp + VHD_EPOCH_START;
+
+	/* on failure ctime_r leaves target unspecified: don't scan it */
+	if (!ctime_r(&unix_timestamp, target)) {
+		target[0] = '\0';
+		return 0;
+	}
+
+	/* handle mad ctime_r newline appending. */
+	if ((cr = strchr(target, '\n')) != NULL)
+		*cr = '\0';
+
+	return (strlen(target));
+}
+
+/*
+ * nabbed from vhd specs.
+ *
+ * Derive the cylinder/head/sector geometry for a disk of 'size' bytes
+ * and return it packed by GEOM_ENCODE().  The sector count is capped
+ * at 65535 * 16 * 255 before the geometry is chosen.
+ */
+uint32_t
+vhd_chs(uint64_t size)
+{
+	uint64_t total;
+	uint32_t secs, cylinders, heads, spt, cth;
+
+	/*
+	 * keep the sector count 64 bits wide until it has been clamped:
+	 * narrowing first would make very large sizes wrap around to a
+	 * small geometry instead of saturating at the maximum.
+	 */
+	total = secs_round_up_no_zero(size);
+
+	if (total > 65535 * 16 * 255)
+		total = 65535 * 16 * 255;
+
+	secs = (uint32_t)total;
+
+	if (secs >= 65535 * 16 * 63) {
+		spt   = 255;
+		cth   = secs / spt;
+		heads = 16;
+	} else {
+		spt   = 17;
+		cth   = secs / spt;
+		heads = (cth + 1023) / 1024;
+
+		if (heads < 4)
+			heads = 4;
+
+		if (cth >= (heads * 1024) || heads > 16) {
+			spt   = 31;
+			cth   = secs / spt;
+			heads = 16;
+		}
+
+		if (cth >= heads * 1024) {
+			spt   = 63;
+			cth   = secs / spt;
+			heads = 16;
+		}
+	}
+
+	cylinders = cth / heads;
+
+	return GEOM_ENCODE(cylinders, heads, spt);
+}
+
+/*
+ * Make sure ctx->footer holds a valid footer: if the cached one does
+ * not validate, (re)read it from the file.  Returns 0 or the error
+ * from vhd_read_footer().
+ */
+int
+vhd_get_footer(vhd_context_t *ctx)
+{
+	int cached_ok = (vhd_validate_footer(&ctx->footer) == 0);
+
+	return (cached_ok ? 0 : vhd_read_footer(ctx, &ctx->footer));
+}
+
+/*
+ * Make sure ctx->header holds a valid header: if the cached one does
+ * not validate, (re)read it from the file.  Returns -EINVAL for
+ * images that are not dynamic, otherwise 0 or the read error.
+ */
+int
+vhd_get_header(vhd_context_t *ctx)
+{
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	if (vhd_validate_header(&ctx->header) != 0)
+		return vhd_read_header(ctx, &ctx->header);
+
+	return 0;
+}
+
+/*
+ * Make sure ctx->bat is loaded.  If it is not, any previous state is
+ * dropped and the BAT is read from the file.  Returns -EINVAL for
+ * images that are not dynamic, otherwise 0 or the read error.
+ */
+int
+vhd_get_bat(vhd_context_t *ctx)
+{
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	if (vhd_validate_bat(&ctx->bat) != 0) {
+		vhd_put_bat(ctx);
+		return vhd_read_bat(ctx, &ctx->bat);
+	}
+
+	return 0;
+}
+
+/*
+ * Make sure ctx->batmap is loaded and valid.  If it is not, any
+ * previous state is dropped and the batmap is read from the file.
+ * Returns -EINVAL when the image has no batmap, otherwise 0 or the
+ * read error.
+ */
+int
+vhd_get_batmap(vhd_context_t *ctx)
+{
+	if (!vhd_has_batmap(ctx))
+		return -EINVAL;
+
+	if (vhd_validate_batmap(&ctx->batmap) != 0) {
+		vhd_put_batmap(ctx);
+		return vhd_read_batmap(ctx, &ctx->batmap);
+	}
+
+	return 0;
+}
+
+/*
+ * Forget the cached footer by zeroing ctx->footer.
+ */
+void
+vhd_put_footer(vhd_context_t *ctx)
+{
+	memset(&ctx->footer, 0, sizeof(ctx->footer));
+}
+
+/*
+ * Forget the cached header by zeroing ctx->header.
+ */
+void
+vhd_put_header(vhd_context_t *ctx)
+{
+	memset(&ctx->header, 0, sizeof(ctx->header));
+}
+
+/*
+ * Release the cached BAT of a dynamic image: free the table and zero
+ * ctx->bat.  Does nothing for images that are not dynamic.
+ */
+void
+vhd_put_bat(vhd_context_t *ctx)
+{
+	if (vhd_type_dynamic(ctx)) {
+		free(ctx->bat.bat);
+		memset(&ctx->bat, 0, sizeof(ctx->bat));
+	}
+}
+
+/*
+ * Release the cached batmap: free the bitmap and zero ctx->batmap.
+ * Does nothing unless the image is dynamic and has a batmap.
+ */
+void
+vhd_put_batmap(vhd_context_t *ctx)
+{
+	if (!vhd_type_dynamic(ctx) || !vhd_has_batmap(ctx))
+		return;
+
+	free(ctx->batmap.map);
+	memset(&ctx->batmap, 0, sizeof(ctx->batmap));
+}
+
+/*
+ * look for 511 byte footer at end of file
+ *
+ * Seeks to 511 bytes before end of file, reads into a zeroed,
+ * sector-aligned buffer the size of a full footer, then converts and
+ * validates the result in *footer.
+ *
+ * Returns 0 if a valid footer was found, otherwise a negative errno
+ * value (the validation result, or the seek/allocation error).
+ */
+int
+vhd_read_short_footer(vhd_context_t *ctx, vhd_footer_t *footer)
+{
+ int err;
+ char *buf;
+ off_t eof;
+
+ buf = NULL;
+
+ err = vhd_seek(ctx, 0, SEEK_END);
+ if (err)
+ goto out;
+
+ eof = vhd_position(ctx);
+ if (eof == (off_t)-1) {
+ err = -errno;
+ goto out;
+ }
+
+ err = vhd_seek(ctx, eof - 511, SEEK_SET);
+ if (err)
+ goto out;
+
+ err = posix_memalign((void **)&buf,
+ VHD_SECTOR_SIZE, sizeof(vhd_footer_t));
+ if (err) {
+ buf = NULL;
+ err = -err;
+ goto out;
+ }
+
+ /* bytes the short read does not fill must read as zero */
+ memset(buf, 0, sizeof(vhd_footer_t));
+
+ /*
+ * expecting short read here
+ * (so the return value of vhd_read is deliberately ignored)
+ */
+ vhd_read(ctx, buf, sizeof(vhd_footer_t));
+
+ memcpy(footer, buf, sizeof(vhd_footer_t));
+
+ vhd_footer_in(footer);
+ err = vhd_validate_footer(footer);
+
+out:
+ if (err)
+ VHDLOG("%s: failed reading short footer: %d\n",
+ ctx->file, err);
+ free(buf);
+ return err;
+}
+
+/*
+ * Read the footer stored at absolute offset 'off' into *footer,
+ * convert it with vhd_footer_in() and validate it.  The read goes
+ * through a sector-aligned bounce buffer.
+ *
+ * Returns 0 on success or a negative errno value; failures are logged.
+ */
+int
+vhd_read_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off_t off)
+{
+	void *sector = NULL;
+	int err;
+
+	err = vhd_seek(ctx, off, SEEK_SET);
+	if (err)
+		goto out;
+
+	err = posix_memalign(&sector, VHD_SECTOR_SIZE, sizeof(*footer));
+	if (err) {
+		sector = NULL;
+		err    = -err;
+		goto out;
+	}
+
+	err = vhd_read(ctx, sector, sizeof(*footer));
+	if (!err) {
+		memcpy(footer, sector, sizeof(*footer));
+		vhd_footer_in(footer);
+		err = vhd_validate_footer(footer);
+	}
+
+out:
+	if (err)
+		VHDLOG("%s: reading footer at 0x%08"PRIx64" failed: %d\n",
+		       ctx->file, off, err);
+	free(sector);
+	return err;
+}
+
+/*
+ * Locate and read the footer of the image into *footer.
+ *
+ * Three places are tried, moving on only while the previous attempt
+ * returned -EINVAL: the last 512 bytes of the file, a 511-byte footer
+ * at end of file, and finally the copy at offset 0.  The copy at
+ * offset 0 is not consulted when the image was opened with
+ * VHD_OPEN_STRICT.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_read_footer(vhd_context_t *ctx, vhd_footer_t *footer)
+{
+	off_t eof;
+	int err;
+
+	err = vhd_seek(ctx, 0, SEEK_END);
+	if (err)
+		return err;
+
+	eof = vhd_position(ctx);
+	if (eof == (off_t)-1)
+		return -errno;
+
+	err = vhd_read_footer_at(ctx, footer, eof - 512);
+
+	if (err == -EINVAL)
+		err = vhd_read_short_footer(ctx, footer);
+
+	if (err == -EINVAL && !(ctx->oflags & VHD_OPEN_STRICT))
+		err = vhd_read_footer_at(ctx, footer, 0);
+
+	return err;
+}
+
+/*
+ * Read the dynamic-disk header stored at absolute offset 'off' into
+ * *header, convert it with vhd_header_in() and validate it.  The read
+ * goes through a sector-aligned bounce buffer.
+ *
+ * Returns -EINVAL for images that are not dynamic, otherwise 0 or a
+ * negative errno value; failures are logged.
+ */
+int
+vhd_read_header_at(vhd_context_t *ctx, vhd_header_t *header, off_t off)
+{
+	void *sector = NULL;
+	int err;
+
+	if (!vhd_type_dynamic(ctx)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = vhd_seek(ctx, off, SEEK_SET);
+	if (err)
+		goto out;
+
+	err = posix_memalign(&sector, VHD_SECTOR_SIZE, sizeof(*header));
+	if (err) {
+		sector = NULL;
+		err    = -err;
+		goto out;
+	}
+
+	err = vhd_read(ctx, sector, sizeof(*header));
+	if (!err) {
+		memcpy(header, sector, sizeof(*header));
+		vhd_header_in(header);
+		err = vhd_validate_header(header);
+	}
+
+out:
+	if (err)
+		VHDLOG("%s: reading header at 0x%08"PRIx64" failed: %d\n",
+		       ctx->file, off, err);
+	free(sector);
+	return err;
+}
+
+/*
+ * Read the dynamic-disk header from the location recorded in the
+ * footer (footer.data_offset).  Logs and returns -EINVAL for images
+ * that are not dynamic.
+ */
+int
+vhd_read_header(vhd_context_t *ctx, vhd_header_t *header)
+{
+	if (vhd_type_dynamic(ctx))
+		return vhd_read_header_at(ctx, header,
+					  ctx->footer.data_offset);
+
+	VHDLOG("%s is not dynamic!\n", ctx->file);
+	return -EINVAL;
+}
+
+/*
+ * Read the block allocation table from header.table_offset into a
+ * freshly allocated, sector-aligned buffer and fill in *bat (entries,
+ * sectors per block, table pointer), converting it with vhd_bat_in().
+ *
+ * On success the buffer is owned by *bat.  On failure *bat is zeroed,
+ * the error is logged and a negative errno value is returned.
+ */
+int
+vhd_read_bat(vhd_context_t *ctx, vhd_bat_t *bat)
+{
+	void *table = NULL;
+	size_t bytes;
+	off_t pos;
+	int err;
+
+	if (!vhd_type_dynamic(ctx)) {
+		err = -EINVAL;
+		goto fail;
+	}
+
+	pos   = ctx->header.table_offset;
+	bytes = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+
+	err = posix_memalign(&table, VHD_SECTOR_SIZE, bytes);
+	if (err) {
+		table = NULL;
+		err   = -err;
+		goto fail;
+	}
+
+	err = vhd_seek(ctx, pos, SEEK_SET);
+	if (err)
+		goto fail;
+
+	err = vhd_read(ctx, table, bytes);
+	if (err)
+		goto fail;
+
+	bat->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+	bat->entries = ctx->header.max_bat_size;
+	bat->bat     = (uint32_t *)table;
+
+	vhd_bat_in(bat);
+
+	return 0;
+
+fail:
+	free(table);
+	memset(bat, 0, sizeof(*bat));
+	VHDLOG("%s: failed to read bat: %d\n", ctx->file, err);
+	return err;
+}
+
+/*
+ * Read the batmap header (located right after the padded BAT) into
+ * batmap->header and convert it with vhd_batmap_header_in().
+ *
+ * On failure batmap->header is zeroed, the error is logged and a
+ * negative errno value is returned.
+ */
+static int
+vhd_read_batmap_header(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+	void *sector = NULL;
+	size_t bytes;
+	off_t pos;
+	int err;
+
+	err = vhd_batmap_header_offset(ctx, &pos);
+	if (err)
+		goto fail;
+
+	err = vhd_seek(ctx, pos, SEEK_SET);
+	if (err)
+		goto fail;
+
+	bytes = vhd_bytes_padded(sizeof(vhd_batmap_header_t));
+	err   = posix_memalign(&sector, VHD_SECTOR_SIZE, bytes);
+	if (err) {
+		sector = NULL;
+		err    = -err;
+		goto fail;
+	}
+
+	err = vhd_read(ctx, sector, bytes);
+	if (err)
+		goto fail;
+
+	memcpy(&batmap->header, sector, sizeof(vhd_batmap_header_t));
+	free(sector);
+
+	vhd_batmap_header_in(batmap);
+
+	return 0;
+
+fail:
+	free(sector);
+	memset(&batmap->header, 0, sizeof(vhd_batmap_header_t));
+	VHDLOG("%s: failed to read batmap header: %d\n", ctx->file, err);
+	return err;
+}
+
+/*
+ * Read the batmap bitmap described by batmap->header (batmap_offset,
+ * batmap_size sectors) into a freshly allocated, sector-aligned
+ * buffer and store it in batmap->map.
+ *
+ * On failure batmap->map is set to NULL, the error is logged and a
+ * negative errno value is returned.
+ */
+static int
+vhd_read_batmap_map(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+	void *bits = NULL;
+	size_t bytes;
+	int err;
+
+	bytes = vhd_sectors_to_bytes(batmap->header.batmap_size);
+
+	err = posix_memalign(&bits, VHD_SECTOR_SIZE, bytes);
+	if (err) {
+		bits = NULL;
+		err  = -err;
+		goto fail;
+	}
+
+	err = vhd_seek(ctx, (off_t)batmap->header.batmap_offset, SEEK_SET);
+	if (err)
+		goto fail;
+
+	err = vhd_read(ctx, bits, bytes);
+	if (err)
+		goto fail;
+
+	batmap->map = bits;
+	return 0;
+
+fail:
+	free(bits);
+	batmap->map = NULL;
+	VHDLOG("%s: failed to read batmap: %d\n", ctx->file, err);
+	return err;
+}
+
+/*
+ * Load the whole batmap (header, then bitmap) into *batmap and
+ * validate both parts.  *batmap is zeroed first.
+ *
+ * Returns -EINVAL when the image has no batmap, otherwise 0 or the
+ * first error met.  If the bitmap fails validation it is freed and
+ * *batmap is zeroed again.
+ */
+int
+vhd_read_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+	int err;
+
+	if (!vhd_has_batmap(ctx))
+		return -EINVAL;
+
+	memset(batmap, 0, sizeof(*batmap));
+
+	err = vhd_read_batmap_header(ctx, batmap);
+	if (!err)
+		err = vhd_validate_batmap_header(batmap);
+	if (!err)
+		err = vhd_read_batmap_map(ctx, batmap);
+	if (err)
+		return err;
+
+	err = vhd_validate_batmap(batmap);
+	if (err) {
+		free(batmap->map);
+		memset(batmap, 0, sizeof(*batmap));
+	}
+
+	return err;
+}
+
+/*
+ * Tell whether the image carries a batmap (1) or not (0).
+ *
+ * Only dynamic images created by tapdisk can have one: creator
+ * versions up to 0.1 never do, versions from 1.2 on always do.
+ * For the versions in between the batmap header is probed; note that
+ * this may read the header from the file into ctx->batmap.
+ */
+int
+vhd_has_batmap(vhd_context_t *ctx)
+{
+	if (!vhd_type_dynamic(ctx) || !vhd_creator_tapdisk(ctx))
+		return 0;
+
+	if (ctx->footer.crtr_ver <= VHD_VERSION(0, 1))
+		return 0;
+
+	if (ctx->footer.crtr_ver >= VHD_VERSION(1, 2))
+		return 1;
+
+	/*
+	 * VHDs of version 1.1 probably have a batmap, but may not
+	 * if they were updated from version 0.1 via vhd-update.
+	 */
+	if (vhd_validate_batmap_header(&ctx->batmap) == 0)
+		return 1;
+
+	if (vhd_read_batmap_header(ctx, &ctx->batmap) != 0)
+		return 0;
+
+	return (vhd_validate_batmap_header(&ctx->batmap) == 0);
+}
+
+/*
+ * Is this a block device (with a fixed size)? This affects whether the file
+ * can be truncated and where the footer is written for VHDs.
+ *
+ * Sets *is_block to 1 if 'file' is a block device and to 0 otherwise.
+ * Returns 0 on success or -errno if the file cannot be stat'ed, in
+ * which case *is_block is left untouched.
+ */
+int
+vhd_test_file_fixed(const char *file, int *is_block)
+{
+	struct stat st;
+
+	if (stat(file, &st) == -1)
+		return -errno;
+
+	*is_block = (S_ISBLK(st.st_mode) ? 1 : 0);
+	return 0;
+}
+
+/*
+ * Resolve the name of a parent image to a readable path.
+ *
+ * An absolute 'parent' that is readable is returned as is.  Otherwise
+ * 'parent' is looked up relative to the directory of the child image
+ * (ctx->file) and, if readable, returned in canonical form.
+ *
+ * On success 0 is returned and *_location holds a malloc'ed path that
+ * the caller must free.  On failure *_location is NULL and a negative
+ * errno value is returned.
+ */
+int
+vhd_find_parent(vhd_context_t *ctx, const char *parent, char **_location)
+{
+	int err;
+	char *location, *cpath, *cdir, *path;
+
+	err        = 0;
+	path       = NULL;
+	cpath      = NULL;
+	location   = NULL;
+	*_location = NULL;
+
+	if (!parent)
+		return -EINVAL;
+
+	if (parent[0] == '/') {
+		if (!access(parent, R_OK)) {
+			path = strdup(parent);
+			if (!path)
+				return -ENOMEM;
+			*_location = path;
+			return 0;
+		}
+	}
+
+	/* check parent path relative to child's directory */
+	cpath = realpath(ctx->file, NULL);
+	if (!cpath) {
+		err = -errno;
+		goto out;
+	}
+
+	cdir = dirname(cpath);
+	if (asprintf(&location, "%s/%s", cdir, parent) == -1) {
+		/* asprintf is not guaranteed to set errno */
+		err      = -ENOMEM;
+		location = NULL;
+		goto out;
+	}
+
+	if (access(location, R_OK)) {
+		err = -errno;
+		goto out;
+	}
+
+	path = realpath(location, NULL);
+	if (!path) {
+		err = -errno;
+		goto out;
+	}
+
+	*_location = path;
+
+out:
+	/* scratch strings are released on success as well as on failure */
+	free(location);
+	free(cpath);
+	return err;
+}
+
+/*
+ * Encode a parent path as a MACX locator: "file://" followed by
+ * 'name', converted from the locale's codeset to UTF-8.
+ *
+ * On success 0 is returned, *out holds a malloc'ed buffer (not
+ * NUL-terminated) that the caller must free, and *outlen its length
+ * in bytes.  On failure a negative errno value is returned and *out
+ * is NULL.
+ */
+static int
+vhd_macx_encode_location(char *name, char **out, int *outlen)
+{
+	iconv_t cd;
+	int len, err;
+	size_t ibl, obl, used;
+	char *uri, *uri_utf8, *uri_utf8p, *ret;
+	const char *urip;
+	char *codeset;
+
+	err     = 0;
+	ret     = NULL;
+	*out    = NULL;
+	*outlen = 0;
+	cd      = (iconv_t)-1;
+	len     = strlen(name) + strlen("file://");
+
+	ibl     = len;
+	obl     = len * 2;
+
+	urip = uri = malloc(ibl + 1);
+	uri_utf8 = uri_utf8p = malloc(obl);
+
+	/* go through 'out' so that a lone successful malloc is freed */
+	if (!uri || !uri_utf8) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	codeset = nl_langinfo(CODESET);
+	cd = iconv_open("UTF-8", codeset);
+	if (cd == (iconv_t)-1) {
+		err = -errno;
+		goto out;
+	}
+
+	snprintf(uri, ibl+1, "file://%s", name);
+
+	if (iconv(cd,
+#ifdef __linux__
+	    (char **)
+#endif
+	    &urip, &ibl, &uri_utf8p, &obl) == (size_t)-1 ||
+	    ibl) {
+		err = (errno ? -errno : -EIO);
+		goto out;
+	}
+
+	/*
+	 * the converted text need not be as long as the input:
+	 * report what iconv actually produced.
+	 */
+	used = ((size_t)len * 2) - obl;
+
+	ret = malloc(used);
+	if (!ret) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	memcpy(ret, uri_utf8, used);
+	*outlen = used;
+	*out    = ret;
+
+ out:
+	free(uri);
+	free(uri_utf8);
+	if (cd != (iconv_t)-1)
+		iconv_close(cd);
+
+	return err;
+}
+
+/*
+ * Encode a parent path as a W2KU/W2RU locator: the path with '/'
+ * replaced by '\', converted from the locale's codeset to little
+ * endian UTF-16.  Relative paths are made to start with ".\".
+ *
+ * On success 0 is returned, *out holds a malloc'ed buffer (not
+ * NUL-terminated) that the caller must free, and *outlen its length
+ * in bytes.  On failure a negative errno value is returned and *out
+ * is NULL.
+ */
+static int
+vhd_w2u_encode_location(char *name, char **out, int *outlen)
+{
+	iconv_t cd;
+	int len, err;
+	size_t ibl, obl, used;
+	char *uri, *uri_utf16, *uri_utf16p, *tmp, *ret;
+	const char *urip;
+	char *codeset;
+
+	err     = 0;
+	ret     = NULL;
+	*out    = NULL;
+	*outlen = 0;
+	cd      = (iconv_t) -1;
+
+	/*
+	 * MICROSOFT_COMPAT
+	 * relative paths must start with ".\"
+	 */
+	if (name[0] != '/') {
+		tmp = strstr(name, "./");
+		if (tmp == name)
+			tmp += strlen("./");
+		else
+			tmp = name;
+
+		err = asprintf(&uri, ".\\%s", tmp);
+	} else
+		err = asprintf(&uri, "%s", name);
+
+	if (err == -1)
+		return -ENOMEM;
+
+	tmp = uri;
+	while (*tmp != '\0') {
+		if (*tmp == '/')
+			*tmp = '\\';
+		tmp++;
+	}
+
+	len  = strlen(uri);
+	ibl  = len;
+	obl  = len * 2;
+	urip = uri;
+
+	uri_utf16 = uri_utf16p = malloc(obl);
+	if (!uri_utf16) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * MICROSOFT_COMPAT
+	 * little endian unicode here
+	 */
+	codeset = nl_langinfo(CODESET);
+	cd = iconv_open("UTF-16LE", codeset);
+	if (cd == (iconv_t)-1) {
+		err = -errno;
+		goto out;
+	}
+
+	if (iconv(cd,
+#ifdef __linux__
+	    (char **)
+#endif
+	    &urip, &ibl, &uri_utf16p, &obl) == (size_t)-1 ||
+	    ibl) {
+		err = (errno ? -errno : -EIO);
+		goto out;
+	}
+
+	/*
+	 * a multibyte input character yields fewer output bytes than
+	 * two per input byte: report what iconv actually produced
+	 * rather than copying the unwritten tail of the buffer.
+	 */
+	used = ((size_t)len * 2) - obl;
+
+	ret = malloc(used);
+	if (!ret) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	memcpy(ret, uri_utf16, used);
+	*outlen = used;
+	*out    = ret;
+	err     = 0;
+
+ out:
+	free(uri);
+	free(uri_utf16);
+	if (cd != (iconv_t)-1)
+		iconv_close(cd);
+
+	return err;
+}
+
+/*
+ * Decode a MACX locator: convert 'len' bytes of UTF-8 at 'in' to the
+ * locale's codeset, using 'out' as scratch space, and strip the
+ * leading "file://".
+ *
+ * 'out' must have room for at least len + 1 bytes, since a terminating
+ * NUL is written after the converted text.
+ *
+ * Returns a malloc'ed copy of the path (caller frees), or NULL if the
+ * conversion fails, the text does not start with "file://", or memory
+ * runs out.
+ */
+static char *
+vhd_macx_decode_location(const char *in, char *out, int len)
+{
+	iconv_t cd;
+	char *name;
+	size_t ibl, obl, rc;
+	char *codeset;
+
+	name = out;
+	ibl  = obl = len;
+
+	codeset = nl_langinfo(CODESET);
+	cd = iconv_open(codeset, "UTF-8");
+	if (cd == (iconv_t)-1)
+		return NULL;
+
+	rc = iconv(cd,
+#ifdef __linux__
+	    (char **)
+#endif
+	    &in, &ibl, &out, &obl);
+
+	/* the descriptor is released whether or not conversion worked */
+	iconv_close(cd);
+
+	if (rc == (size_t)-1 || ibl)
+		return NULL;
+
+	*out = '\0';
+
+	if (strstr(name, "file://") != name)
+		return NULL;
+
+	name += strlen("file://");
+
+	return strdup(name);
+}
+
+/*
+ * Decode a W2KU/W2RU style locator: convert 'len' bytes at 'in' from
+ * the encoding named by 'utf_type' to the locale's codeset, using
+ * 'out' as scratch space, turn '\' into '/' and drop a leading
+ * "C:" / "c:".
+ *
+ * 'out' must have room for at least len + 1 bytes, since a terminating
+ * NUL is written after the converted text.
+ *
+ * Returns a malloc'ed copy of the path (caller frees), or NULL if the
+ * conversion fails or memory runs out.
+ */
+static char *
+vhd_w2u_decode_location(const char *in, char *out, int len, char *utf_type)
+{
+	iconv_t cd;
+	char *name, *tmp;
+	size_t ibl, obl, rc;
+	char *codeset;
+
+	tmp = name = out;
+	ibl = obl  = len;
+
+	codeset = nl_langinfo(CODESET);
+	cd = iconv_open(codeset, utf_type);
+	if (cd == (iconv_t)-1)
+		return NULL;
+
+	rc = iconv(cd,
+#ifdef __linux__
+	    (char **)
+#endif
+	    &in, &ibl, &out, &obl);
+
+	/* the descriptor is released whether or not conversion worked */
+	iconv_close(cd);
+
+	if (rc == (size_t)-1 || ibl)
+		return NULL;
+
+	*out = '\0';
+
+	/* TODO: spaces */
+	while (tmp != out) {
+		if (*tmp == '\\')
+			*tmp = '/';
+		tmp++;
+	}
+
+	if (strstr(name, "C:") == name || strstr(name, "c:") == name)
+		name += strlen("c:");
+
+	return strdup(name);
+}
+
+/*
+ * Decode the parent name stored in the header (header->prt_name) into
+ * a malloc'ed string returned through *buf (caller frees).
+ *
+ * Images created by tapdisk with creator version 0.1 are decoded as
+ * UTF_16, all others as UTF_16BE.
+ *
+ * Returns 0 on success, -EINVAL if the name cannot be decoded.
+ */
+int
+vhd_header_decode_parent(vhd_context_t *ctx, vhd_header_t *header, char **buf)
+{
+	/*
+	 * the decoder may fill all 512 bytes and then appends a NUL,
+	 * so the scratch buffer needs one byte more than that.
+	 */
+	char *code, out[512 + 1];
+
+	if (vhd_creator_tapdisk(ctx) &&
+	    ctx->footer.crtr_ver == VHD_VERSION(0, 1))
+		code = UTF_16;
+	else
+		code = UTF_16BE;
+
+	*buf = vhd_w2u_decode_location(header->prt_name, out, 512, code);
+	return (*buf == NULL ? -EINVAL : 0);
+}
+
+/*
+ * Read the parent locator described by 'loc' from the image and
+ * decode it into a path returned through *parent (malloc'ed; the
+ * caller frees).
+ *
+ * Only differencing disks and the MACX, W2KU and W2RU platform codes
+ * are supported.  A locator whose data_len exceeds the space that is
+ * read for it is rejected.
+ *
+ * Returns 0 on success or a negative errno value; failures are logged
+ * together with the locator's fields.
+ */
+int
+vhd_parent_locator_read(vhd_context_t *ctx,
+			vhd_parent_locator_t *loc, char **parent)
+{
+	int err, size;
+	char *raw, *out, *name;
+
+	raw     = NULL;
+	out     = NULL;
+	name    = NULL;
+	*parent = NULL;
+
+	if (ctx->footer.type != HD_TYPE_DIFF) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	switch (loc->code) {
+	case PLAT_CODE_MACX:
+	case PLAT_CODE_W2KU:
+	case PLAT_CODE_W2RU:
+		break;
+	default:
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = vhd_seek(ctx, loc->data_offset, SEEK_SET);
+	if (err)
+		goto out;
+
+	size = vhd_parent_locator_size(loc);
+	if (size <= 0) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * data_len comes from the file: the decoders below consume
+	 * data_len bytes of 'raw', which only holds 'size' bytes.
+	 */
+	if ((size_t)loc->data_len > (size_t)size) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = posix_memalign((void **)&raw, VHD_SECTOR_SIZE, size);
+	if (err) {
+		raw = NULL;
+		err = -err;
+		goto out;
+	}
+
+	err = vhd_read(ctx, raw, size);
+	if (err)
+		goto out;
+
+	/* one extra byte for the NUL the decoders append */
+	out = malloc(loc->data_len + 1);
+	if (!out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	switch (loc->code) {
+	case PLAT_CODE_MACX:
+		name = vhd_macx_decode_location(raw, out, loc->data_len);
+		break;
+	case PLAT_CODE_W2KU:
+	case PLAT_CODE_W2RU:
+		name = vhd_w2u_decode_location(raw, out,
+					       loc->data_len, UTF_16LE);
+		break;
+	}
+
+	if (!name) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err     = 0;
+	*parent = name;
+
+out:
+	free(raw);
+	free(out);
+
+	if (err) {
+		VHDLOG("%s: error reading parent locator: %d\n",
+		       ctx->file, err);
+		VHDLOG("%s: locator: code %u, space 0x%x, len 0x%x, "
+		       "off 0x%"PRIx64"\n", ctx->file, loc->code, loc->data_space,
+		       loc->data_len, loc->data_offset);
+	}
+
+	return err;
+}
+
+/*
+ * Find the parent of a differencing disk.
+ *
+ * Each parent locator slot of the header is read in turn; the first
+ * one that decodes to a name which vhd_find_parent() can resolve
+ * wins.  On success 0 is returned and *parent holds the resolved
+ * path (malloc'ed; the caller frees).  Otherwise *parent is NULL and
+ * the error of the last slot tried is returned (-EINVAL if the image
+ * is not a differencing disk).
+ */
+int
+vhd_parent_locator_get(vhd_context_t *ctx, char **parent)
+{
+	char *decoded, *resolved;
+	int slot, slots, err = 0;
+
+	*parent = NULL;
+
+	if (ctx->footer.type != HD_TYPE_DIFF)
+		return -EINVAL;
+
+	slots = vhd_parent_locator_count(ctx);
+
+	for (slot = 0; slot < slots; slot++) {
+		err = vhd_parent_locator_read(ctx,
+					      &ctx->header.loc[slot],
+					      &decoded);
+		if (err)
+			continue;
+
+		err = vhd_find_parent(ctx, decoded, &resolved);
+		if (!err) {
+			free(decoded);
+			*parent = resolved;
+			return 0;
+		}
+
+		VHDLOG("%s: couldn't find parent %s (%d)\n",
+		       ctx->file, decoded, err);
+		free(decoded);
+	}
+
+	return err;
+}
+
+/*
+ * Encode the path of 'parent' as a parent locator of platform 'code'
+ * and write it to the image at offset 'off'.
+ *
+ * The path is stored relative to the child image (ctx->file).  Only
+ * differencing disks and the MACX, W2KU and W2RU codes are supported,
+ * and 'parent' must be a regular file or a block device.  If
+ * max_bytes is non-zero and the sector-padded locator would exceed
+ * it, -ENAMETOOLONG is returned.
+ *
+ * *loc is zeroed on entry and filled in (code, data_len, data_space,
+ * data_offset) only on success.  Returns 0 or a negative errno value.
+ */
+int
+vhd_parent_locator_write_at(vhd_context_t *ctx,
+ const char *parent, off_t off, uint32_t code,
+ size_t max_bytes, vhd_parent_locator_t *loc)
+{
+ struct stat stats;
+ int err, len, size;
+ char *absolute_path, *relative_path, *encoded, *block;
+
+ memset(loc, 0, sizeof(vhd_parent_locator_t));
+
+ if (ctx->footer.type != HD_TYPE_DIFF)
+ return -EINVAL;
+
+ absolute_path = NULL;
+ relative_path = NULL;
+ encoded = NULL;
+ block = NULL;
+ size = 0;
+ len = 0;
+
+ switch (code) {
+ case PLAT_CODE_MACX:
+ case PLAT_CODE_W2KU:
+ case PLAT_CODE_W2RU:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ absolute_path = realpath(parent, NULL);
+ if (!absolute_path) {
+ err = -errno;
+ goto out;
+ }
+
+ err = stat(absolute_path, &stats);
+ if (err) {
+ err = -errno;
+ goto out;
+ }
+
+ if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ relative_path = relative_path_to(ctx->file, absolute_path, &err);
+ if (!relative_path || err) {
+ err = (err ? err : -EINVAL);
+ goto out;
+ }
+
+ switch (code) {
+ case PLAT_CODE_MACX:
+ err = vhd_macx_encode_location(relative_path, &encoded, &len);
+ break;
+ case PLAT_CODE_W2KU:
+ case PLAT_CODE_W2RU:
+ err = vhd_w2u_encode_location(relative_path, &encoded, &len);
+ break;
+ default:
+ err = -EINVAL;
+ }
+
+ if (err)
+ goto out;
+
+ err = vhd_seek(ctx, off, SEEK_SET);
+ if (err)
+ goto out;
+
+ /* the locator is written out in whole sectors */
+ size = vhd_bytes_padded(len);
+
+ if (max_bytes && size > max_bytes) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+
+ err = posix_memalign((void **)&block, VHD_SECTOR_SIZE, size);
+ if (err) {
+ block = NULL;
+ err = -err;
+ goto out;
+ }
+
+ /* zero-fill so the padding after the encoded path is defined */
+ memset(block, 0, size);
+ memcpy(block, encoded, len);
+
+ err = vhd_write(ctx, block, size);
+ if (err)
+ goto out;
+
+ err = 0;
+
+out:
+ free(absolute_path);
+ free(relative_path);
+ free(encoded);
+ free(block);
+
+ if (!err) {
+ loc->res = 0;
+ loc->code = code;
+ loc->data_len = len;
+ /*
+ * write number of bytes ('size') instead of number of sectors
+ * into loc->data_space to be compatible with MSFT, even though
+ * this goes against the specs
+ */
+ loc->data_space = size;
+ loc->data_offset = off;
+ }
+
+ return err;
+}
+
+/*
+ * Compute the offset at which a footer occupying the last
+ * sizeof(vhd_footer_t) bytes of the file starts, and store it in *off.
+ *
+ * Returns 0 on success or a negative errno value, like the rest of
+ * this file; *off is only written on success.
+ */
+static int
+vhd_footer_offset_at_eof(vhd_context_t *ctx, off_t *off)
+{
+	int err;
+	off_t eof;
+
+	/* vhd_seek already reports its failure as a negative errno */
+	err = vhd_seek(ctx, 0, SEEK_END);
+	if (err)
+		return err;
+
+	eof = vhd_position(ctx);
+	if (eof == (off_t)-1)
+		return -errno;
+
+	*off = eof - sizeof(vhd_footer_t);
+	return 0;
+}
+
+/*
+ * Read the sector bitmap of an allocated block into a freshly
+ * allocated, sector-aligned buffer returned through *bufp (the
+ * caller frees it).
+ *
+ * Returns -EINVAL if the image is not dynamic or the block is not
+ * allocated, -ERANGE if 'block' is beyond the BAT, otherwise 0 or a
+ * negative errno value.  *bufp is NULL on failure.
+ */
+int
+vhd_read_bitmap(vhd_context_t *ctx, uint32_t block, char **bufp)
+{
+	char *bits = NULL;
+	uint64_t first_sec;
+	size_t bytes;
+	off_t pos;
+	int err;
+
+	*bufp = NULL;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	err = vhd_get_bat(ctx);
+	if (err)
+		return err;
+
+	if (block >= ctx->bat.entries)
+		return -ERANGE;
+
+	first_sec = ctx->bat.bat[block];
+	if (first_sec == DD_BLK_UNUSED)
+		return -EINVAL;
+
+	/* the bitmap sits at the start of the block: one bit per sector */
+	pos   = vhd_sectors_to_bytes(first_sec);
+	bytes = vhd_bytes_padded(ctx->spb >> 3);
+
+	err = vhd_seek(ctx, pos, SEEK_SET);
+	if (err)
+		return err;
+
+	err = posix_memalign((void **)&bits, VHD_SECTOR_SIZE, bytes);
+	if (err)
+		return -err;
+
+	err = vhd_read(ctx, bits, bytes);
+	if (err) {
+		free(bits);
+		return err;
+	}
+
+	*bufp = bits;
+	return 0;
+}
+
+/*
+ * Read the data of an allocated block into a freshly allocated,
+ * sector-aligned buffer returned through *bufp (the caller frees it).
+ *
+ * The data starts bm_secs sectors after the block's BAT entry (i.e.
+ * after the block's bitmap) and spans spb sectors.  If the block would
+ * run past the start of the footer at end of file, only the part in
+ * front of it is read and the rest of the buffer is zero-filled.
+ *
+ * Returns -EINVAL if the image is not dynamic or the block is not
+ * allocated, -ERANGE if 'block' is beyond the BAT, otherwise 0 or an
+ * error from the helpers.  *bufp is NULL on failure.
+ */
+int
+vhd_read_block(vhd_context_t *ctx, uint32_t block, char **bufp)
+{
+ int err;
+ char *buf;
+ size_t size;
+ uint64_t blk;
+ off_t end, off;
+
+ buf = NULL;
+ *bufp = NULL;
+
+ if (!vhd_type_dynamic(ctx))
+ return -EINVAL;
+
+ err = vhd_get_bat(ctx);
+ if (err)
+ return err;
+
+ if (block >= ctx->bat.entries)
+ return -ERANGE;
+
+ blk = ctx->bat.bat[block];
+ if (blk == DD_BLK_UNUSED)
+ return -EINVAL;
+
+ off = vhd_sectors_to_bytes(blk + ctx->bm_secs);
+ size = vhd_sectors_to_bytes(ctx->spb);
+
+ err = vhd_footer_offset_at_eof(ctx, &end);
+ if (err)
+ return err;
+
+ err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+ if (err) {
+ err = -err;
+ goto fail;
+ }
+
+ /*
+ * block truncated by the footer: read what is there, zero the rest.
+ * NOTE(review): this assumes off <= end; a BAT entry pointing past
+ * the footer would make 'end - off' negative -- confirm callers or
+ * validation rule that out.
+ */
+ if (end < off + ctx->header.block_size) {
+ size = end - off;
+ memset(buf + size, 0, ctx->header.block_size - size);
+ }
+
+ err = vhd_seek(ctx, off, SEEK_SET);
+ if (err)
+ goto fail;
+
+ err = vhd_read(ctx, buf, size);
+ if (err)
+ goto fail;
+
+ *bufp = buf;
+ return 0;
+
+fail:
+ free(buf);
+ return err;
+}
+
+/*
+ * Write a copy of *footer to the image at absolute offset 'off'.
+ *
+ * The copy gets a freshly computed checksum and is validated before
+ * anything is written; it is converted with vhd_footer_out() just
+ * before the write.  The caller's footer is not modified.
+ *
+ * Returns 0 on success or a negative errno value; failures are logged.
+ */
+int
+vhd_write_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off_t off)
+{
+	vhd_footer_t *copy = NULL;
+	int err;
+
+	err = posix_memalign((void **)&copy,
+			     VHD_SECTOR_SIZE, sizeof(*copy));
+	if (err) {
+		copy = NULL;
+		err  = -err;
+		goto out;
+	}
+
+	memcpy(copy, footer, sizeof(*copy));
+	copy->checksum = vhd_checksum_footer(copy);
+
+	err = vhd_validate_footer(copy);
+	if (err)
+		goto out;
+
+	err = vhd_seek(ctx, off, SEEK_SET);
+	if (err)
+		goto out;
+
+	vhd_footer_out(copy);
+
+	err = vhd_write(ctx, copy, sizeof(*copy));
+
+out:
+	if (err)
+		VHDLOG("%s: failed writing footer at 0x%08"PRIx64": %d\n",
+		       ctx->file, off, err);
+	free(copy);
+	return err;
+}
+
+/*
+ * Write *footer to its place(s) in the image.
+ *
+ * The primary footer goes to the last sizeof(vhd_footer_t) bytes of
+ * the file when ctx->is_block is set, and right after the end of the
+ * data otherwise.  Dynamic images additionally get a copy at
+ * offset 0.
+ *
+ * Returns 0 on success or the first error met.
+ */
+int
+vhd_write_footer(vhd_context_t *ctx, vhd_footer_t *footer)
+{
+	off_t pos;
+	int err;
+
+	err = (ctx->is_block ?
+	       vhd_footer_offset_at_eof(ctx, &pos) :
+	       vhd_end_of_data(ctx, &pos));
+	if (err)
+		return err;
+
+	err = vhd_write_footer_at(ctx, footer, pos);
+	if (err)
+		return err;
+
+	if (vhd_type_dynamic(ctx))
+		return vhd_write_footer_at(ctx, footer, 0);
+
+	return 0;
+}
+
+/*
+ * Write a copy of *header to the image at absolute offset 'off'.
+ *
+ * The copy gets a freshly computed checksum, is validated, and is
+ * converted with vhd_header_out() before the write.  The caller's
+ * header is not modified.
+ *
+ * Returns -EINVAL for images that are not dynamic, otherwise 0 or a
+ * negative errno value; failures are logged.
+ */
+int
+vhd_write_header_at(vhd_context_t *ctx, vhd_header_t *header, off_t off)
+{
+	vhd_header_t *copy = NULL;
+	int err;
+
+	if (!vhd_type_dynamic(ctx)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = posix_memalign((void **)&copy,
+			     VHD_SECTOR_SIZE, sizeof(*copy));
+	if (err) {
+		copy = NULL;
+		err  = -err;
+		goto out;
+	}
+
+	memcpy(copy, header, sizeof(*copy));
+	copy->checksum = vhd_checksum_header(copy);
+
+	err = vhd_validate_header(copy);
+	if (err)
+		goto out;
+
+	vhd_header_out(copy);
+
+	err = vhd_seek(ctx, off, SEEK_SET);
+	if (err)
+		goto out;
+
+	err = vhd_write(ctx, copy, sizeof(*copy));
+
+out:
+	if (err)
+		VHDLOG("%s: failed writing header at 0x%08"PRIx64": %d\n",
+		       ctx->file, off, err);
+	free(copy);
+	return err;
+}
+
+/*
+ * Write *header to the location recorded in the footer
+ * (footer.data_offset).  Returns -EINVAL for images that are not
+ * dynamic, otherwise the result of vhd_write_header_at().
+ */
+int
+vhd_write_header(vhd_context_t *ctx, vhd_header_t *header)
+{
+	if (vhd_type_dynamic(ctx))
+		return vhd_write_header_at(ctx, header,
+					   ctx->footer.data_offset);
+
+	return -EINVAL;
+}
+
+/*
+ * Write the block allocation table *bat to header.table_offset.
+ *
+ * The table is copied into a sector-aligned scratch buffer, converted
+ * with vhd_bat_out() and written padded to whole sectors; *bat itself
+ * is not modified.  Both ctx->bat and *bat must be loaded.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_write_bat(vhd_context_t *ctx, vhd_bat_t *bat)
+{
+	vhd_bat_t copy;
+	size_t bytes;
+	int err;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	err = vhd_validate_bat(&ctx->bat);
+	if (err)
+		return err;
+
+	err = vhd_validate_bat(bat);
+	if (err)
+		return err;
+
+	memset(&copy, 0, sizeof(copy));
+
+	bytes = vhd_bytes_padded(bat->entries * sizeof(uint32_t));
+
+	err = vhd_seek(ctx, ctx->header.table_offset, SEEK_SET);
+	if (err)
+		return err;
+
+	err = posix_memalign((void **)&copy.bat, VHD_SECTOR_SIZE, bytes);
+	if (err)
+		return -err;
+
+	memcpy(copy.bat, bat->bat, bytes);
+	copy.spb     = bat->spb;
+	copy.entries = bat->entries;
+	vhd_bat_out(&copy);
+
+	err = vhd_write(ctx, copy.bat, bytes);
+	free(copy.bat);
+
+	return err;
+}
+
+/*
+ * Write the batmap (bitmap first, then its header) to the image.
+ *
+ * A private copy of the header gets a freshly computed checksum; the
+ * bitmap is written at header.batmap_offset and the header right
+ * after the padded BAT.  *batmap itself is not modified.
+ *
+ * Returns -EINVAL when the image has no batmap, otherwise 0 or a
+ * negative errno value; failures are logged.
+ */
+int
+vhd_write_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+	int err;
+	off_t off;
+	vhd_batmap_t b;
+	char *buf, *map;
+	size_t size, map_size;
+
+	buf = NULL;
+	map = NULL;
+
+	if (!vhd_has_batmap(ctx)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	b.header = batmap->header;
+	b.map    = batmap->map;
+
+	b.header.checksum = vhd_checksum_batmap(&b);
+	err = vhd_validate_batmap(&b);
+	if (err)
+		goto out;
+
+	off      = b.header.batmap_offset;
+	map_size = vhd_sectors_to_bytes(b.header.batmap_size);
+
+	err = vhd_seek(ctx, off, SEEK_SET);
+	if (err)
+		goto out;
+
+	/* bounce through a sector-aligned buffer */
+	err = posix_memalign((void **)&map, VHD_SECTOR_SIZE, map_size);
+	if (err) {
+		map = NULL;
+		err = -err;
+		goto out;
+	}
+
+	memcpy(map, b.map, map_size);
+
+	err = vhd_write(ctx, map, map_size);
+	if (err)
+		goto out;
+
+	err = vhd_batmap_header_offset(ctx, &off);
+	if (err)
+		goto out;
+
+	size = vhd_bytes_padded(sizeof(vhd_batmap_header_t));
+
+	err = vhd_seek(ctx, off, SEEK_SET);
+	if (err)
+		goto out;
+
+	err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+	if (err) {
+		err = -err;
+		buf = NULL;
+		goto out;
+	}
+
+	vhd_batmap_header_out(&b);
+	memset(buf, 0, size);
+	memcpy(buf, &b.header, sizeof(vhd_batmap_header_t));
+
+	err = vhd_write(ctx, buf, size);
+
+out:
+	if (err)
+		VHDLOG("%s: failed writing batmap: %d\n", ctx->file, err);
+	free(buf);
+	free(map);
+	/* hand the failure to the caller instead of reporting success */
+	return err;
+}
+
+/*
+ * Write the sector bitmap of an allocated block.  'bitmap' must be
+ * aligned to VHD_SECTOR_SIZE and hold bm_secs sectors.
+ *
+ * Returns -EINVAL if the image is not dynamic, the BAT is not loaded,
+ * the buffer is misaligned or the block is not allocated; -ERANGE if
+ * 'block' is beyond the BAT; otherwise 0 or the seek/write error.
+ */
+int
+vhd_write_bitmap(vhd_context_t *ctx, uint32_t block, char *bitmap)
+{
+	uint64_t first_sec;
+	size_t bytes;
+	off_t pos;
+	int err;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	err = vhd_validate_bat(&ctx->bat);
+	if (err)
+		return err;
+
+	if (block >= ctx->bat.entries)
+		return -ERANGE;
+
+	if (((unsigned long)bitmap & (VHD_SECTOR_SIZE - 1)) != 0)
+		return -EINVAL;
+
+	first_sec = ctx->bat.bat[block];
+	if (first_sec == DD_BLK_UNUSED)
+		return -EINVAL;
+
+	/* the bitmap occupies the first bm_secs sectors of the block */
+	pos   = vhd_sectors_to_bytes(first_sec);
+	bytes = vhd_sectors_to_bytes(ctx->bm_secs);
+
+	err = vhd_seek(ctx, pos, SEEK_SET);
+	if (!err)
+		err = vhd_write(ctx, bitmap, bytes);
+
+	return err;
+}
+
+/*
+ * Write the data area of BAT entry @block from @data.
+ *
+ * @data must be VHD_SECTOR_SIZE-aligned; ctx->spb sectors are written
+ * starting ctx->bm_secs sectors past the location recorded in the BAT,
+ * i.e. right after the block's bitmap.
+ *
+ * Returns 0 on success, -EINVAL for a non-dynamic image, a misaligned
+ * buffer or an unallocated block, -ERANGE when @block is outside the BAT,
+ * or the error reported by BAT validation, seek or write.
+ */
+int
+vhd_write_block(vhd_context_t *ctx, uint32_t block, char *data)
+{
+	int rc;
+	uint64_t first_sector;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	rc = vhd_validate_bat(&ctx->bat);
+	if (rc)
+		return rc;
+
+	if (block >= ctx->bat.entries)
+		return -ERANGE;
+
+	if (((unsigned long)data & (VHD_SECTOR_SIZE - 1)) != 0)
+		return -EINVAL;
+
+	first_sector = ctx->bat.bat[block];
+	if (first_sector == DD_BLK_UNUSED)
+		return -EINVAL;
+
+	rc = vhd_seek(ctx,
+		      vhd_sectors_to_bytes(first_sector + ctx->bm_secs),
+		      SEEK_SET);
+	if (rc)
+		return rc;
+
+	return vhd_write(ctx, data, vhd_sectors_to_bytes(ctx->spb));
+}
+
+/*
+ * Duplicate @name into *@dup.
+ *
+ * *@dup is always written: the new copy on success, NULL otherwise.
+ * Returns 0, -ENAMETOOLONG when @name has MAX_NAME_LEN or more characters,
+ * or -ENOMEM when the copy cannot be allocated.
+ */
+static inline int
+namedup(char **dup, const char *name)
+{
+	char *copy = NULL;
+	int rc = 0;
+
+	if (strnlen(name, MAX_NAME_LEN) >= MAX_NAME_LEN)
+		rc = -ENAMETOOLONG;
+	else if ((copy = strdup(name)) == NULL)
+		rc = -ENOMEM;
+
+	*dup = copy;
+	return rc;
+}
+
+/*
+ * Reposition the file offset of @ctx->fd via lseek(@offset, @whence).
+ *
+ * Returns 0 on success or -errno on failure.  errno is captured before
+ * logging so that anything the logger does cannot change the value
+ * handed back to the caller.
+ */
+int
+vhd_seek(vhd_context_t *ctx, off_t offset, int whence)
+{
+	off_t off;
+	int err;
+
+	off = lseek(ctx->fd, offset, whence);
+	if (off == (off_t)-1) {
+		err = -errno;
+		VHDLOG("%s: seek(0x%08"PRIx64", %d) failed: %d\n",
+		       ctx->file, (uint64_t)offset, whence, err);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Report the current file offset of @ctx->fd (lseek with SEEK_CUR and a
+ * zero displacement).  The raw lseek() result is returned unchanged,
+ * including (off_t)-1 on failure.
+ */
+off_t
+vhd_position(vhd_context_t *ctx)
+{
+	off_t here;
+
+	here = lseek(ctx->fd, 0, SEEK_CUR);
+
+	return here;
+}
+
+/*
+ * Read exactly @size bytes from the current offset of @ctx->fd into @buf.
+ *
+ * A single read() is issued; anything other than a full-length transfer
+ * is treated as a failure.  Returns 0 on success, -errno when read() set
+ * errno, or -EIO for a short read without an errno.
+ */
+int
+vhd_read(vhd_context_t *ctx, void *buf, size_t size)
+{
+	ssize_t ret;
+	int err;
+
+	errno = 0;
+
+	/* keep the result signed so a -1 can never compare equal to @size */
+	ret = read(ctx->fd, buf, size);
+	if (ret >= 0 && (size_t)ret == size)
+		return 0;
+
+	/* capture errno before logging can disturb it */
+	err = errno;
+	VHDLOG("%s: read of %zu returned %zd, errno: %d\n",
+	       ctx->file, size, ret, -err);
+
+	return (err ? -err : -EIO);
+}
+
+/*
+ * Write exactly @size bytes from @buf at the current offset of @ctx->fd.
+ *
+ * A single write() is issued; anything other than a full-length transfer
+ * is treated as a failure.  Returns 0 on success, -errno when write() set
+ * errno, or -EIO for a short write without an errno.
+ */
+int
+vhd_write(vhd_context_t *ctx, void *buf, size_t size)
+{
+	ssize_t ret;
+	int err;
+
+	errno = 0;
+
+	/* keep the result signed so a -1 can never compare equal to @size */
+	ret = write(ctx->fd, buf, size);
+	if (ret >= 0 && (size_t)ret == size)
+		return 0;
+
+	/* capture errno before logging can disturb it */
+	err = errno;
+	VHDLOG("%s: write of %zu returned %zd, errno: %d\n",
+	       ctx->file, size, ret, -err);
+
+	return (err ? -err : -EIO);
+}
+
+/*
+ * Translate virtual @sector into a location inside the image file and
+ * store it in *@offset (in sectors).
+ *
+ * For non-dynamic images the mapping is the identity.  For dynamic
+ * images the BAT is consulted: *@offset becomes DD_BLK_UNUSED when the
+ * containing block is unallocated, otherwise the block's BAT entry plus
+ * the bitmap size plus the sector's position within the block.
+ *
+ * Returns 0 on success, -EINVAL when the context has no block geometry,
+ * -ERANGE when @sector lies beyond the BAT, or the error reported by
+ * vhd_get_bat().
+ */
+int
+vhd_offset(vhd_context_t *ctx, uint32_t sector, uint32_t *offset)
+{
+	int err;
+	uint32_t block;
+
+	if (!vhd_type_dynamic(ctx)) {
+		/* report the result through @offset, not the status code */
+		*offset = sector;
+		return 0;
+	}
+
+	err = vhd_get_bat(ctx);
+	if (err)
+		return err;
+
+	if (!ctx->spb)
+		return -EINVAL;
+
+	block = sector / ctx->spb;
+	if (block >= ctx->bat.entries)
+		return -ERANGE;
+
+	if (ctx->bat.bat[block] == DD_BLK_UNUSED)
+		*offset = DD_BLK_UNUSED;
+	else
+		*offset = ctx->bat.bat[block] +
+			ctx->bm_secs + (sector % ctx->spb);
+
+	return 0;
+}
+
+/*
+ * Fast-path open: fetch footer and header with a single read from the
+ * current file position.
+ *
+ * The footer copy is byte-swapped in and validated.  For dynamic images
+ * the header is taken from the same buffer when the footer says it
+ * directly follows (data_offset == sizeof(vhd_footer_t)); otherwise it is
+ * read through vhd_read_header().  On success spb and bm_secs are derived
+ * from the header's block size.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_open_fast(vhd_context_t *ctx)
+{
+	int rc;
+	char *scratch;
+	size_t len;
+
+	len = sizeof(vhd_footer_t) + sizeof(vhd_header_t);
+	rc = posix_memalign((void **)&scratch, VHD_SECTOR_SIZE, len);
+	if (rc) {
+		VHDLOG("failed allocating %s: %d\n", ctx->file, -rc);
+		return -rc;
+	}
+
+	rc = vhd_read(ctx, scratch, len);
+	if (rc) {
+		VHDLOG("failed reading %s: %d\n", ctx->file, rc);
+		goto done;
+	}
+
+	memcpy(&ctx->footer, scratch, sizeof(vhd_footer_t));
+	vhd_footer_in(&ctx->footer);
+	rc = vhd_validate_footer(&ctx->footer);
+	if (rc || !vhd_type_dynamic(ctx))
+		goto done;
+
+	if (ctx->footer.data_offset == sizeof(vhd_footer_t)) {
+		/* header is already in the buffer, right behind the footer */
+		memcpy(&ctx->header,
+		       scratch + sizeof(vhd_footer_t),
+		       sizeof(vhd_header_t));
+		vhd_header_in(&ctx->header);
+		rc = vhd_validate_header(&ctx->header);
+	} else {
+		rc = vhd_read_header(ctx, &ctx->header);
+	}
+
+	if (!rc) {
+		ctx->spb = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+		ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3);
+	}
+
+done:
+	free(scratch);
+	return rc;
+}
+
+/*
+ * Open the VHD image @file and populate @ctx.
+ *
+ * @ctx is zeroed first.  VHD_OPEN_STRICT cancels VHD_OPEN_FAST.  The file
+ * is opened O_DIRECT | O_LARGEFILE with the access mode selected by
+ * VHD_OPEN_RDONLY / VHD_OPEN_RDWR.  With VHD_OPEN_FAST the footer and
+ * header come from vhd_open_fast(); otherwise the footer is read, a
+ * disabled image is rejected unless VHD_OPEN_IGNORE_DISABLED is set, and
+ * for dynamic images the header is read and spb / bm_secs are derived.
+ *
+ * Returns 0 on success.  On failure a negative errno value is returned
+ * and @ctx is released and zeroed again.
+ */
+int
+vhd_open(vhd_context_t *ctx, const char *file, int flags)
+{
+	int rc, open_mode;
+
+	if (flags & VHD_OPEN_STRICT)
+		vhd_flag_clear(flags, VHD_OPEN_FAST);
+
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->oflags = flags;
+	ctx->fd = -1;
+
+	rc = namedup(&ctx->file, file);
+	if (rc)
+		return rc;
+
+	open_mode = O_DIRECT | O_LARGEFILE;
+	if (flags & VHD_OPEN_RDONLY)
+		open_mode |= O_RDONLY;
+	if (flags & VHD_OPEN_RDWR)
+		open_mode |= O_RDWR;
+
+	ctx->fd = open(ctx->file, open_mode, 0644);
+	if (ctx->fd == -1) {
+		rc = -errno;
+		VHDLOG("failed to open %s: %d\n", ctx->file, rc);
+		goto bail;
+	}
+
+	rc = vhd_test_file_fixed(ctx->file, &ctx->is_block);
+	if (rc)
+		goto bail;
+
+	if (flags & VHD_OPEN_FAST) {
+		rc = vhd_open_fast(ctx);
+		if (!rc)
+			return 0;
+		goto bail;
+	}
+
+	rc = vhd_read_footer(ctx, &ctx->footer);
+	if (rc)
+		goto bail;
+
+	if (!(flags & VHD_OPEN_IGNORE_DISABLED) && vhd_disabled(ctx)) {
+		rc = -EINVAL;
+		goto bail;
+	}
+
+	if (!vhd_type_dynamic(ctx))
+		return 0;
+
+	rc = vhd_read_header(ctx, &ctx->header);
+	if (rc)
+		goto bail;
+
+	ctx->spb = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+	ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3);
+
+	return 0;
+
+bail:
+	if (ctx->fd != -1)
+		close(ctx->fd);
+	free(ctx->file);
+	memset(ctx, 0, sizeof(*ctx));
+	return rc;
+}
+
+/*
+ * Release everything held by @ctx and zero it.
+ *
+ * The descriptor is closed only when a file name is recorded; a context
+ * without one is treated as never having been opened.  The file name,
+ * the cached BAT and the cached batmap are freed.
+ */
+void
+vhd_close(vhd_context_t *ctx)
+{
+	if (ctx->file != NULL)
+		close(ctx->fd);
+
+	free(ctx->batmap.map);
+	free(ctx->bat.bat);
+	free(ctx->file);
+
+	memset(ctx, 0, sizeof(*ctx));
+}
+
+/*
+ * Fill @ctx->footer for a brand-new image of @type holding @size bytes.
+ *
+ * The footer is zeroed and then stamped with the cookie, format and
+ * creator versions, the current time, the geometry derived from @size, a
+ * freshly generated UUID and the creator tag "tap".  data_offset is set
+ * to all ones.
+ */
+static inline void
+vhd_initialize_footer(vhd_context_t *ctx, int type, uint64_t size)
+{
+	vhd_footer_t *ftr = &ctx->footer;
+
+	memset(ftr, 0, sizeof(*ftr));
+	memcpy(ftr->cookie, HD_COOKIE, sizeof(ftr->cookie));
+
+	ftr->features    = HD_RESERVED;
+	ftr->ff_version  = HD_FF_VERSION;
+	ftr->timestamp   = vhd_time(time(NULL));
+	ftr->crtr_ver    = VHD_CURRENT_VERSION;
+	ftr->crtr_os     = 0x00000000;
+	ftr->orig_size   = size;
+	ftr->curr_size   = size;
+	ftr->geometry    = vhd_chs(size);
+	ftr->type        = type;
+	ftr->saved       = 0;
+	ftr->data_offset = 0xFFFFFFFFFFFFFFFF;
+
+	strcpy(ftr->crtr_app, "tap");
+	vhd_uuid_generate(&ftr->uuid);
+}
+
+/*
+ * Store the base name of @parent_path in @ctx->header.prt_name, converted
+ * from the locale's codeset (nl_langinfo(CODESET)) to UTF_16BE.
+ *
+ * prt_name is zeroed before the conversion.  Returns 0 on success,
+ * -errno when the converter cannot be opened or the conversion fails,
+ * -ENOMEM when the path cannot be duplicated, and -EINVAL when the base
+ * name is empty or does not fit completely into prt_name.
+ */
+static int
+vhd_initialize_header_parent_name(vhd_context_t *ctx, const char *parent_path)
+{
+	int err;
+	iconv_t cd;
+	size_t ibl, obl;
+	char *ppath, *dst;
+	const char *pname;
+	char *codeset;
+
+	err = 0;
+	pname = NULL;
+	ppath = NULL;
+
+	/*
+	 * MICROSOFT_COMPAT
+	 * big endian unicode here
+	 */
+	codeset = nl_langinfo(CODESET);
+	cd = iconv_open(UTF_16BE, codeset);
+	if (cd == (iconv_t)-1)
+		/* nothing to release yet; never iconv_close() a bad handle */
+		return -errno;
+
+	/* basename() may modify its argument, so work on a private copy */
+	ppath = strdup(parent_path);
+	if (!ppath) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	pname = basename(ppath);
+	if (!strcmp(pname, "")) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	ibl = strlen(pname);
+	obl = sizeof(ctx->header.prt_name);
+	dst = ctx->header.prt_name;
+
+	memset(dst, 0, obl);
+
+	errno = 0;
+	if (iconv(cd,
+#ifdef __linux__
+		  (char **)
+#endif
+		  &pname, &ibl, &dst, &obl) == (size_t)-1)
+		err = (errno ? -errno : -EINVAL);
+	else if (ibl)
+		/* input left over without an iconv error: name truncated */
+		err = -EINVAL;
+
+out:
+	iconv_close(cd);
+	free(ppath);
+	return err;
+}
+
+/*
+ * Return the size of @name in bytes, determined by seeking to its end.
+ *
+ * Returns a negative errno value when the file cannot be opened or the
+ * seek fails.
+ */
+static off_t
+get_file_size(const char *name)
+{
+	int fd, err;
+	off_t end;
+
+	fd = open(name, O_LARGEFILE | O_RDONLY);
+	if (fd == -1) {
+		/* capture errno before logging can disturb it */
+		err = errno;
+		VHDLOG("unable to open '%s': %d\n", name, err);
+		return -err;
+	}
+
+	end = lseek(fd, 0, SEEK_END);
+	if (end == (off_t)-1)
+		/* read errno before close() gets a chance to change it */
+		end = -errno;
+
+	close(fd);
+	return end;
+}
+
+/*
+ * Initialise @ctx->header for a new dynamic or differencing image and
+ * point the footer's data_offset at it.
+ *
+ * For HD_TYPE_DYNAMIC the function is done once the basic fields are set.
+ * Otherwise @parent_path is examined: its mtime becomes the parent
+ * timestamp, and when @size is zero the size is inherited from the parent
+ * (file size for a @raw parent, footer curr_size for a VHD parent).  The
+ * footer sizes, geometry and max_bat_size are then recomputed and the
+ * parent name is recorded.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_initialize_header(vhd_context_t *ctx, const char *parent_path,
+		      uint64_t size, int raw)
+{
+	int err;
+	struct stat stats;
+	vhd_context_t parent;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	memset(&ctx->header, 0, sizeof(vhd_header_t));
+	memcpy(ctx->header.cookie, DD_COOKIE, sizeof(ctx->header.cookie));
+	ctx->header.data_offset  = (uint64_t)-1;
+	ctx->header.table_offset = VHD_SECTOR_SIZE * 3; /* 1 ftr + 2 hdr */
+	ctx->header.hdr_ver      = DD_VERSION;
+	ctx->header.block_size   = VHD_BLOCK_SIZE;
+	ctx->header.prt_ts       = 0;
+	ctx->header.res1         = 0;
+	ctx->header.max_bat_size = (ctx->footer.curr_size +
+				    VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+
+	ctx->footer.data_offset  = VHD_SECTOR_SIZE;
+
+	if (ctx->footer.type == HD_TYPE_DYNAMIC)
+		return 0;
+
+	err = stat(parent_path, &stats);
+	if (err == -1)
+		return -errno;
+
+	if (raw) {
+		ctx->header.prt_ts = vhd_time(stats.st_mtime);
+		if (!size) {
+			off_t parent_size = get_file_size(parent_path);
+
+			/*
+			 * A negative value is an error code; storing it in
+			 * the unsigned @size would yield an absurd disk size.
+			 */
+			if (parent_size < 0)
+				return (int)parent_size;
+
+			size = parent_size;
+		}
+	}
+	else {
+		err = vhd_open(&parent, parent_path, VHD_OPEN_RDONLY);
+		if (err)
+			return err;
+
+		ctx->header.prt_ts = vhd_time(stats.st_mtime);
+		vhd_uuid_copy(&ctx->header.prt_uuid, &parent.footer.uuid);
+		if (!size)
+			size = parent.footer.curr_size;
+		vhd_close(&parent);
+	}
+	ctx->footer.orig_size    = size;
+	ctx->footer.curr_size    = size;
+	ctx->footer.geometry     = vhd_chs(size);
+	ctx->header.max_bat_size =
+		(size + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+
+	return vhd_initialize_header_parent_name(ctx, parent_path);
+}
+
+/*
+ * Write the three parent locators (MACX, W2KU, W2RU, in that order) of a
+ * differencing image, packed back to back starting at the first
+ * sector-aligned offset after the batmap.
+ *
+ * Returns 0 on success, -EINVAL when the image is not HD_TYPE_DIFF, or
+ * the error reported by vhd_parent_locator_write_at().
+ */
+static int
+vhd_write_parent_locators(vhd_context_t *ctx, const char *parent)
+{
+	const uint32_t codes[3] = {
+		PLAT_CODE_MACX,
+		PLAT_CODE_W2KU,
+		PLAT_CODE_W2RU,
+	};
+	off_t cursor;
+	int idx, rc;
+
+	if (ctx->footer.type != HD_TYPE_DIFF)
+		return -EINVAL;
+
+	cursor = ctx->batmap.header.batmap_offset +
+		vhd_sectors_to_bytes(ctx->batmap.header.batmap_size);
+	if (cursor & (VHD_SECTOR_SIZE - 1))
+		cursor = vhd_bytes_padded(cursor);
+
+	for (idx = 0; idx < 3; idx++) {
+		vhd_parent_locator_t *slot = ctx->header.loc + idx;
+
+		rc = vhd_parent_locator_write_at(ctx, parent, cursor,
+						 codes[idx], 0, slot);
+		if (rc)
+			return rc;
+
+		/* the next locator starts where this one ends */
+		cursor += vhd_parent_locator_size(slot);
+	}
+
+	return 0;
+}
+
+/*
+ * Re-parent @child onto @parent_path.
+ *
+ * The path is resolved with realpath() and must name a regular file or a
+ * block device.  For a @raw parent the parent UUID is cleared; otherwise
+ * it is copied from the parent's footer.  The parent name and timestamp
+ * in the header are refreshed, every MACX / W2KU / W2RU locator is
+ * rewritten in place (bounded by its existing size), and finally the
+ * header is written back.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_change_parent(vhd_context_t *child, char *parent_path, int raw)
+{
+	int i, err;
+	char *ppath;
+	struct stat stats;
+	vhd_context_t parent;
+
+	ppath = realpath(parent_path, NULL);
+	if (!ppath) {
+		/* capture errno before logging can disturb it */
+		err = -errno;
+		VHDLOG("error resolving parent path %s for %s: %d\n",
+		       parent_path, child->file, -err);
+		return err;
+	}
+
+	err = stat(ppath, &stats);
+	if (err == -1) {
+		err = -errno;
+		goto out;
+	}
+
+	if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (raw) {
+		vhd_uuid_clear(&child->header.prt_uuid);
+	} else {
+		err = vhd_open(&parent, ppath, VHD_OPEN_RDONLY);
+		if (err) {
+			VHDLOG("error opening parent %s for %s: %d\n",
+			       ppath, child->file, err);
+			goto out;
+		}
+		vhd_uuid_copy(&child->header.prt_uuid, &parent.footer.uuid);
+		vhd_close(&parent);
+	}
+
+	/*
+	 * A failed name conversion leaves prt_name zeroed or partial; do
+	 * not go on to write a header that carries it.
+	 */
+	err = vhd_initialize_header_parent_name(child, ppath);
+	if (err) {
+		VHDLOG("error setting parent name %s for %s: %d\n",
+		       ppath, child->file, err);
+		goto out;
+	}
+	child->header.prt_ts = vhd_time(stats.st_mtime);
+
+	for (i = 0; i < vhd_parent_locator_count(child); i++) {
+		vhd_parent_locator_t *loc = child->header.loc + i;
+		size_t max = vhd_parent_locator_size(loc);
+
+		switch (loc->code) {
+		case PLAT_CODE_MACX:
+		case PLAT_CODE_W2KU:
+		case PLAT_CODE_W2RU:
+			break;
+		default:
+			continue;
+		}
+
+		err = vhd_parent_locator_write_at(child, ppath,
+						  loc->data_offset,
+						  loc->code, max, loc);
+		if (err) {
+			VHDLOG("error writing parent locator %d for %s: %d\n",
+			       i, child->file, err);
+			goto out;
+		}
+	}
+
+	TEST_FAIL_AT(FAIL_REPARENT_LOCATOR);
+
+	err = vhd_write_header(child, &child->header);
+	if (err) {
+		VHDLOG("error writing header for %s: %d\n", child->file, err);
+		goto out;
+	}
+
+	err = 0;
+
+out:
+	free(ppath);
+	return err;
+}
+
+/*
+ * Build an empty batmap for a dynamic image and write it out.
+ *
+ * The batmap header is reset and given its cookie, version, the payload
+ * offset (one padded header past the header's own offset) and a size in
+ * sectors covering one bit per BAT entry.  A zeroed, sector-aligned map
+ * of that size is allocated into ctx->batmap.map and handed to
+ * vhd_write_batmap().
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_create_batmap(vhd_context_t *ctx)
+{
+	off_t hdr_off;
+	int rc, nbytes;
+	vhd_batmap_header_t *hdr;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	/* one bit per BAT entry, rounded up to whole bytes */
+	nbytes = (ctx->header.max_bat_size + 7) >> 3;
+	hdr = &ctx->batmap.header;
+
+	memset(hdr, 0, sizeof(*hdr));
+	memcpy(hdr->cookie, VHD_BATMAP_COOKIE, sizeof(hdr->cookie));
+
+	rc = vhd_batmap_header_offset(ctx, &hdr_off);
+	if (rc)
+		return rc;
+
+	hdr->batmap_offset  = hdr_off +
+		vhd_bytes_padded(sizeof(vhd_batmap_header_t));
+	hdr->batmap_size    = secs_round_up_no_zero(nbytes);
+	hdr->batmap_version = VHD_BATMAP_CURRENT_VERSION;
+
+	/* from here on the size is the on-disk, sector-granular one */
+	nbytes = vhd_sectors_to_bytes(hdr->batmap_size);
+
+	rc = posix_memalign((void **)&ctx->batmap.map,
+			    VHD_SECTOR_SIZE, nbytes);
+	if (rc) {
+		ctx->batmap.map = NULL;
+		return -rc;
+	}
+
+	memset(ctx->batmap.map, 0, nbytes);
+
+	return vhd_write_batmap(ctx, &ctx->batmap);
+}
+
+/*
+ * Allocate a BAT with header.max_bat_size entries, mark every entry
+ * DD_BLK_UNUSED and write it at header.table_offset.
+ *
+ * The table is kept in ctx->bat.  Returns 0 on success or a negative
+ * errno value.
+ */
+static int
+vhd_create_bat(vhd_context_t *ctx)
+{
+	int err;
+	uint32_t i;
+	size_t size;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	size = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+	err = posix_memalign((void **)&ctx->bat.bat, VHD_SECTOR_SIZE, size);
+	if (err) {
+		ctx->bat.bat = NULL;
+		/* posix_memalign() returns a positive errno value */
+		return -err;
+	}
+
+	/* zero first so the padding beyond the last entry is defined */
+	memset(ctx->bat.bat, 0, size);
+	for (i = 0; i < ctx->header.max_bat_size; i++)
+		ctx->bat.bat[i] = DD_BLK_UNUSED;
+
+	err = vhd_seek(ctx, ctx->header.table_offset, SEEK_SET);
+	if (err)
+		return err;
+
+	ctx->bat.entries = ctx->header.max_bat_size;
+	ctx->bat.spb = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+
+	return vhd_write_bat(ctx, &ctx->bat);
+}
+
+/*
+ * Lay out the data area of a fixed image: starting at offset 0, write
+ * curr_size >> VHD_BLOCK_SHIFT chunks of VHD_BLOCK_SIZE bytes taken from
+ * a read-only anonymous mapping.
+ *
+ * Returns 0 on success, -EINVAL when the footer type is not
+ * HD_TYPE_FIXED, or a negative errno value from seek, mmap or write.
+ */
+static int
+vhd_initialize_fixed_disk(vhd_context_t *ctx)
+{
+	char *filler;
+	int n, rc;
+
+	if (ctx->footer.type != HD_TYPE_FIXED)
+		return -EINVAL;
+
+	rc = vhd_seek(ctx, 0, SEEK_SET);
+	if (rc)
+		return rc;
+
+	filler = mmap(0, VHD_BLOCK_SIZE, PROT_READ,
+		      MAP_SHARED | MAP_ANON, -1, 0);
+	if (filler == MAP_FAILED)
+		return -errno;
+
+	/* stop at the first failed write and report it */
+	rc = 0;
+	for (n = 0;
+	     !rc && n < ctx->footer.curr_size >> VHD_BLOCK_SHIFT;
+	     n++)
+		rc = vhd_write(ctx, filler, VHD_BLOCK_SIZE);
+
+	munmap(filler, VHD_BLOCK_SIZE);
+	return rc;
+}
+
+/*
+ * Compute the physical size of the image: the end-of-data offset from
+ * vhd_end_of_data() plus one footer.
+ *
+ * Returns 0 with the result in *@size, or the error from
+ * vhd_end_of_data().
+ */
+int
+vhd_get_phys_size(vhd_context_t *ctx, off_t *size)
+{
+	int rc;
+
+	rc = vhd_end_of_data(ctx, size);
+	if (rc == 0)
+		*size += sizeof(vhd_footer_t);
+
+	return rc;
+}
+
+/*
+ * Set the physical size of the image to @size by writing the footer so
+ * that it ends exactly at @size.
+ *
+ * A @size below the current physical size is refused with -EINVAL, since
+ * the footer would then land inside existing data.  Otherwise returns the
+ * result of vhd_write_footer_at(), or the error from vhd_get_phys_size().
+ */
+int
+vhd_set_phys_size(vhd_context_t *ctx, off_t size)
+{
+	off_t current;
+	int rc;
+
+	rc = vhd_get_phys_size(ctx, &current);
+	if (rc)
+		return rc;
+
+	if (size >= current)
+		return vhd_write_footer_at(ctx, &ctx->footer,
+					   size - sizeof(vhd_footer_t));
+
+	// would result in data loss
+	VHDLOG("ERROR: new size (%"PRIu64") < phys size (%"PRIu64")\n",
+	       size, current);
+	return -EINVAL;
+}
+
+/*
+ * Create a new VHD image @name of @type with a virtual size of @bytes
+ * rounded up to a whole number of VHD_BLOCK_SIZE blocks.
+ *
+ * HD_TYPE_DIFF requires @parent.  Fixed images get their data area
+ * written out; dynamic and differencing images get a footer copy at
+ * offset 0, the header, an empty batmap and BAT, and (for differencing
+ * images) the parent locators, after which the header is rewritten.  The
+ * trailing footer is written at the end of the file, or
+ * sizeof(vhd_footer_t) before the end when the target is a block device.
+ *
+ * On failure the partially written file is unlinked unless it is a block
+ * device.  Returns 0 on success or a negative errno value.
+ */
+static int
+__vhd_create(const char *name, const char *parent, uint64_t bytes, int type,
+	     vhd_flag_creat_t flags)
+{
+	int err, is_block;
+	off_t off;
+	vhd_context_t ctx;
+	uint64_t size, blks;
+
+	switch (type) {
+	case HD_TYPE_DIFF:
+		if (!parent)
+			return -EINVAL;
+		/* fallthrough */
+	case HD_TYPE_FIXED:
+	case HD_TYPE_DYNAMIC:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (strnlen(name, VHD_MAX_NAME_LEN - 1) == VHD_MAX_NAME_LEN - 1)
+		return -ENAMETOOLONG;
+
+	memset(&ctx, 0, sizeof(vhd_context_t));
+	blks = (bytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+	size = blks << VHD_BLOCK_SHIFT;
+
+	ctx.fd = open(name, O_WRONLY | O_CREAT |
+		      O_TRUNC | O_LARGEFILE | O_DIRECT, 0644);
+	if (ctx.fd == -1)
+		return -errno;
+
+	ctx.file = strdup(name);
+	if (!ctx.file) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = vhd_test_file_fixed(ctx.file, &ctx.is_block);
+	if (err)
+		goto out;
+
+	vhd_initialize_footer(&ctx, type, size);
+
+	if (type == HD_TYPE_FIXED) {
+		err = vhd_initialize_fixed_disk(&ctx);
+		if (err)
+			goto out;
+	} else {
+		int raw = vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW);
+		err = vhd_initialize_header(&ctx, parent, size, raw);
+		if (err)
+			goto out;
+
+		err = vhd_write_footer_at(&ctx, &ctx.footer, 0);
+		if (err)
+			goto out;
+
+		err = vhd_write_header_at(&ctx, &ctx.header, VHD_SECTOR_SIZE);
+		if (err)
+			goto out;
+
+		err = vhd_create_batmap(&ctx);
+		if (err)
+			goto out;
+
+		err = vhd_create_bat(&ctx);
+		if (err)
+			goto out;
+
+		if (type == HD_TYPE_DIFF) {
+			err = vhd_write_parent_locators(&ctx, parent);
+			if (err)
+				goto out;
+		}
+
+		/* write header again since it may have changed */
+		err = vhd_write_header_at(&ctx, &ctx.header, VHD_SECTOR_SIZE);
+		if (err)
+			goto out;
+	}
+
+	err = vhd_seek(&ctx, 0, SEEK_END);
+	if (err)
+		goto out;
+
+	off = vhd_position(&ctx);
+	if (off == (off_t)-1) {
+		err = -errno;
+		goto out;
+	}
+
+	if (ctx.is_block)
+		off -= sizeof(vhd_footer_t);
+
+	err = vhd_write_footer_at(&ctx, &ctx.footer, off);
+	if (err)
+		goto out;
+
+	err = 0;
+
+out:
+	/* vhd_close() zeroes ctx, so remember what is needed afterwards */
+	is_block = ctx.is_block;
+
+	/*
+	 * vhd_close() only closes the descriptor when a file name is
+	 * recorded; cover the strdup() failure path here.
+	 */
+	if (!ctx.file)
+		close(ctx.fd);
+
+	vhd_close(&ctx);
+	if (err && !is_block)
+		unlink(name);
+	return err;
+}
+
+/*
+ * Create a new image @name of @type with a virtual size of @bytes.
+ * Thin wrapper around __vhd_create() that passes no parent; returns
+ * whatever __vhd_create() returns.
+ */
+int
+vhd_create(const char *name, uint64_t bytes, int type, vhd_flag_creat_t flags)
+{
+ return __vhd_create(name, NULL, bytes, type, flags);
+}
+
+/*
+ * Create a differencing image @name on top of @parent.
+ * Thin wrapper around __vhd_create() with the type fixed to HD_TYPE_DIFF;
+ * returns whatever __vhd_create() returns.
+ */
+int
+vhd_snapshot(const char *name, uint64_t bytes, const char *parent,
+ vhd_flag_creat_t flags)
+{
+ return __vhd_create(name, parent, bytes, HD_TYPE_DIFF, flags);
+}
+
+/*
+ * Read @secs sectors starting at sector @sec of a fixed image into @buf.
+ * The virtual sector number is used directly as the file position.
+ * Returns 0 on success or the error from vhd_seek() / vhd_read().
+ */
+static int
+__vhd_io_fixed_read(vhd_context_t *ctx,
+		    char *buf, uint64_t sec, uint32_t secs)
+{
+	int rc = vhd_seek(ctx, vhd_sectors_to_bytes(sec), SEEK_SET);
+
+	return rc ? rc : vhd_read(ctx, buf, vhd_sectors_to_bytes(secs));
+}
+
+/*
+ * Copy up to @secs sectors from @src to @dst, one sector at a time.
+ *
+ * A sector is copied only when its bit in @map (at @map_off + i) is still
+ * clear and, if @ctx is non-NULL, the block bitmap @bitmap marks it
+ * (at @bitmap_off + i).  Each copied sector has its @map bit set.
+ * @src and @dst advance by one sector per iteration regardless.
+ */
+static void
+__vhd_io_dynamic_copy_data(vhd_context_t *ctx,
+			   char *map, int map_off,
+			   char *bitmap, int bitmap_off,
+			   char *dst, char *src, int secs)
+{
+	int n;
+
+	for (n = 0; n < secs;
+	     n++, src += VHD_SECTOR_SIZE, dst += VHD_SECTOR_SIZE) {
+		/* already satisfied earlier */
+		if (test_bit(map, map_off + n))
+			continue;
+
+		/* this link holds no data for the sector */
+		if (ctx && !vhd_bitmap_test(ctx, bitmap, bitmap_off + n))
+			continue;
+
+		memcpy(dst, src, VHD_SECTOR_SIZE);
+		set_bit(map, map_off + n);
+	}
+}
+
+/*
+ * Satisfy as much as possible of a @secs-sector read starting at @sector
+ * from this one image, without looking at its parent.
+ *
+ * The request is processed block by block.  Unallocated blocks are
+ * skipped; for allocated blocks the bitmap and data are read and handed
+ * to __vhd_io_dynamic_copy_data(), which fills @buf and records the
+ * sectors it supplied in @map.
+ *
+ * Returns 0 on success or the error from reading a bitmap or block.
+ */
+static int
+__vhd_io_dynamic_read_link(vhd_context_t *ctx, char *map,
+			   char *buf, uint64_t sector, uint32_t secs)
+{
+	off_t off;
+	uint32_t blk, sec;
+	int err, cnt, map_off;
+	char *bitmap, *data, *src;
+
+	map_off = 0;
+
+	do {
+		blk    = sector / ctx->spb;
+		sec    = sector % ctx->spb;
+		off    = ctx->bat.bat[blk];
+		data   = NULL;
+		bitmap = NULL;
+
+		if (off == DD_BLK_UNUSED) {
+			/*
+			 * Skip only to the end of this block; a request
+			 * starting mid-block must not run over the first
+			 * sectors of the next block, which may be allocated.
+			 */
+			cnt = MIN(secs, ctx->spb - sec);
+			goto next;
+		}
+
+		err = vhd_read_bitmap(ctx, blk, &bitmap);
+		if (err)
+			return err;
+
+		err = vhd_read_block(ctx, blk, &data);
+		if (err) {
+			free(bitmap);
+			return err;
+		}
+
+		cnt = MIN(secs, ctx->spb - sec);
+		src = data + vhd_sectors_to_bytes(sec);
+
+		__vhd_io_dynamic_copy_data(ctx,
+					   map, map_off,
+					   bitmap, sec,
+					   buf, src, cnt);
+
+	next:
+		free(data);
+		free(bitmap);
+
+		secs    -= cnt;
+		sector  += cnt;
+		map_off += cnt;
+		buf     += vhd_sectors_to_bytes(cnt);
+
+	} while (secs);
+
+	return 0;
+}
+
+/*
+ * Fill the still-missing sectors of a read from the raw (non-VHD) parent
+ * @filename.
+ *
+ * @secs sectors starting at sector @sec are read through a
+ * sector-aligned bounce buffer and merged into @buf via
+ * __vhd_io_dynamic_copy_data() with no bitmap, so every sector whose bit
+ * in @map is still clear is taken from the file.
+ *
+ * Returns 0 on success or a negative errno value (-EIO for a short read
+ * without an errno).
+ */
+static int
+__raw_read_link(char *filename,
+		char *map, char *buf, uint64_t sec, uint32_t secs)
+{
+	int fd, err;
+	off_t off;
+	uint64_t size;
+	ssize_t ret;
+	char *data;
+
+	errno = 0;
+	fd = open(filename, O_RDONLY | O_DIRECT | O_LARGEFILE);
+	if (fd == -1) {
+		/* capture errno before logging can disturb it */
+		err = -errno;
+		VHDLOG("%s: failed to open: %d\n", filename, err);
+		return err;
+	}
+
+	off = lseek(fd, vhd_sectors_to_bytes(sec), SEEK_SET);
+	if (off == (off_t)-1) {
+		err = -errno;
+		VHDLOG("%s: seek(0x%08"PRIx64") failed: %d\n",
+		       filename, vhd_sectors_to_bytes(sec), err);
+		goto close;
+	}
+
+	size = vhd_sectors_to_bytes(secs);
+	err = posix_memalign((void **)&data, VHD_SECTOR_SIZE, size);
+	if (err) {
+		/* posix_memalign() returns a positive errno value */
+		err = -err;
+		goto close;
+	}
+
+	errno = 0;
+	ret = read(fd, data, size);
+	if (ret < 0 || (uint64_t)ret != size) {
+		err = errno ? -errno : -EIO;
+		VHDLOG("%s: reading of %"PRIu64" returned %zd, errno: %d\n",
+		       filename, size, ret, err);
+		free(data);
+		goto close;
+	}
+	__vhd_io_dynamic_copy_data(NULL, map, 0, NULL, 0, buf, data, secs);
+	free(data);
+	err = 0;
+
+close:
+	close(fd);
+	return err;
+}
+
+/*
+ * Read @secs sectors starting at @sec from a dynamic/differencing image,
+ * walking up the parent chain for sectors this image does not supply.
+ *
+ * @buf is zeroed up front, so sectors found in no link read as zeros.
+ * @map is a scratch bitmap in which each link records the sectors it has
+ * supplied; the walk stops as soon as every requested sector is marked,
+ * when a non-differencing image is reached, or after a raw parent has
+ * been read.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+__vhd_io_dynamic_read(vhd_context_t *ctx,
+ char *buf, uint64_t sec, uint32_t secs)
+{
+ int err;
+ uint32_t i, done;
+ char *map, *next;
+ vhd_context_t parent, *vhd;
+
+ err = vhd_get_bat(ctx);
+ if (err)
+ return err;
+
+ vhd = ctx;
+ next = NULL;
+ map = calloc(1, secs << (VHD_SECTOR_SHIFT - 3));
+ if (!map)
+ return -ENOMEM;
+
+ memset(buf, 0, vhd_sectors_to_bytes(secs));
+
+ for (;;) {
+ /* take whatever the current link can supply */
+ err = __vhd_io_dynamic_read_link(vhd, map, buf, sec, secs);
+ if (err)
+ goto close;
+
+ /* count the sectors satisfied so far */
+ for (done = 0, i = 0; i < secs; i++)
+ if (test_bit(map, i))
+ done++;
+
+ if (done == secs) {
+ err = 0;
+ goto close;
+ }
+
+ if (vhd->footer.type == HD_TYPE_DIFF) {
+ err = vhd_parent_locator_get(vhd, &next);
+ if (err)
+ goto close;
+ if (vhd_parent_raw(vhd)) {
+ /* a raw parent ends the chain */
+ err = __raw_read_link(next, map, buf, sec,
+ secs);
+ goto close;
+ }
+ } else {
+ /* no parent: the remaining sectors stay zero */
+ err = 0;
+ goto close;
+ }
+
+ /* release the previous link, but never the caller's context */
+ if (vhd != ctx)
+ vhd_close(vhd);
+ vhd = &parent;
+
+ err = vhd_open(vhd, next, VHD_OPEN_RDONLY);
+ /* a failed open leaves nothing to close, hence 'out' not 'close' */
+ if (err)
+ goto out;
+
+ err = vhd_get_bat(vhd);
+ if (err)
+ goto close;
+
+ free(next);
+ next = NULL;
+ }
+
+close:
+ if (vhd != ctx)
+ vhd_close(vhd);
+out:
+ free(map);
+ free(next);
+ return err;
+}
+
+/*
+ * Read @secs sectors starting at virtual sector @sec into @buf.
+ *
+ * Returns -ERANGE when the request extends past footer.curr_size;
+ * otherwise dispatches to the fixed or dynamic read path and returns its
+ * result.
+ */
+int
+vhd_io_read(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs)
+{
+	if (vhd_sectors_to_bytes(sec + secs) > ctx->footer.curr_size)
+		return -ERANGE;
+
+	return vhd_type_dynamic(ctx)
+		? __vhd_io_dynamic_read(ctx, buf, sec, secs)
+		: __vhd_io_fixed_read(ctx, buf, sec, secs);
+}
+
+/*
+ * Write @secs sectors from @buf at sector @sec of a fixed image.
+ * The virtual sector number is used directly as the file position.
+ * Returns 0 on success or the error from vhd_seek() / vhd_write().
+ */
+static int
+__vhd_io_fixed_write(vhd_context_t *ctx,
+		     char *buf, uint64_t sec, uint32_t secs)
+{
+	int rc = vhd_seek(ctx, vhd_sectors_to_bytes(sec), SEEK_SET);
+
+	return rc ? rc : vhd_write(ctx, buf, vhd_sectors_to_bytes(secs));
+}
+
+/*
+ * Allocate on-disk space for BAT entry @block at the current end of data
+ * and record it in the BAT.
+ *
+ * The new region (optional alignment gap + bitmap + data sectors) is
+ * written from a read-only anonymous mapping, then the BAT entry is set
+ * and the whole BAT is written back.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+__vhd_io_allocate_block(vhd_context_t *ctx, uint32_t block)
+{
+ char *buf;
+ size_t size;
+ off_t off, max;
+ int i, err, gap, spp;
+
+ /* sectors per page */
+ spp = getpagesize() >> VHD_SECTOR_SHIFT;
+
+ err = vhd_end_of_data(ctx, &max);
+ if (err)
+ return err;
+
+ /* 'off' keeps the byte offset; 'max' becomes a sector number */
+ gap = 0;
+ off = max;
+ max >>= VHD_SECTOR_SHIFT;
+
+ /* data region of segment should begin on page boundary */
+ if ((max + ctx->bm_secs) % spp) {
+ gap = (spp - ((max + ctx->bm_secs) % spp));
+ max += gap;
+ }
+
+ err = vhd_seek(ctx, off, SEEK_SET);
+ if (err)
+ return err;
+
+ /* the write starts at the old end of data and covers the gap too */
+ size = vhd_sectors_to_bytes(ctx->spb + ctx->bm_secs + gap);
+ buf = mmap(0, size, PROT_READ, MAP_SHARED | MAP_ANON, -1, 0);
+ if (buf == MAP_FAILED)
+ return -errno;
+
+ err = vhd_write(ctx, buf, size);
+ if (err)
+ goto out;
+
+ /* the block itself begins after the gap */
+ ctx->bat.bat[block] = max;
+ err = vhd_write_bat(ctx, &ctx->bat);
+ if (err)
+ goto out;
+
+ err = 0;
+
+out:
+ munmap(buf, size);
+ return err;
+}
+
+/*
+ * Write @secs sectors from @buf at virtual sector @sector of a
+ * dynamic/differencing image.
+ *
+ * The request is processed block by block: an unallocated block is
+ * allocated first, the data is written, and then the block's sector
+ * bitmap is updated.  When the image has a batmap and the block is
+ * already flagged there, the bitmap update is skipped; when a bitmap
+ * becomes completely set, the block is flagged in the batmap and the
+ * batmap is written.
+ *
+ * The footer is rewritten on the normal exit and on the 'fail' path.
+ * NOTE(review): the early 'return err' statements inside the loop leave
+ * without rewriting the footer -- confirm this is intended.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+__vhd_io_dynamic_write(vhd_context_t *ctx,
+ char *buf, uint64_t sector, uint32_t secs)
+{
+ char *map;
+ off_t off;
+ uint32_t blk, sec;
+ int i, err, cnt, ret;
+
+ if (vhd_sectors_to_bytes(sector + secs) > ctx->footer.curr_size)
+ return -ERANGE;
+
+ err = vhd_get_bat(ctx);
+ if (err)
+ return err;
+
+ if (vhd_has_batmap(ctx)) {
+ err = vhd_get_batmap(ctx);
+ if (err)
+ return err;
+ }
+
+ do {
+ blk = sector / ctx->spb;
+ sec = sector % ctx->spb;
+
+ off = ctx->bat.bat[blk];
+ if (off == DD_BLK_UNUSED) {
+ err = __vhd_io_allocate_block(ctx, blk);
+ if (err)
+ return err;
+
+ /* pick up the entry the allocation just recorded */
+ off = ctx->bat.bat[blk];
+ }
+
+ /* skip the block's bitmap and the leading sectors */
+ off += ctx->bm_secs + sec;
+ err = vhd_seek(ctx, vhd_sectors_to_bytes(off), SEEK_SET);
+ if (err)
+ return err;
+
+ /* never write past the end of the current block */
+ cnt = MIN(secs, ctx->spb - sec);
+ err = vhd_write(ctx, buf, vhd_sectors_to_bytes(cnt));
+ if (err)
+ return err;
+
+ /* block already flagged in the batmap: bitmap needs no update */
+ if (vhd_has_batmap(ctx) &&
+ vhd_batmap_test(ctx, &ctx->batmap, blk))
+ goto next;
+
+ err = vhd_read_bitmap(ctx, blk, &map);
+ if (err)
+ return err;
+
+ for (i = 0; i < cnt; i++)
+ vhd_bitmap_set(ctx, map, sec + i);
+
+ err = vhd_write_bitmap(ctx, blk, map);
+ if (err)
+ goto fail;
+
+ if (vhd_has_batmap(ctx)) {
+ /* flag the block in the batmap only once every sector is set */
+ for (i = 0; i < ctx->spb; i++)
+ if (!vhd_bitmap_test(ctx, map, i)) {
+ free(map);
+ goto next;
+ }
+
+ vhd_batmap_set(ctx, &ctx->batmap, blk);
+ err = vhd_write_batmap(ctx, &ctx->batmap);
+ if (err)
+ goto fail;
+ }
+
+ free(map);
+ map = NULL;
+
+ next:
+ secs -= cnt;
+ sector += cnt;
+ buf += vhd_sectors_to_bytes(cnt);
+ } while (secs);
+
+ err = 0;
+
+out:
+ /* the first error wins; otherwise report the footer write result */
+ ret = vhd_write_footer(ctx, &ctx->footer);
+ return (err ? err : ret);
+
+fail:
+ free(map);
+ goto out;
+}
+
+/*
+ * Write @secs sectors from @buf starting at virtual sector @sec.
+ *
+ * Returns -ERANGE when the request extends past footer.curr_size;
+ * otherwise dispatches to the fixed or dynamic write path and returns its
+ * result.
+ */
+int
+vhd_io_write(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs)
+{
+	if (vhd_sectors_to_bytes(sec + secs) > ctx->footer.curr_size)
+		return -ERANGE;
+
+	return vhd_type_dynamic(ctx)
+		? __vhd_io_dynamic_write(ctx, buf, sec, secs)
+		: __vhd_io_fixed_write(ctx, buf, sec, secs);
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "relative-path.h"
+
+/*
+ * free @ptr and reset it to NULL so a later sfree()/free() of the same
+ * variable is harmless.  @ptr must be an lvalue and is evaluated twice.
+ */
+#define sfree(ptr) \
+do { \
+ free(ptr); \
+ ptr = NULL; \
+} while (0)
+
+/*
+ * count the DELIMITER characters in @path
+ * returns 0 for a NULL @path
+ */
+static int
+count_nodes(char *path)
+{
+	int total = 0;
+
+	if (path)
+		for (; *path != '\0'; path++)
+			total += (*path == DELIMITER);
+
+	return total;
+}
+
+/*
+ * return a copy of the text between the start of *@path and the next
+ * DELIMITER, or NULL when no DELIMITER remains
+ * *@path is advanced to that DELIMITER
+ * *@err is 0 on success (including "no more nodes"), -EINVAL for a NULL
+ * argument or formatting failure, -ENOMEM when the copy cannot be allocated
+ * the copy should be freed
+ */
+static char *
+next_node(char **path, int *err)
+{
+	char *head, *sep, *copy;
+	int len;
+
+	if (!path || !*path) {
+		*err = -EINVAL;
+		return NULL;
+	}
+
+	*err = 0;
+	head = *path;
+
+	sep = strchr(head, DELIMITER);
+	if (!sep)
+		return NULL;
+
+	/* room for the node text plus the terminating NUL */
+	len = sep - head + 1;
+	copy = malloc(len);
+	if (!copy) {
+		*err = -ENOMEM;
+		return NULL;
+	}
+
+	/* the size limit makes snprintf stop right before the delimiter */
+	if (snprintf(copy, len, "%s", head) < 0) {
+		free(copy);
+		*err = -EINVAL;
+		return NULL;
+	}
+
+	*path = sep;
+	return copy;
+}
+
+/*
+ * count the leading nodes that @to and @from have in common
+ * returns the number of common nodes, or -errno on failure
+ */
+static int
+count_common_nodes(char *to, char *from)
+{
+	int rc = 0, shared = 0;
+	char *lhs = NULL, *rhs = NULL;
+
+	if (!to || !from)
+		return -EINVAL;
+
+	for (;;) {
+		lhs = next_node(&to, &rc);
+		if (rc || !lhs)
+			break;
+
+		rhs = next_node(&from, &rc);
+		if (rc || !rhs)
+			break;
+
+		if (strncmp(lhs, rhs, MAX_NAME_LEN) != 0)
+			break;
+
+		/* step over the delimiter next_node() stopped on */
+		to++;
+		from++;
+		shared++;
+
+		free(lhs);
+		lhs = NULL;
+		free(rhs);
+		rhs = NULL;
+	}
+
+	/* whichever copies were live when the loop ended */
+	free(lhs);
+	free(rhs);
+
+	return rc ? rc : shared;
+}
+
+/*
+ * build a string of @count "../" components, or "./" when @count is zero
+ * returns NULL when the result would reach MAX_NAME_LEN or on allocation
+ * failure
+ * result should be freed
+ */
+static char *
+up_nodes(int count)
+{
+	static const char step[] = "../";
+	const int step_len = sizeof(step) - 1;
+	char *out, *cursor;
+	int n, total;
+
+	if (count == 0)
+		return strdup("./");
+
+	total = step_len * count;
+	if (total >= MAX_NAME_LEN)
+		return NULL;
+
+	out = malloc(total + 1);
+	if (!out)
+		return NULL;
+
+	/* each copy carries its NUL, so the string is always terminated */
+	for (n = 0, cursor = out; n < count; n++, cursor += step_len)
+		memcpy(cursor, step, step_len + 1);
+
+	return out;
+}
+
+/*
+ * return a pointer just past the @offset'th DELIMITER in @from
+ * returns NULL when @from is NULL, @offset is zero, or @from holds fewer
+ * than @offset delimiters
+ */
+static char *
+node_offset(char *from, int offset)
+{
+	char *sep;
+
+	if (!from || !offset)
+		return NULL;
+
+	for (sep = strchr(from, DELIMITER);
+	     sep != NULL;
+	     sep = strchr(sep + 1, DELIMITER))
+		if (--offset == 0)
+			return sep + 1;
+
+	return NULL;
+}
+
+/*
+ * build a relative path leading from @from to @to
+ *
+ * both arguments are resolved with realpath(); the result is one "../"
+ * per node of @from below the common prefix (or "./" when there is none),
+ * followed by the part of @to beyond the common prefix
+ *
+ * returns the new path, or NULL with *@err set to a negative errno value
+ * result should be freed
+ */
+char *
+relative_path_to(char *from, char *to, int *err)
+{
+	int depth, shared;
+	char *abs_to = NULL, *abs_from = NULL;
+	char *prefix = NULL, *tail, *result = NULL;
+
+	*err = 0;
+
+	if (strnlen(to, MAX_NAME_LEN) == MAX_NAME_LEN ||
+	    strnlen(from, MAX_NAME_LEN) == MAX_NAME_LEN) {
+		EPRINTF("invalid input; max path length is %d\n",
+			MAX_NAME_LEN);
+		*err = -ENAMETOOLONG;
+		return NULL;
+	}
+
+	abs_to = realpath(to, NULL);
+	if (!abs_to) {
+		EPRINTF("failed to get absolute path of %s\n", to);
+		*err = -errno;
+		goto done;
+	}
+
+	abs_from = realpath(from, NULL);
+	if (!abs_from) {
+		EPRINTF("failed to get absolute path of %s\n", from);
+		*err = -errno;
+		goto done;
+	}
+
+	if (strnlen(abs_to, MAX_NAME_LEN) == MAX_NAME_LEN ||
+	    strnlen(abs_from, MAX_NAME_LEN) == MAX_NAME_LEN) {
+		EPRINTF("invalid input; max path length is %d\n",
+			MAX_NAME_LEN);
+		*err = -ENAMETOOLONG;
+		goto done;
+	}
+
+	/* how deep the source sits */
+	depth = count_nodes(abs_from);
+
+	/* how many leading nodes both paths share (leading '/' skipped) */
+	shared = count_common_nodes(abs_to + 1, abs_from + 1);
+	if (shared < 0) {
+		EPRINTF("failed to count common nodes of %s and %s: %d\n",
+			abs_to, abs_from, shared);
+		*err = shared;
+		goto done;
+	}
+
+	/* climb from the source up to the shared node */
+	prefix = up_nodes(depth - shared - 1);
+	if (!prefix) {
+		EPRINTF("failed to allocate relative path for %s: %d\n",
+			abs_from, -ENOMEM);
+		*err = -ENOMEM;
+		goto done;
+	}
+
+	/* what remains of the target below the shared node */
+	tail = node_offset(abs_to, shared + 1);
+	if (!tail) {
+		EPRINTF("failed to find common target path to %s: %d\n",
+			abs_to, -EINVAL);
+		*err = -EINVAL;
+		goto done;
+	}
+
+	/* glue both halves together */
+	if (asprintf(&result, "%s%s", prefix, tail) == -1) {
+		EPRINTF("failed to construct final path %s%s: %d\n",
+			prefix, tail, -ENOMEM);
+		result = NULL;
+		*err = -ENOMEM;
+		goto done;
+	}
+
+done:
+	free(prefix);
+	free(abs_to);
+	free(abs_from);
+
+	return result;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <inttypes.h>
+#include <sys/stat.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+// allow the VHD timestamp to be at most this many seconds into the future to
+// account for time skew with NFS servers
+#define TIMESTAMP_MAX_SLACK 1800
+
+/*
+ * Check whether the @size bytes at @buf are all zero.
+ *
+ * Returns 0 when every byte is zero (or @size is 0) and 1 as soon as a
+ * non-zero byte is found.  A non-zero byte at index 0 is reported like
+ * any other, so the result can be used directly as a truth value.
+ */
+static int
+vhd_util_check_zeros(void *buf, size_t size)
+{
+	size_t i;
+	const char *p;
+
+	p = buf;
+	for (i = 0; i < size; i++)
+		if (p[i])
+			return 1;
+
+	return 0;
+}
+
+/*
+ * Detect a footer that consists entirely of the fill word 0xc7c7c7c7.
+ *
+ * The footer is scanned as an array of 32-bit words; sizeof(*footer) /
+ * sizeof(uint32_t) words are examined.
+ *
+ * Returns 1 when every examined word equals 0xc7c7c7c7, 0 otherwise.
+ */
+static int
+vhd_util_check_footer_opened(vhd_footer_t *footer)
+{
+	const uint32_t *word = (uint32_t *)footer;
+	const uint32_t *end  = word + sizeof(*footer) / sizeof(uint32_t);
+
+	while (word < end)
+		if (*word++ != 0xc7c7c7c7)
+			return 0;
+
+	return 1;
+}
+
+/*
+ * Run the sanity checks on a VHD footer.
+ *
+ * Returns NULL when every check passes, otherwise a static string naming
+ * the first check that failed (the checks run in the order written, so the
+ * order determines which message is reported).  Callers in this file pass a
+ * footer that has already been converted with vhd_footer_in().
+ */
+static char *
+vhd_util_check_validate_footer(vhd_footer_t *footer)
+{
+ int size;
+ uint32_t checksum, now;
+
+ size = sizeof(footer->cookie);
+ if (memcmp(footer->cookie, HD_COOKIE, size))
+ return "invalid cookie";
+
+ checksum = vhd_checksum_footer(footer);
+ if (checksum != footer->checksum) {
+ /*
+ * Second chance for footers with 'hidden' set that were created by
+ * "tap" versions 0.1 or 1.1: recompute the checksum with 'hidden'
+ * temporarily cleared and accept the footer if that matches.
+ */
+ if (footer->hidden &&
+ !strncmp(footer->crtr_app, "tap", 3) &&
+ (footer->crtr_ver == VHD_VERSION(0, 1) ||
+ footer->crtr_ver == VHD_VERSION(1, 1))) {
+ char tmp = footer->hidden;
+ footer->hidden = 0;
+ checksum = vhd_checksum_footer(footer);
+ footer->hidden = tmp;
+
+ if (checksum == footer->checksum)
+ goto ok;
+ }
+
+ return "invalid checksum";
+ }
+
+ok:
+ /* the HD_RESERVED feature bit must be set ... */
+ if (!(footer->features & HD_RESERVED))
+ return "invalid 'reserved' feature";
+
+ /* ... and nothing other than HD_TEMPORARY may accompany it */
+ if (footer->features & ~(HD_TEMPORARY | HD_RESERVED))
+ return "invalid extra features";
+
+ if (footer->ff_version != HD_FF_VERSION)
+ return "invalid file format version";
+
+ /* footers that are neither dynamic nor differencing must carry an
+ * all-ones data offset */
+ if (footer->type != HD_TYPE_DYNAMIC &&
+ footer->type != HD_TYPE_DIFF &&
+ footer->data_offset != ~(0ULL))
+ return "invalid data offset";
+
+ /* tolerate timestamps up to TIMESTAMP_MAX_SLACK seconds ahead of now */
+ now = vhd_time(time(NULL));
+ if (footer->timestamp > now + TIMESTAMP_MAX_SLACK)
+ return "creation time in future";
+
+ /* the creator version is only checked for "tap"-created images */
+ if (!strncmp(footer->crtr_app, "tap", 3) &&
+ footer->crtr_ver > VHD_CURRENT_VERSION)
+ return "unsupported tap creator version";
+
+ if (vhd_chs(footer->curr_size) < footer->geometry)
+ return "geometry too large";
+
+ if (footer->type != HD_TYPE_FIXED &&
+ footer->type != HD_TYPE_DYNAMIC &&
+ footer->type != HD_TYPE_DIFF)
+ return "invalid type";
+
+ /* 'saved' and 'hidden' may only hold 0 or 1 */
+ if (footer->saved && footer->saved != 1)
+ return "invalid 'saved' state";
+
+ if (footer->hidden && footer->hidden != 1)
+ return "invalid 'hidden' state";
+
+ if (vhd_util_check_zeros(footer->reserved,
+ sizeof(footer->reserved)))
+ return "invalid 'reserved' bits";
+
+ return NULL;
+}
+
+/*
+ * Run the sanity checks on a VHD header.
+ *
+ * @fd is only used to find the end of the file (lseek to SEEK_END), which
+ * moves the file position.  Returns NULL when every check passes, otherwise
+ * a static string naming the first check that failed.
+ */
+static char *
+vhd_util_check_validate_header(int fd, vhd_header_t *header)
+{
+ off_t eof;
+ int i, cnt, size;
+ uint32_t checksum;
+
+ size = sizeof(header->cookie);
+ if (memcmp(header->cookie, DD_COOKIE, size))
+ return "invalid cookie";
+
+ checksum = vhd_checksum_header(header);
+ if (checksum != header->checksum)
+ return "invalid checksum";
+
+ if (header->hdr_ver != 0x00010000)
+ return "invalid header version";
+
+ /* the header's own data offset must be all ones */
+ if (header->data_offset != ~(0ULL))
+ return "invalid data offset";
+
+ eof = lseek(fd, 0, SEEK_END);
+ if (eof == (off_t)-1)
+ return "error finding eof";
+
+ /*
+ * The table must start at a non-zero, 512-byte aligned offset, and its
+ * max_bat_size 32-bit entries must end no later than where the
+ * footer-sized region at the end of the file begins.
+ */
+ if (header->table_offset <= 0 ||
+ header->table_offset % 512 ||
+ (header->table_offset +
+ (header->max_bat_size * sizeof(uint32_t)) >
+ eof - sizeof(vhd_footer_t)))
+ return "invalid table offset";
+
+ /* count the set bits: a valid block size has exactly one (power of two) */
+ for (cnt = 0, i = 0; i < sizeof(header->block_size) * 8; i++)
+ if ((header->block_size >> i) & 1)
+ cnt++;
+
+ if (cnt != 1)
+ return "invalid block size";
+
+ if (header->res1)
+ return "invalid reserved bits";
+
+ if (vhd_util_check_zeros(header->res2, sizeof(header->res2)))
+ return "invalid reserved bits";
+
+ return NULL;
+}
+
+/*
+ * Validate the parent-related header fields of an open VHD.
+ *
+ * For a differencing disk (footer type HD_TYPE_DIFF) the parent timestamp
+ * may not lie more than TIMESTAMP_MAX_SLACK seconds in the future and the
+ * parent name must decode.  For every other type the parent name, parent
+ * locators, parent uuid and parent timestamp must all be empty.
+ *
+ * Returns NULL on success, otherwise a static string naming the first
+ * failed check.
+ */
+static char *
+vhd_util_check_validate_differencing_header(vhd_context_t *vhd)
+{
+	vhd_header_t *hdr = &vhd->header;
+	uint32_t now;
+	char *pname;
+
+	if (vhd->footer.type != HD_TYPE_DIFF) {
+		/* no parent expected: every parent field has to be blank */
+		if (vhd_util_check_zeros(hdr->prt_name, sizeof(hdr->prt_name)))
+			return "invalid non-null parent name";
+
+		if (vhd_util_check_zeros(hdr->loc, sizeof(hdr->loc)))
+			return "invalid non-null parent locators";
+
+		if (!vhd_uuid_is_nil(&hdr->prt_uuid))
+			return "invalid non-null parent uuid";
+
+		if (hdr->prt_ts)
+			return "invalid non-zero parent timestamp";
+
+		return NULL;
+	}
+
+	now = vhd_time(time(NULL));
+	if (hdr->prt_ts > now + TIMESTAMP_MAX_SLACK)
+		return "parent creation time in future";
+
+	/* the decoded name is only needed to prove that decoding works */
+	if (vhd_header_decode_parent(vhd, hdr, &pname))
+		return "invalid parent name";
+
+	free(pname);
+	return NULL;
+}
+
+/*
+ * Run the sanity checks on a batmap that has already been read.
+ *
+ * vhd->fd is used to find the end of the file (lseek to SEEK_END), which
+ * moves the file position.  Returns NULL when every check passes, otherwise
+ * a static string naming the first check that failed.
+ */
+static char *
+vhd_util_check_validate_batmap(vhd_context_t *vhd, vhd_batmap_t *batmap)
+{
+ int size;
+ off_t eof;
+ uint32_t checksum;
+
+ size = sizeof(batmap->header.cookie);
+ if (memcmp(batmap->header.cookie, VHD_BATMAP_COOKIE, size))
+ return "invalid cookie";
+
+ if (batmap->header.batmap_version > VHD_BATMAP_CURRENT_VERSION)
+ return "unsupported batmap version";
+
+ checksum = vhd_checksum_batmap(batmap);
+ if (checksum != batmap->header.checksum)
+ return "invalid checksum";
+
+ if (!batmap->header.batmap_size)
+ return "invalid size zero";
+
+ eof = lseek(vhd->fd, 0, SEEK_END);
+ if (eof == (off_t)-1)
+ return "error finding eof";
+
+ /* the batmap must start at a non-zero, 512-byte aligned offset */
+ if (!batmap->header.batmap_offset ||
+ batmap->header.batmap_offset % 512)
+ return "invalid batmap offset";
+
+ /* batmap_size is in sectors; the map must end before the footer-sized
+ * region at the end of the file */
+ if ((batmap->header.batmap_offset +
+ vhd_sectors_to_bytes(batmap->header.batmap_size)) >
+ eof - sizeof(vhd_footer_t))
+ return "invalid batmap size";
+
+ return NULL;
+}
+
+/*
+ * Run the sanity checks on one parent locator entry.
+ *
+ * vhd->fd is used to find the end of the file (lseek to SEEK_END), which
+ * moves the file position.  Returns NULL when every check passes, otherwise
+ * a static string naming the first check that failed.
+ */
+static char *
+vhd_util_check_validate_parent_locator(vhd_context_t *vhd,
+ vhd_parent_locator_t *loc)
+{
+ off_t eof;
+
+ if (vhd_validate_platform_code(loc->code))
+ return "invalid platform code";
+
+ /* an entry with code PLAT_CODE_NONE has to be entirely zero */
+ if (loc->code == PLAT_CODE_NONE) {
+ if (vhd_util_check_zeros(loc, sizeof(*loc)))
+ return "non-zero locator";
+
+ return NULL;
+ }
+
+ if (!loc->data_offset)
+ return "invalid data offset";
+
+ if (!loc->data_space)
+ return "invalid data space";
+
+ if (!loc->data_len)
+ return "invalid data length";
+
+ eof = lseek(vhd->fd, 0, SEEK_END);
+ if (eof == (off_t)-1)
+ return "error finding eof";
+
+ /* the locator data must end before the footer-sized region at the end
+ * of the file */
+ if (loc->data_offset + vhd_parent_locator_size(loc) >
+ eof - sizeof(vhd_footer_t))
+ return "invalid size";
+
+ if (loc->res)
+ return "invalid reserved bits";
+
+ return NULL;
+}
+
+/*
+ * Check that the file at @ppath is the parent recorded in @vhd's header.
+ *
+ * Nothing is checked when vhd_parent_raw() reports a raw parent.
+ * Otherwise the parent is opened read-only and its footer uuid is compared
+ * with the parent uuid stored in the child's header.
+ *
+ * Returns NULL on success, otherwise a static string describing the
+ * problem.
+ */
+static const char *
+vhd_util_check_validate_parent(vhd_context_t *vhd, const char *ppath)
+{
+	vhd_context_t parent;
+	const char *result = NULL;
+
+	if (vhd_parent_raw(vhd))
+		return NULL;
+
+	if (vhd_open(&parent, ppath,
+		     VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED))
+		return "error opening parent";
+
+	if (vhd_uuid_compare(&vhd->header.prt_uuid, &parent.footer.uuid))
+		result = "invalid parent uuid";
+
+	vhd_close(&parent);
+	return result;
+}
+
+/*
+ * Read and validate the footer(s) of the VHD open on @fd.
+ *
+ * The primary footer is read from the end of the file.  If it is valid and
+ * describes a fixed disk, the function returns 0 at that point without
+ * writing to @footer.  Otherwise the backup copy at offset 0 is read,
+ * validated and compared with the primary; on success the resulting footer
+ * is copied to @footer.
+ *
+ * @ignore: when non-zero, a primary footer that consists solely of the
+ *          0xc7c7c7c7 fill pattern (see vhd_util_check_footer_opened) is
+ *          tolerated and the backup footer is used in its place.
+ *
+ * Returns 0 on success or a negative errno value; a diagnostic is printed
+ * for every failure.
+ */
+static int
+vhd_util_check_footer(int fd, vhd_footer_t *footer, int ignore)
+{
+ size_t size;
+ int err, opened;
+ char *msg, *buf = NULL;
+ off_t eof, off;
+ vhd_footer_t primary, backup;
+
+ memset(&primary, 0, sizeof(primary));
+ memset(&backup, 0, sizeof(backup));
+
+ /* sector-aligned bounce buffer for the footer reads */
+ err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, sizeof(primary));
+ if (err) {
+ printf("error allocating buffer: %d\n", err);
+ return -err;
+ }
+
+ memset(buf, 0, sizeof(primary));
+
+ eof = lseek(fd, 0, SEEK_END);
+ if (eof == (off_t)-1) {
+ err = -errno;
+ printf("error calculating end of file: %d\n", err);
+ goto out;
+ }
+
+ /* a file whose length is not a multiple of 512 is read as if it ended
+ * in a 511-byte footer; otherwise the last 512 bytes are the footer */
+ size = ((eof % 512) ? 511 : 512);
+ eof = lseek(fd, eof - size, SEEK_SET);
+ if (eof == (off_t)-1) {
+ err = -errno;
+ printf("error calculating end of file: %d\n", err);
+ goto out;
+ }
+
+ /* 512 bytes are requested, but only 'size' bytes remain in the file */
+ err = read(fd, buf, 512);
+ if (err != size) {
+ err = (errno ? -errno : -EIO);
+ printf("error reading primary footer: %d\n", err);
+ goto out;
+ }
+
+ /* the fill-pattern test has to see the raw bytes, so it runs before
+ * vhd_footer_in() converts the footer */
+ memcpy(&primary, buf, sizeof(primary));
+ opened = vhd_util_check_footer_opened(&primary);
+ vhd_footer_in(&primary);
+
+ msg = vhd_util_check_validate_footer(&primary);
+ if (msg) {
+ if (opened && ignore)
+ goto check_backup;
+
+ err = -EINVAL;
+ printf("primary footer invalid: %s\n", msg);
+ goto out;
+ }
+
+ /* a fixed disk is not checked against a backup footer; note that
+ * *footer is left untouched on this path */
+ if (primary.type == HD_TYPE_FIXED) {
+ err = 0;
+ goto out;
+ }
+
+check_backup:
+ /* the backup footer occupies the first 512 bytes of the file */
+ off = lseek(fd, 0, SEEK_SET);
+ if (off == (off_t)-1) {
+ err = -errno;
+ printf("error seeking to backup footer: %d\n", err);
+ goto out;
+ }
+
+ size = 512;
+ memset(buf, 0, sizeof(primary));
+
+ err = read(fd, buf, size);
+ if (err != size) {
+ err = (errno ? -errno : -EIO);
+ printf("error reading backup footer: %d\n", err);
+ goto out;
+ }
+
+ memcpy(&backup, buf, sizeof(backup));
+ vhd_footer_in(&backup);
+
+ msg = vhd_util_check_validate_footer(&backup);
+ if (msg) {
+ err = -EINVAL;
+ printf("backup footer invalid: %s\n", msg);
+ goto out;
+ }
+
+ if (memcmp(&primary, &backup, sizeof(primary))) {
+ /* fill-pattern primary being ignored: adopt the backup footer */
+ if (opened && ignore) {
+ memcpy(&primary, &backup, sizeof(primary));
+ goto ok;
+ }
+
+ /* for "tap" 0.1 / 1.1 images with 'hidden' set in the backup,
+ * retry the comparison with the backup's 'hidden' field cleared */
+ if (backup.hidden &&
+ !strncmp(backup.crtr_app, "tap", 3) &&
+ (backup.crtr_ver == VHD_VERSION(0, 1) ||
+ backup.crtr_ver == VHD_VERSION(1, 1))) {
+ char cmp, tmp = backup.hidden;
+ backup.hidden = 0;
+ cmp = memcmp(&primary, &backup, sizeof(primary));
+ backup.hidden = tmp;
+ if (!cmp)
+ goto ok;
+ }
+
+ err = -EINVAL;
+ printf("primary and backup footers do not match\n");
+ goto out;
+ }
+
+ok:
+ err = 0;
+ memcpy(footer, &primary, sizeof(primary));
+
+out:
+ free(buf);
+ return err;
+}
+
+/*
+ * Read the VHD header located at footer->data_offset and validate it.
+ *
+ * @fd:     descriptor of the VHD file
+ * @footer: footer whose data_offset gives the header position
+ *
+ * Returns 0 on success or a negative errno value; a diagnostic is printed
+ * for every failure.  Allocation failures are reported as a negative value
+ * as well, matching vhd_util_check_footer().
+ */
+static int
+vhd_util_check_header(int fd, vhd_footer_t *footer)
+{
+	int err;
+	off_t off;
+	ssize_t cnt;
+	char *msg, *buf = NULL;
+	vhd_header_t header;
+
+	/* sector-aligned bounce buffer for the header read */
+	err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, sizeof(header));
+	if (err) {
+		printf("error allocating header: %d\n", err);
+		return -err;
+	}
+
+	off = footer->data_offset;
+	off = lseek(fd, off, SEEK_SET);
+	if (off == (off_t)-1) {
+		err = -errno;
+		printf("error seeking to header: %d\n", err);
+		goto out;
+	}
+
+	/* clear errno so that a short read maps to -EIO rather than to
+	 * whatever an earlier call happened to leave behind */
+	errno = 0;
+	cnt = read(fd, buf, sizeof(header));
+	if (cnt != (ssize_t)sizeof(header)) {
+		err = (errno ? -errno : -EIO);
+		printf("error reading header: %d\n", err);
+		goto out;
+	}
+
+	memcpy(&header, buf, sizeof(header));
+	vhd_header_in(&header);
+
+	msg = vhd_util_check_validate_header(fd, &header);
+	if (msg) {
+		err = -EINVAL;
+		printf("header is invalid: %s\n", msg);
+		goto out;
+	}
+
+	err = 0;
+
+out:
+	free(buf);
+	return err;
+}
+
+/*
+ * Validate the parent-related header fields of @vhd and print a
+ * diagnostic if they are inconsistent.
+ *
+ * Returns 0 when the fields are valid, -EINVAL otherwise.
+ */
+static int
+vhd_util_check_differencing_header(vhd_context_t *vhd)
+{
+	char *reason = vhd_util_check_validate_differencing_header(vhd);
+
+	if (!reason)
+		return 0;
+
+	printf("differencing header is invalid: %s\n", reason);
+	return -EINVAL;
+}
+
+/*
+ * Validate the block allocation table of an open VHD.
+ *
+ * Every allocated block must lie between the end of the headers and the
+ * start of the footer, and no two allocated blocks may overlap.  All
+ * offsets are compared in sectors.
+ *
+ * Returns 0 when the table is consistent, otherwise a non-zero error value
+ * (-EINVAL for inconsistencies found here); a diagnostic is printed for
+ * every failure.
+ */
+static int
+vhd_util_check_bat(vhd_context_t *vhd)
+{
+ off_t eof, eoh;
+ int i, j, err, block_size;
+
+ err = vhd_seek(vhd, 0, SEEK_END);
+ if (err) {
+ printf("error calculating eof: %d\n", err);
+ return err;
+ }
+
+ eof = vhd_position(vhd);
+ if (eof == (off_t)-1) {
+ /* NOTE(review): errno is read again after printf(), which may
+ * itself have changed it */
+ printf("error calculating eof: %d\n", -errno);
+ return -errno;
+ }
+
+ /* adjust eof for vhds with short footers */
+ if (eof % 512) {
+ if (eof % 512 != 511) {
+ printf("invalid file size: 0x%"PRIx64"\n", eof);
+ return -EINVAL;
+ }
+
+ eof++;
+ }
+
+ err = vhd_get_bat(vhd);
+ if (err) {
+ printf("error reading bat: %d\n", err);
+ return err;
+ }
+
+ err = vhd_end_of_headers(vhd, &eoh);
+ if (err) {
+ printf("error calculating end of metadata: %d\n", err);
+ return err;
+ }
+
+ /* convert both limits to sectors; eof becomes the start of the footer */
+ eof -= sizeof(vhd_footer_t);
+ eof >>= VHD_SECTOR_SHIFT;
+ eoh >>= VHD_SECTOR_SHIFT;
+ /* on-disk footprint of one block in sectors: spb plus bm_secs */
+ block_size = vhd->spb + vhd->bm_secs;
+
+ for (i = 0; i < vhd->header.max_bat_size; i++) {
+ uint32_t off = vhd->bat.bat[i];
+ if (off == DD_BLK_UNUSED)
+ continue;
+
+ if (off < eoh) {
+ printf("block %d (offset 0x%x) clobbers headers\n",
+ i, off);
+ return -EINVAL;
+ }
+
+ if (off + block_size > eof) {
+ printf("block %d (offset 0x%x) clobbers footer\n",
+ i, off);
+ return -EINVAL;
+ }
+
+ /* pairwise overlap test against every other allocated block;
+ * err is known to be 0 on entry to this inner loop */
+ for (j = 0; j < vhd->header.max_bat_size; j++) {
+ uint32_t joff = vhd->bat.bat[j];
+
+ if (i == j)
+ continue;
+
+ if (joff == DD_BLK_UNUSED)
+ continue;
+
+ /* identical start offsets */
+ if (off == joff)
+ err = -EINVAL;
+
+ /* block i starts inside block j */
+ if (off > joff && off < joff + block_size)
+ err = -EINVAL;
+
+ /* block i ends inside block j */
+ if (off + block_size > joff &&
+ off + block_size < joff + block_size)
+ err = -EINVAL;
+
+ if (err) {
+ printf("block %d (offset 0x%x) clobbers "
+ "block %d (offset 0x%x)\n",
+ i, off, j, joff);
+ return err;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Validate the batmap of an open VHD against its block allocation table.
+ *
+ * After loading the BAT and the batmap and validating the batmap itself,
+ * every block the batmap flags must also be allocated in the BAT.
+ *
+ * Returns 0 when consistent, otherwise a non-zero error value (-EINVAL for
+ * inconsistencies found here); a diagnostic is printed for every failure.
+ */
+static int
+vhd_util_check_batmap(vhd_context_t *vhd)
+{
+	int rc, blk;
+	char *reason;
+
+	rc = vhd_get_bat(vhd);
+	if (rc) {
+		printf("error reading bat: %d\n", rc);
+		return rc;
+	}
+
+	rc = vhd_get_batmap(vhd);
+	if (rc) {
+		printf("error reading batmap: %d\n", rc);
+		return rc;
+	}
+
+	reason = vhd_util_check_validate_batmap(vhd, &vhd->batmap);
+	if (reason) {
+		printf("batmap is invalid: %s\n", reason);
+		return -EINVAL;
+	}
+
+	for (blk = 0; blk < vhd->header.max_bat_size; blk++) {
+		/* a flagged block with no BAT entry is a contradiction */
+		if (vhd_batmap_test(vhd, &vhd->batmap, blk) &&
+		    vhd->bat.bat[blk] == DD_BLK_UNUSED) {
+			printf("batmap shows unallocated block %d full\n", blk);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Validate all parent locator entries in the header of a differencing VHD.
+ *
+ * Each entry is validated structurally, and each platform code may appear
+ * at most once.  For MACX, W2RU and W2KU entries the stored path is read,
+ * its basename is compared with the parent name from the header, the path
+ * is resolved with vhd_find_parent() and the resolved parent is validated.
+ * At least one such entry must pass all of these checks.
+ *
+ * Returns 0 on success, otherwise a non-zero error value; a diagnostic is
+ * printed for every failure.
+ */
+static int
+vhd_util_check_parent_locators(vhd_context_t *vhd)
+{
+ int i, n, err;
+ vhd_parent_locator_t *loc;
+ char *file, *ppath, *location, *pname;
+ const char *msg;
+ int mac, macx, w2ku, w2ru, wi2r, wi2k, found;
+
+ /* per-platform-code occurrence counters, used to detect duplicates */
+ mac = 0;
+ macx = 0;
+ w2ku = 0;
+ w2ru = 0;
+ wi2r = 0;
+ wi2k = 0;
+ found = 0;
+ pname = NULL;
+ ppath = NULL;
+ location = NULL;
+
+ err = vhd_header_decode_parent(vhd, &vhd->header, &pname);
+ if (err) {
+ printf("error decoding parent name: %d\n", err);
+ return err;
+ }
+
+ n = sizeof(vhd->header.loc) / sizeof(vhd->header.loc[0]);
+ for (i = 0; i < n; i++) {
+ ppath = NULL;
+ location = NULL;
+ loc = vhd->header.loc + i;
+
+ msg = vhd_util_check_validate_parent_locator(vhd, loc);
+ if (msg) {
+ err = -EINVAL;
+ printf("invalid parent locator %d: %s\n", i, msg);
+ goto out;
+ }
+
+ if (loc->code == PLAT_CODE_NONE)
+ continue;
+
+ /* post-increment: the first occurrence of a code tests as 0, any
+ * later occurrence jumps to the duplicate handler */
+ switch (loc->code) {
+ case PLAT_CODE_MACX:
+ if (macx++)
+ goto dup;
+ break;
+
+ case PLAT_CODE_MAC:
+ if (mac++)
+ goto dup;
+ break;
+
+ case PLAT_CODE_W2KU:
+ if (w2ku++)
+ goto dup;
+ break;
+
+ case PLAT_CODE_W2RU:
+ if (w2ru++)
+ goto dup;
+ break;
+
+ case PLAT_CODE_WI2R:
+ if (wi2r++)
+ goto dup;
+ break;
+
+ case PLAT_CODE_WI2K:
+ if (wi2k++)
+ goto dup;
+ break;
+
+ default:
+ err = -EINVAL;
+ printf("invalid platform code for locator %d\n", i);
+ goto out;
+ }
+
+ /* only MACX, W2RU and W2KU entries are resolved to a parent file */
+ if (loc->code != PLAT_CODE_MACX &&
+ loc->code != PLAT_CODE_W2RU &&
+ loc->code != PLAT_CODE_W2KU)
+ continue;
+
+ err = vhd_parent_locator_read(vhd, loc, &ppath);
+ if (err) {
+ printf("error reading parent locator %d: %d\n", i, err);
+ goto out;
+ }
+
+ /* 'file' is the result of basename(ppath) and is not freed
+ * separately */
+ file = basename(ppath);
+ if (strcmp(pname, file)) {
+ err = -EINVAL;
+ printf("parent locator %d name (%s) does not match "
+ "header name (%s)\n", i, file, pname);
+ goto out;
+ }
+
+ err = vhd_find_parent(vhd, ppath, &location);
+ if (err) {
+ printf("error resolving %s: %d\n", ppath, err);
+ goto out;
+ }
+
+ /* an unreadable resolved path is only fatal here for MACX entries */
+ err = access(location, R_OK);
+ if (err && loc->code == PLAT_CODE_MACX) {
+ err = -errno;
+ printf("parent locator %d points to missing file %s "
+ "(resolved to %s)\n", i, ppath, location);
+ goto out;
+ }
+
+ msg = vhd_util_check_validate_parent(vhd, location);
+ if (msg) {
+ err = -EINVAL;
+ printf("invalid parent %s: %s\n", location, msg);
+ goto out;
+ }
+
+ found++;
+ free(ppath);
+ free(location);
+ ppath = NULL;
+ location = NULL;
+
+ continue;
+
+ /* reached only via goto from the switch above */
+ dup:
+ printf("duplicate platform code in locator %d: 0x%x\n",
+ i, loc->code);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!found) {
+ err = -EINVAL;
+ printf("could not find parent %s\n", pname);
+ goto out;
+ }
+
+ err = 0;
+
+out:
+ /* ppath and location are NULL unless an iteration bailed out early */
+ free(pname);
+ free(ppath);
+ free(location);
+ return err;
+}
+
+/*
+ * Announce that @name appears invalid and dump its metadata by invoking
+ * vhd_util_read() with the arguments "read -p -n <name>".
+ */
+static void
+vhd_util_dump_headers(const char *name)
+{
+	char *args[4];
+
+	args[0] = "read";
+	args[1] = "-p";
+	args[2] = "-n";
+	args[3] = (char *)name;
+
+	printf("%s appears invalid; dumping metadata\n", name);
+	vhd_util_read(sizeof(args) / sizeof(args[0]), args);
+}
+
+/*
+ * Run every consistency check on a single VHD file.
+ *
+ * @name:   path of a regular file or block device holding the VHD
+ * @ignore: passed through to vhd_util_check_footer()
+ *
+ * The footer is checked first.  Unless the footer type is dynamic or
+ * differencing the function stops there (returning the footer result
+ * without printing the "is valid" message).  Otherwise the header, the
+ * differencing fields, the BAT, the batmap (if present) and, for
+ * differencing disks, the parent locators are checked in turn.
+ *
+ * Returns 0 on success or a non-zero error value.  On any failure after
+ * the file was opened, the VHD metadata is dumped.
+ */
+static int
+vhd_util_check_vhd(const char *name, int ignore)
+{
+	int fd, err;
+	vhd_context_t vhd;
+	struct stat stats;
+	vhd_footer_t footer;
+
+	fd = -1;
+	memset(&vhd, 0, sizeof(vhd));
+	memset(&footer, 0, sizeof(footer));
+
+	err = stat(name, &stats);
+	if (err == -1) {
+		/* capture errno before printf() gets a chance to change it */
+		err = -errno;
+		printf("cannot stat %s: %d\n", name, -err);
+		return err;
+	}
+
+	if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
+		printf("%s is not a regular file or block device\n", name);
+		return -EINVAL;
+	}
+
+	fd = open(name, O_RDONLY | O_DIRECT | O_LARGEFILE);
+	if (fd == -1) {
+		/* capture errno before printf() gets a chance to change it */
+		err = -errno;
+		printf("error opening %s\n", name);
+		return err;
+	}
+
+	err = vhd_util_check_footer(fd, &footer, ignore);
+	if (err)
+		goto out;
+
+	/* only dynamic and differencing disks have further metadata to check */
+	if (footer.type != HD_TYPE_DYNAMIC && footer.type != HD_TYPE_DIFF)
+		goto out;
+
+	err = vhd_util_check_header(fd, &footer);
+	if (err)
+		goto out;
+
+	err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+	if (err)
+		goto out;
+
+	err = vhd_util_check_differencing_header(&vhd);
+	if (err)
+		goto out;
+
+	err = vhd_util_check_bat(&vhd);
+	if (err)
+		goto out;
+
+	if (vhd_has_batmap(&vhd)) {
+		err = vhd_util_check_batmap(&vhd);
+		if (err)
+			goto out;
+	}
+
+	if (vhd.footer.type == HD_TYPE_DIFF) {
+		err = vhd_util_check_parent_locators(&vhd);
+		if (err)
+			goto out;
+	}
+
+	err = 0;
+	printf("%s is valid\n", name);
+
+out:
+	if (err)
+		vhd_util_dump_headers(name);
+	if (fd != -1)
+		close(fd);
+	/* vhd was zeroed above, so this is also reached when vhd_open() was
+	 * never called */
+	vhd_close(&vhd);
+	return err;
+}
+
+/*
+ * Walk the parent chain of @name and run vhd_util_check_vhd() on each
+ * ancestor.
+ *
+ * The walk ends at the first image that is not a differencing disk or
+ * whose parent is raw, or at the first error.  @ignore is passed through
+ * to vhd_util_check_vhd().
+ *
+ * Returns 0 on success, otherwise the first error encountered.
+ */
+static int
+vhd_util_check_parents(const char *name, int ignore)
+{
+ int err;
+ vhd_context_t vhd;
+ char *cur, *parent;
+
+ /* cur aliases the caller's string at first; every later value is a
+ * string obtained from vhd_parent_locator_get() that this function
+ * frees */
+ cur = (char *)name;
+
+ for (;;) {
+ err = vhd_open(&vhd, cur,
+ VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+ if (err)
+ goto out;
+
+ /* end of the chain: err is 0 here, so this is a clean exit */
+ if (vhd.footer.type != HD_TYPE_DIFF || vhd_parent_raw(&vhd)) {
+ vhd_close(&vhd);
+ goto out;
+ }
+
+ err = vhd_parent_locator_get(&vhd, &parent);
+ vhd_close(&vhd);
+
+ if (err) {
+ printf("error getting parent: %d\n", err);
+ goto out;
+ }
+
+ /* never free the caller-owned name */
+ if (cur != name)
+ free(cur);
+ cur = parent;
+
+ err = vhd_util_check_vhd(cur, ignore);
+ if (err)
+ goto out;
+ }
+
+out:
+ if (err)
+ printf("error checking parents: %d\n", err);
+ if (cur != name)
+ free(cur);
+ return err;
+}
+
+/*
+ * Entry point of the "check" command.
+ *
+ * Options:
+ *   -n <file>  VHD to check (required)
+ *   -i         ignore missing primary footers
+ *   -p         also check the chain of parents
+ *   -h         print usage (returns 0)
+ *
+ * Returns 0 when all requested checks pass, -EINVAL on a usage error, or
+ * the error reported by the failing check.
+ */
+int
+vhd_util_check(int argc, char **argv)
+{
+	int opt, rc;
+	int ignore = 0, parents = 0;
+	char *name = NULL;
+
+	if (!argc || !argv) {
+		rc = -EINVAL;
+		goto usage;
+	}
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "n:iph")) != -1) {
+		if (opt == 'n') {
+			name = optarg;
+		} else if (opt == 'i') {
+			ignore = 1;
+		} else if (opt == 'p') {
+			parents = 1;
+		} else {
+			/* -h is a successful request for help; anything
+			 * else is a usage error */
+			rc = (opt == 'h') ? 0 : -EINVAL;
+			goto usage;
+		}
+	}
+
+	if (!name || optind != argc) {
+		rc = -EINVAL;
+		goto usage;
+	}
+
+	rc = vhd_util_check_vhd(name, ignore);
+	if (!rc && parents)
+		rc = vhd_util_check_parents(name, ignore);
+
+	return rc;
+
+usage:
+	printf("options: -n <file> [-i ignore missing primary footers] "
+	       "[-p check parents] [-h help]\n");
+	return rc;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Write @secs sectors from @buf to a raw parent at sector @sec.
+ *
+ * @fd:   descriptor of the raw parent
+ * @buf:  data to write
+ * @sec:  first sector to write
+ * @secs: number of sectors to write
+ *
+ * Returns 0 when the whole range was written, a negative errno value if
+ * the seek or write failed, or -EIO for a short write that left errno
+ * unset.  A diagnostic is printed for every failure.
+ */
+static int
+__raw_io_write(int fd, char* buf, uint64_t sec, uint32_t secs)
+{
+	int saved;
+	off_t off;
+	ssize_t ret;
+	uint64_t pos, len;
+
+	pos = vhd_sectors_to_bytes(sec);
+	len = vhd_sectors_to_bytes(secs);
+
+	errno = 0;
+	off = lseek(fd, pos, SEEK_SET);
+	if (off == (off_t)-1) {
+		/* capture errno before printf() gets a chance to change it */
+		saved = errno;
+		printf("raw parent: seek(0x%08"PRIx64") failed: %d\n",
+		       pos, -saved);
+		return -saved;
+	}
+
+	/* write() returns a signed count; keep it signed so that -1 is not
+	 * mistaken for a huge byte count and %zd prints it correctly */
+	ret = write(fd, buf, len);
+	if (ret >= 0 && (uint64_t)ret == len)
+		return 0;
+
+	saved = errno;
+	printf("raw parent: write of 0x%"PRIx64" returned %zd, errno: %d\n",
+	       len, ret, -saved);
+	return (saved ? -saved : -EIO);
+}
+
+/*
+ * Copy the data of one block of @vhd into its parent.
+ *
+ * Use 'parent' if the parent is VHD, and 'parent_fd' if the parent is raw
+ * (the choice is made by testing parent->file).
+ *
+ * Blocks without a BAT entry are skipped.  If the batmap flags the block,
+ * the whole block is written in one go; otherwise the block's bitmap is
+ * read and only the runs of sectors whose bits are set are written.
+ *
+ * Returns 0 on success, otherwise a non-zero error value.
+ */
+static int
+vhd_util_coalesce_block(vhd_context_t *vhd, vhd_context_t *parent,
+ int parent_fd, uint64_t block)
+{
+ int i, err;
+ char *buf, *map;
+ uint64_t sec, secs;
+
+ buf = NULL;
+ map = NULL;
+ /* first sector covered by this block */
+ sec = block * vhd->spb;
+
+ if (vhd->bat.bat[block] == DD_BLK_UNUSED)
+ return 0;
+
+ err = posix_memalign((void **)&buf, 4096, vhd->header.block_size);
+ if (err)
+ return -err;
+
+ err = vhd_io_read(vhd, buf, sec, vhd->spb);
+ if (err)
+ goto done;
+
+ /* batmap bit set: write the whole block without consulting the bitmap */
+ if (vhd_has_batmap(vhd) && vhd_batmap_test(vhd, &vhd->batmap, block)) {
+ if (parent->file)
+ err = vhd_io_write(parent, buf, sec, vhd->spb);
+ else
+ err = __raw_io_write(parent_fd, buf, sec, vhd->spb);
+ goto done;
+ }
+
+ err = vhd_read_bitmap(vhd, block, &map);
+ if (err)
+ goto done;
+
+ for (i = 0; i < vhd->spb; i++) {
+ if (!vhd_bitmap_test(vhd, map, i))
+ continue;
+
+ /* measure the run of consecutive set bits starting at sector i */
+ for (secs = 0; i + secs < vhd->spb; secs++)
+ if (!vhd_bitmap_test(vhd, map, i + secs))
+ break;
+
+ if (parent->file)
+ err = vhd_io_write(parent,
+ buf + vhd_sectors_to_bytes(i),
+ sec + i, secs);
+ else
+ err = __raw_io_write(parent_fd,
+ buf + vhd_sectors_to_bytes(i),
+ sec + i, secs);
+ if (err)
+ goto done;
+
+ /* skip the run just written; the loop's i++ then also steps over
+ * the sector that ended the run, which is clear or past the end
+ * of the block */
+ i += secs;
+ }
+
+ err = 0;
+
+done:
+ free(buf);
+ free(map);
+ return err;
+}
+
+/*
+ * Entry point of the "coalesce" command: copy every allocated block of a
+ * VHD into its parent.
+ *
+ * Options:
+ *   -n <name>  VHD whose data is merged into its parent (required)
+ *   -h         print usage
+ *
+ * The parent is opened as a plain file when vhd_parent_raw() reports a raw
+ * parent, and as a VHD otherwise.
+ *
+ * Returns 0 on success, -EINVAL on a usage error (including -h), or the
+ * error reported by the failing step.
+ */
+int
+vhd_util_coalesce(int argc, char **argv)
+{
+	int err, c;
+	uint64_t i;
+	char *name, *pname;
+	vhd_context_t vhd, parent;
+	int parent_fd = -1;
+
+	name        = NULL;
+	pname       = NULL;
+	/* parent.file doubles as the "parent is a VHD" flag further down */
+	parent.file = NULL;
+
+	if (!argc || !argv)
+		goto usage;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, "n:h")) != -1) {
+		switch (c) {
+		case 'n':
+			name = optarg;
+			break;
+		case 'h':
+		default:
+			goto usage;
+		}
+	}
+
+	if (!name || optind != argc)
+		goto usage;
+
+	err = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+	if (err) {
+		printf("error opening %s: %d\n", name, err);
+		return err;
+	}
+
+	err = vhd_parent_locator_get(&vhd, &pname);
+	if (err) {
+		printf("error finding %s parent: %d\n", name, err);
+		vhd_close(&vhd);
+		return err;
+	}
+
+	if (vhd_parent_raw(&vhd)) {
+		parent_fd = open(pname, O_RDWR | O_DIRECT | O_LARGEFILE, 0644);
+		if (parent_fd == -1) {
+			err = -errno;
+			printf("failed to open parent %s: %d\n", pname, err);
+			/* pname is ours to release on this path as well */
+			free(pname);
+			vhd_close(&vhd);
+			return err;
+		}
+	} else {
+		err = vhd_open(&parent, pname, VHD_OPEN_RDWR);
+		if (err) {
+			printf("error opening %s: %d\n", pname, err);
+			free(pname);
+			vhd_close(&vhd);
+			return err;
+		}
+	}
+
+	err = vhd_get_bat(&vhd);
+	if (err)
+		goto done;
+
+	if (vhd_has_batmap(&vhd)) {
+		err = vhd_get_batmap(&vhd);
+		if (err)
+			goto done;
+	}
+
+	for (i = 0; i < vhd.bat.entries; i++) {
+		err = vhd_util_coalesce_block(&vhd, &parent, parent_fd, i);
+		if (err)
+			goto done;
+	}
+
+	err = 0;
+
+ done:
+	free(pname);
+	vhd_close(&vhd);
+	if (parent.file)
+		vhd_close(&parent);
+	else
+		close(parent_fd);
+	return err;
+
+usage:
+	printf("options: <-n name> [-h help]\n");
+	return -EINVAL;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Entry point of the "create" command.
+ *
+ * Options:
+ *   -n <name>  file to create (required)
+ *   -s <size>  size in MB, parsed as a decimal number (required)
+ *   -r         create a fixed (HD_TYPE_FIXED) disk instead of the default
+ *              dynamic (HD_TYPE_DYNAMIC) one
+ *   -h         print usage
+ *
+ * Returns the result of vhd_create(), or -EINVAL on a usage error
+ * (including -h).
+ */
+int
+vhd_util_create(int argc, char **argv)
+{
+	int opt;
+	int have_size = 0, fixed = 0;
+	uint64_t mbytes = 0;
+	char *name = NULL;
+	vhd_flag_creat_t flags = 0;
+
+	if (!argc || !argv)
+		goto usage;
+
+	optind = 0;
+	for (;;) {
+		opt = getopt(argc, argv, "n:s:rh");
+		if (opt == -1)
+			break;
+
+		if (opt == 'n') {
+			name = optarg;
+		} else if (opt == 's') {
+			mbytes = strtoull(optarg, NULL, 10);
+			have_size = 1;
+		} else if (opt == 'r') {
+			fixed = 1;
+		} else {
+			goto usage;
+		}
+	}
+
+	if (!have_size || !name || optind != argc)
+		goto usage;
+
+	/* megabytes to bytes */
+	return vhd_create(name, mbytes << 20,
+			  (fixed ? HD_TYPE_FIXED : HD_TYPE_DYNAMIC),
+			  flags);
+
+usage:
+	printf("options: <-n name> <-s size (MB)> [-r reserve] [-h help]\n");
+	return -EINVAL;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Entry point of the "fill" command: read every block of a VHD through
+ * vhd_io_read() and write the same data back through vhd_io_write().
+ *
+ * Options:
+ *   -n <name>  VHD to fill (required)
+ *   -h         print usage
+ *
+ * Returns 0 on success, -EINVAL on a usage error (including -h), or the
+ * error reported by the failing step.
+ */
+int
+vhd_util_fill(int argc, char **argv)
+{
+	int opt, rc;
+	vhd_context_t vhd;
+	char *name = NULL, *block = NULL;
+	uint64_t blk, per_block;
+
+	if (!argc || !argv)
+		goto usage;
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "n:h")) != -1) {
+		if (opt != 'n')
+			goto usage;
+		name = optarg;
+	}
+
+	if (!name || optind != argc)
+		goto usage;
+
+	rc = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+	if (rc) {
+		printf("error opening %s: %d\n", name, rc);
+		return rc;
+	}
+
+	rc = vhd_get_bat(&vhd);
+	if (rc)
+		goto done;
+
+	rc = posix_memalign((void **)&block, 4096, vhd.header.block_size);
+	if (rc) {
+		rc = -rc;
+		goto done;
+	}
+
+	/* sectors per block */
+	per_block = vhd.header.block_size >> VHD_SECTOR_SHIFT;
+
+	for (blk = 0; blk < vhd.header.max_bat_size; blk++) {
+		rc = vhd_io_read(&vhd, block, blk * per_block, per_block);
+		if (rc)
+			break;
+
+		rc = vhd_io_write(&vhd, block, blk * per_block, per_block);
+		if (rc)
+			break;
+	}
+
+ done:
+	free(block);
+	vhd_close(&vhd);
+	return rc;
+
+usage:
+	printf("options: <-n name> [-h help]\n");
+	return -EINVAL;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Altering operations:
+ *
+ * 1. Change the parent pointer to another file.
+ * 2. Change the size of the file containing the VHD image. This does NOT
+ * affect the VHD disk capacity, only the physical size of the file containing
+ * the VHD. Naturally, it is not possible to set the file size to be less than
+ * what the VHD utilizes.
+ * The operation doesn't actually change the file size, but it writes the
+ * footer in the right location such that resizing the file (manually, as a
+ * separate step) will produce the correct results. If the new file size is
+ * greater than the current file size, the file must first be expanded and then
+ * altered with this operation. If the new size is smaller than the current
+ * size, the VHD must first be altered with this operation and then the file
+ * must be shrunk. Failing to resize the file will result in a corrupted VHD.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/* NOTE(review): presumably declares the external state consumed by the
+ * TEST_FAIL_AT() hooks used in vhd_util_modify(); the macro is defined
+ * outside this file -- confirm against libvhd.h. */
+TEST_FAIL_EXTERN_VARS;
+
+/*
+ * Entry point of the "modify" command.
+ *
+ * Options:
+ *   -n <name>        VHD to modify (required)
+ *   -s <NEW_SIZE>    set the physical size (decimal number)
+ *   -p <NEW_PARENT>  set the parent
+ *   -m               the new parent given with -p is raw
+ *   -h               print usage
+ *
+ * When both -s and -p are given, the size is set first; if that fails the
+ * parent is left untouched so that the failure is reported instead of
+ * being overwritten by the result of the reparent step.
+ *
+ * Returns 0 on success, -EINVAL on a usage error (including -h), or the
+ * error reported by the failing step.
+ */
+int
+vhd_util_modify(int argc, char **argv)
+{
+	char *name;
+	vhd_context_t vhd;
+	int err, c, size, parent, parent_raw;
+	off_t newsize = 0;
+	char *newparent = NULL;
+	char *end;
+
+	name       = NULL;
+	size       = 0;
+	parent     = 0;
+	parent_raw = 0;
+
+	/* same guard as the other vhd-util commands */
+	if (!argc || !argv)
+		goto usage;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, "n:s:p:mh")) != -1) {
+		switch (c) {
+		case 'n':
+			name = optarg;
+			break;
+		case 's':
+			size = 1;
+			errno = 0;
+			newsize = strtoll(optarg, &end, 10);
+			/* reject out-of-range values, empty input and
+			 * trailing junk */
+			if (errno || end == optarg || *end != '\0') {
+				fprintf(stderr, "Invalid size '%s'\n", optarg);
+				goto usage;
+			}
+			break;
+		case 'p':
+			parent = 1;
+			newparent = optarg;
+			break;
+		case 'm':
+			parent_raw = 1;
+			break;
+
+		case 'h':
+		default:
+			goto usage;
+		}
+	}
+
+	if (!name || optind != argc)
+		goto usage;
+
+	err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+	if (err) {
+		printf("error opening %s: %d\n", name, err);
+		return err;
+	}
+
+	if (size) {
+		err = vhd_set_phys_size(&vhd, newsize);
+		if (err) {
+			printf("failed to set physical size to %"PRIu64":"
+			       " %d\n", (uint64_t)newsize, err);
+			goto done;
+		}
+	}
+
+	if (parent) {
+		TEST_FAIL_AT(FAIL_REPARENT_BEGIN);
+		err = vhd_change_parent(&vhd, newparent, parent_raw);
+		if (err) {
+			printf("failed to set parent to '%s': %d\n",
+			       newparent, err);
+			goto done;
+		}
+		TEST_FAIL_AT(FAIL_REPARENT_END);
+	}
+
+done:
+	vhd_close(&vhd);
+	return err;
+
+usage:
+	printf("*** Dangerous operations, use with care ***\n");
+	printf("options: <-n name> [-p NEW_PARENT set parent [-m raw]] "
+	       "[-s NEW_SIZE set size] [-h help]\n");
+	return -EINVAL;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * vhd-util "query" entry point: prints selected properties of a VHD.
+ *
+ * Options (parsed with getopt from argv):
+ *   -n NAME  VHD file to inspect (required)
+ *   -v       print footer.curr_size >> 20 (virtual size in MB)
+ *   -s       print the value reported by vhd_get_phys_size()
+ *   -p       print the parent name, or a note that there is none
+ *   -f       print the 'hidden' field
+ *   -d       print the chain depth
+ *   -h       print usage (returns 0)
+ *
+ * The file is opened read-only with VHD_OPEN_IGNORE_DISABLED.  Every
+ * requested query is attempted; the first non-zero error code is returned.
+ * Bad usage returns -EINVAL.
+ */
+int
+vhd_util_query(int argc, char **argv)
+{
+	char *name;
+	vhd_context_t vhd;
+	off_t currsize;
+	int ret, err, c, size, physize, parent, fields, depth;
+
+	name    = NULL;
+	size    = 0;
+	physize = 0;
+	parent  = 0;
+	fields  = 0;
+	depth   = 0;
+
+	if (!argc || !argv) {
+		err = -EINVAL;
+		goto usage;
+	}
+
+	optind = 0;
+	while ((c = getopt(argc, argv, "n:vspfdh")) != -1) {
+		switch (c) {
+		case 'n':
+			name = optarg;
+			break;
+		case 'v':
+			size = 1;
+			break;
+		case 's':
+			physize = 1;
+			break;
+		case 'p':
+			parent = 1;
+			break;
+		case 'f':
+			fields = 1;
+			break;
+		case 'd':
+			depth = 1;
+			break;
+		case 'h':
+			err = 0;
+			goto usage;
+		default:
+			err = -EINVAL;
+			goto usage;
+		}
+	}
+
+	if (!name || optind != argc) {
+		err = -EINVAL;
+		goto usage;
+	}
+
+	err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+	if (err) {
+		printf("error opening %s: %d\n", name, err);
+		return err;
+	}
+
+	if (size)
+		printf("%"PRIu64"\n", vhd.footer.curr_size >> 20);
+
+	if (physize) {
+		err = vhd_get_phys_size(&vhd, &currsize);
+		if (err)
+			printf("failed to get physical size: %d\n", err);
+		else
+			/* off_t is not guaranteed to match PRIu64: cast */
+			printf("%"PRIu64"\n", (uint64_t)currsize);
+	}
+
+	if (parent) {
+		ret = 0;
+
+		if (vhd.footer.type != HD_TYPE_DIFF)
+			printf("%s has no parent\n", name);
+		else {
+			char *pname;
+
+			ret = vhd_parent_locator_get(&vhd, &pname);
+			if (ret)
+				printf("query failed\n");
+			else {
+				printf("%s\n", pname);
+				free(pname);
+			}
+		}
+
+		/* keep the earliest error, if any */
+		err = (err ? : ret);
+	}
+
+	if (fields) {
+		int hidden;
+
+		ret = vhd_hidden(&vhd, &hidden);
+		if (ret)
+			printf("error checking 'hidden' field: %d\n", ret);
+		else
+			printf("hidden: %d\n", hidden);
+
+		err = (err ? : ret);
+	}
+
+	if (depth) {
+		int length;
+
+		ret = vhd_chain_depth(&vhd, &length);
+		if (ret)
+			printf("error checking chain depth: %d\n", ret);
+		else
+			printf("chain depth: %d\n", length);
+
+		err = (err ? : ret);
+	}
+
+	vhd_close(&vhd);
+	return err;
+
+usage:
+	printf("options: <-n name> [-v print virtual size (in MB)] "
+	       "[-s print physical utilization (bytes)] [-p print parent] "
+	       "[-f print fields] [-d print chain depth] [-h help]\n");
+	return err;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+/*
+ * Scratch buffer used when formatting 64-bit numbers as text.  A uint64_t
+ * needs up to 20 decimal digits, or "0x" plus 16 hex digits, plus the
+ * terminating NUL; 24 bytes holds either form without truncation.
+ */
+#define nsize 24
+static char nbuf[nsize];
+
+/*
+ * Format num in hexadecimal ("%#" PRIx64) into the file-static buffer nbuf
+ * and return that buffer.  The text is overwritten by the next call that
+ * writes to nbuf.
+ */
+static inline char *
+__xconv(uint64_t num)
+{
+	char *out = nbuf;
+
+	snprintf(out, sizeof(nbuf), "%#" PRIx64, num);
+	return out;
+}
+
+/*
+ * Format num in decimal ("%" PRIu64) into the file-static buffer nbuf and
+ * return that buffer.  The text is overwritten by the next call that
+ * writes to nbuf.
+ */
+static inline char *
+__dconv(uint64_t num)
+{
+	char *out = nbuf;
+
+	snprintf(out, sizeof(nbuf), "%" PRIu64, num);
+	return out;
+}
+
+/*
+ * Render num as text: hexadecimal when hex is non-zero, decimal otherwise.
+ * Both branches return the same static buffer, so use at most one conv()
+ * result per printf() call.  Arguments are parenthesized so that compound
+ * expressions are converted as a whole.
+ */
+#define conv(hex, num) \
+	((hex) ? __xconv((uint64_t)(num)) : __dconv((uint64_t)(num)))
+
+/*
+ * Print a human-readable summary of the dynamic-disk header h to stdout.
+ *
+ * vhd: context handed to vhd_header_decode_parent() to decode the parent
+ *      name
+ * h:   header to print
+ * hex: non-zero to print numeric fields in hexadecimal
+ *
+ * The stored checksum is printed next to a freshly computed one.
+ */
+static void
+vhd_print_header(vhd_context_t *vhd, vhd_header_t *h, int hex)
+{
+	int err;
+	uint32_t cksm;
+	char uuid[39], time_str[26], cookie[9], *name;
+
+	/* stays NULL if decoding fails, so the free() below is always safe */
+	name = NULL;
+
+	printf("VHD Header Summary:\n-------------------\n");
+
+	/* the cookie field may lack a terminating NUL: bound the read */
+	snprintf(cookie, sizeof(cookie), "%.*s",
+		 (int)(sizeof(cookie) - 1), h->cookie);
+	printf("Cookie : %s\n", cookie);
+
+	printf("Data offset (unusd) : %s\n", conv(hex, h->data_offset));
+	printf("Table offset : %s\n", conv(hex, h->table_offset));
+	printf("Header version : 0x%08x\n", h->hdr_ver);
+	printf("Max BAT size : %s\n", conv(hex, h->max_bat_size));
+	/* conv() reuses one static buffer: one use per printf() */
+	printf("Block size : %s ", conv(hex, h->block_size));
+	printf("(%s MB)\n", conv(hex, h->block_size >> 20));
+
+	err = vhd_header_decode_parent(vhd, h, &name);
+	printf("Parent name : %s\n",
+	       (err ? "failed to read name" : name));
+	free(name);
+
+	vhd_uuid_to_string(&h->prt_uuid, uuid, sizeof(uuid));
+	printf("Parent UUID : %s\n", uuid);
+
+	vhd_time_to_string(h->prt_ts, time_str);
+	printf("Parent timestamp : %s\n", time_str);
+
+	cksm = vhd_checksum_header(h);
+	printf("Checksum : 0x%x|0x%x (%s)\n", h->checksum, cksm,
+	       h->checksum == cksm ? "Good!" : "Bad!");
+	printf("\n");
+}
+
+/*
+ * Print a human-readable summary of the VHD footer f to stdout.
+ *
+ * f:   footer to print
+ * hex: non-zero to print numeric fields in hexadecimal
+ *
+ * Version fields are split into a high and a low 16-bit half, the geometry
+ * word is unpacked into cylinders/heads/sectors, and the stored checksum is
+ * printed next to a freshly computed one.
+ */
+static void
+vhd_print_footer(vhd_footer_t *f, int hex)
+{
+	uint64_t c, h, s;
+	uint32_t ff_maj, ff_min, cr_maj, cr_min, cksm;
+	char time_str[26], creator[5], uuid[39], cookie[9];
+
+	printf("VHD Footer Summary:\n-------------------\n");
+
+	/* the cookie field may lack a terminating NUL: bound the read */
+	snprintf(cookie, sizeof(cookie), "%.*s",
+		 (int)(sizeof(cookie) - 1), f->cookie);
+	printf("Cookie : %s\n", cookie);
+
+	printf("Features : (0x%08x) %s%s\n", f->features,
+	       (f->features & HD_TEMPORARY) ? "<TEMP>" : "",
+	       (f->features & HD_RESERVED) ? "<RESV>" : "");
+
+	ff_maj = f->ff_version >> 16;
+	ff_min = f->ff_version & 0xffff;
+	printf("File format version : Major: %u, Minor: %u\n",
+	       ff_maj, ff_min);
+
+	printf("Data offset : %s\n", conv(hex, f->data_offset));
+
+	vhd_time_to_string(f->timestamp, time_str);
+	printf("Timestamp : %s\n", time_str);
+
+	memcpy(creator, f->crtr_app, 4);
+	creator[4] = '\0';
+	printf("Creator Application : '%s'\n", creator);
+
+	cr_maj = f->crtr_ver >> 16;
+	cr_min = f->crtr_ver & 0xffff;
+	printf("Creator version : Major: %u, Minor: %u\n",
+	       cr_maj, cr_min);
+
+	printf("Creator OS : %s\n",
+	       ((f->crtr_os == HD_CR_OS_WINDOWS) ? "Windows" :
+		((f->crtr_os == HD_CR_OS_MACINTOSH) ? "Macintosh" :
+		 "Unknown!")));
+
+	/* conv() reuses one static buffer: one use per printf() */
+	printf("Original disk size : %s MB ", conv(hex, f->orig_size >> 20));
+	printf("(%s Bytes)\n", conv(hex, f->orig_size));
+
+	printf("Current disk size : %s MB ", conv(hex, f->curr_size >> 20));
+	printf("(%s Bytes)\n", conv(hex, f->curr_size));
+
+	c = f->geometry >> 16;
+	h = (f->geometry & 0x0000FF00) >> 8;
+	s = f->geometry & 0x000000FF;
+	printf("Geometry : Cyl: %s, ", conv(hex, c));
+	printf("Hds: %s, ", conv(hex, h));
+	printf("Sctrs: %s\n", conv(hex, s));
+	printf(" : = %s MB ", conv(hex, (c * h * s) >> 11));
+	printf("(%s Bytes)\n", conv(hex, c * h * s << 9));
+
+	/* the format already ends the line: no newline in the fallback */
+	printf("Disk type : %s\n",
+	       f->type <= HD_TYPE_MAX ?
+	       HD_TYPE_STR[f->type] : "Unknown type!");
+
+	cksm = vhd_checksum_footer(f);
+	printf("Checksum : 0x%x|0x%x (%s)\n", f->checksum, cksm,
+	       f->checksum == cksm ? "Good!" : "Bad!");
+
+	vhd_uuid_to_string(&f->uuid, uuid, sizeof(uuid));
+	printf("UUID : %s\n", uuid);
+
+	printf("Saved state : %s\n", f->saved == 0 ? "No" : "Yes");
+	printf("Hidden : %d\n", f->hidden);
+	printf("\n");
+}
+
+/*
+ * Map a parent-locator platform code to its symbolic name.  Returns a
+ * pointer to a string literal (do not modify or free it); unrecognized
+ * codes yield "UNKNOWN".
+ */
+static inline char *
+code_name(uint32_t code)
+{
+	switch(code) {
+	case PLAT_CODE_NONE:
+		return "PLAT_CODE_NONE";
+	case PLAT_CODE_WI2R:
+		return "PLAT_CODE_WI2R";
+	case PLAT_CODE_WI2K:
+		return "PLAT_CODE_WI2K";
+	case PLAT_CODE_W2RU:
+		return "PLAT_CODE_W2RU";
+	case PLAT_CODE_W2KU:
+		return "PLAT_CODE_W2KU";
+	case PLAT_CODE_MAC:
+		return "PLAT_CODE_MAC";
+	case PLAT_CODE_MACX:
+		return "PLAT_CODE_MACX";
+	default:
+		return "UNKNOWN";
+	}
+}
+
+/*
+ * Read the parent name referenced by locator loc and print it to stdout.
+ * On read failure a short message is printed instead.
+ */
+static void
+vhd_print_parent(vhd_context_t *vhd, vhd_parent_locator_t *loc)
+{
+	int err;
+	char *buf;
+
+	buf = NULL;
+
+	err = vhd_parent_locator_read(vhd, loc, &buf);
+	if (err) {
+		printf("failed to read parent name\n");
+		return;
+	}
+
+	printf(" decoded name : %s\n", buf);
+
+	/* the name buffer comes back from libvhd for the caller to free,
+	 * as the other parent-name lookups in this tool do */
+	free(buf);
+}
+
+/*
+ * Print every parent locator slot of vhd's header whose code is not
+ * PLAT_CODE_NONE: slot index, platform code, data space/length/offset and
+ * the decoded parent name.  hex selects hexadecimal number output.
+ */
+static void
+vhd_print_parent_locators(vhd_context_t *vhd, int hex)
+{
+	int slot, total;
+	vhd_parent_locator_t *entry;
+
+	printf("VHD Parent Locators:\n--------------------\n");
+
+	total = sizeof(vhd->header.loc) / sizeof(struct prt_loc);
+
+	for (slot = 0; slot < total; slot++) {
+		entry = &vhd->header.loc[slot];
+
+		/* only populated slots are reported */
+		if (entry->code != PLAT_CODE_NONE) {
+			printf("locator: : %d\n", slot);
+			printf(" code : %s\n", code_name(entry->code));
+			printf(" data_space : %s\n",
+			       conv(hex, entry->data_space));
+			printf(" data_length : %s\n",
+			       conv(hex, entry->data_len));
+			printf(" data_offset : %s\n",
+			       conv(hex, entry->data_offset));
+			vhd_print_parent(vhd, entry);
+			printf("\n");
+		}
+	}
+}
+
+/*
+ * Print the batmap header fields (offset, size in sectors, version) and
+ * compare the stored checksum with one computed by vhd_checksum_batmap().
+ * hex selects hexadecimal number output.
+ */
+static void
+vhd_print_batmap_header(vhd_batmap_t *batmap, int hex)
+{
+	uint32_t computed;
+	const char *verdict;
+
+	printf("VHD Batmap Summary:\n-------------------\n");
+	printf("Batmap offset : %s\n",
+	       conv(hex, batmap->header.batmap_offset));
+	printf("Batmap size (secs) : %s\n",
+	       conv(hex, batmap->header.batmap_size));
+	printf("Batmap version : 0x%08x\n",
+	       batmap->header.batmap_version);
+
+	computed = vhd_checksum_batmap(batmap);
+	if (batmap->header.checksum == computed)
+		verdict = "Good!";
+	else
+		verdict = "Bad!";
+
+	printf("Checksum : 0x%x|0x%x (%s)\n",
+	       batmap->header.checksum, computed, verdict);
+	printf("\n");
+}
+
+/*
+ * Check a block number against the header's max_bat_size.  Returns 0 when
+ * block <= max_bat_size; otherwise reports the block on stderr and returns
+ * -ERANGE.  Callers in this file pass the exclusive end (first block +
+ * count) of the range they are about to touch.
+ */
+static inline int
+check_block_range(vhd_context_t *vhd, uint64_t block, int hex)
+{
+	if (block <= vhd->header.max_bat_size)
+		return 0;
+
+	fprintf(stderr, "block %s past end of file\n", conv(hex, block));
+	return -ERANGE;
+}
+
+/*
+ * Print all metadata summaries that apply to vhd: always the footer; for
+ * dynamic types also the header; for differencing disks the parent
+ * locators; and the batmap header when a batmap is present.
+ *
+ * Returns 0, or the error from vhd_get_batmap() if the batmap could not
+ * be loaded.
+ */
+static int
+vhd_print_headers(vhd_context_t *vhd, int hex)
+{
+	int rc;
+
+	vhd_print_footer(&vhd->footer, hex);
+
+	/* nothing beyond the footer unless the type is dynamic */
+	if (!vhd_type_dynamic(vhd))
+		return 0;
+
+	vhd_print_header(vhd, &vhd->header, hex);
+
+	if (vhd->footer.type == HD_TYPE_DIFF)
+		vhd_print_parent_locators(vhd, hex);
+
+	if (!vhd_has_batmap(vhd))
+		return 0;
+
+	rc = vhd_get_batmap(vhd);
+	if (rc) {
+		printf("failed to get batmap header\n");
+		return rc;
+	}
+
+	vhd_print_batmap_header(&vhd->batmap, hex);
+	return 0;
+}
+
+/*
+ * Best-effort dump of the footer and header of a file that could not be
+ * opened as a VHD.  The file is opened directly with open(2) and whatever
+ * vhd_read_footer()/vhd_read_header() manage to read is printed; their
+ * return values are deliberately ignored because the file is already
+ * known to look invalid.
+ *
+ * Returns 0 once the dump was attempted, -errno if the file cannot be
+ * opened, or -ENOMEM if the file name cannot be duplicated.
+ */
+static int
+vhd_dump_headers(const char *name, int hex)
+{
+	vhd_context_t vhd;
+
+	libvhd_set_log_level(1);
+	memset(&vhd, 0, sizeof(vhd));
+
+	printf("\n%s appears invalid; dumping headers\n\n", name);
+
+	vhd.fd = open(name, O_DIRECT | O_LARGEFILE | O_RDONLY);
+	if (vhd.fd == -1)
+		return -errno;
+
+	vhd.file = strdup(name);
+	if (!vhd.file) {
+		/* do not hand a NULL file name to the read helpers */
+		close(vhd.fd);
+		return -ENOMEM;
+	}
+
+	vhd_read_footer(&vhd, &vhd.footer);
+	vhd_read_header(&vhd, &vhd.header);
+
+	vhd_print_footer(&vhd.footer, hex);
+	vhd_print_header(&vhd, &vhd.header, hex);
+
+	close(vhd.fd);
+	free(vhd.file);
+
+	return 0;
+}
+
+/*
+ * For each of count logical sectors starting at sector, print the block
+ * number, the sector's index within the block and the byte offset of the
+ * sector in the file (or "not allocated" when the block's BAT entry is
+ * DD_BLK_UNUSED).
+ *
+ * Returns 0, or -ERANGE if the range extends past footer.curr_size.
+ */
+static int
+vhd_print_logical_to_physical(vhd_context_t *vhd,
+			      uint64_t sector, int count, int hex)
+{
+	int i;
+	uint32_t blk, lsec;
+	uint64_t cur, offset;
+
+	if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) {
+		fprintf(stderr, "sector %s past end of file\n",
+			conv(hex, sector + count));
+		return -ERANGE;
+	}
+
+	for (i = 0; i < count; i++) {
+		cur    = sector + i;
+		blk    = cur / vhd->spb;
+		lsec   = cur % vhd->spb;
+		offset = vhd->bat.bat[blk];
+
+		if (offset != DD_BLK_UNUSED) {
+			/* the BAT entry points at the block's bitmap; data
+			 * sectors follow it, bm_secs sectors later */
+			offset += vhd->bm_secs + lsec;
+			offset  = vhd_sectors_to_bytes(offset);
+		}
+
+		/* conv() reuses one static buffer: one use per printf() */
+		printf("logical sector %s: ", conv(hex, cur));
+		printf("block number: %s, ", conv(hex, blk));
+		printf("sector offset: %s, ", conv(hex, lsec));
+		printf("file offset: %s\n", (offset == DD_BLK_UNUSED ?
+					     "not allocated" : conv(hex, offset)));
+	}
+
+	return 0;
+}
+
+/*
+ * Print the BAT entries of count blocks starting at block.  Each entry is
+ * shown as a byte offset (sector value run through vhd_sectors_to_bytes())
+ * or as "not allocated" when it equals DD_BLK_UNUSED.
+ *
+ * Returns 0, or -ERANGE when check_block_range() rejects block + count.
+ */
+static int
+vhd_print_bat(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+	int n;
+	uint64_t idx, entry;
+
+	if (check_block_range(vhd, block + count, hex) != 0)
+		return -ERANGE;
+
+	for (n = 0; n < count; n++) {
+		idx   = block + n;
+		entry = vhd->bat.bat[idx];
+
+		printf("block: %s: ", conv(hex, idx));
+
+		if (entry == DD_BLK_UNUSED)
+			printf("offset: %s\n", "not allocated");
+		else
+			printf("offset: %s\n",
+			       conv(hex, vhd_sectors_to_bytes(entry)));
+	}
+
+	return 0;
+}
+
+/*
+ * Write count bytes from buf to fd, repeating short writes and retrying
+ * when write(2) is interrupted (EINTR).  Any other write error silently
+ * ends the transfer; nothing is reported to the caller.  A NULL buf is
+ * ignored.
+ */
+static inline void
+write_full(int fd, void* buf, size_t count)
+{
+	/* walk the buffer through a char pointer: arithmetic on void * is
+	 * a compiler extension, not standard C */
+	const char *cursor = buf;
+	ssize_t num_written;
+
+	if (!cursor)
+		return;
+
+	while (count > 0) {
+		num_written = write(fd, cursor, count);
+		if (num_written == -1) {
+			if (errno == EINTR)
+				continue;
+			return;
+		}
+
+		cursor += num_written;
+		count  -= num_written;
+	}
+}
+
+/*
+ * Write the raw bitmaps of count blocks starting at block to stdout.
+ * Blocks whose BAT entry is DD_BLK_UNUSED only produce a "not allocated"
+ * line.
+ *
+ * Returns 0, -ERANGE when check_block_range() rejects block + count, or
+ * the error from vhd_read_bitmap().
+ */
+static int
+vhd_print_bitmap(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+	char *bitmap;
+	int n, rc;
+	uint64_t idx;
+
+	if (check_block_range(vhd, block + count, hex) != 0)
+		return -ERANGE;
+
+	for (n = 0; n < count; n++) {
+		idx = block + n;
+
+		if (vhd->bat.bat[idx] == DD_BLK_UNUSED) {
+			printf("block %s not allocated\n", conv(hex, idx));
+			continue;
+		}
+
+		rc = vhd_read_bitmap(vhd, idx, &bitmap);
+		if (rc)
+			return rc;
+
+		write_full(STDOUT_FILENO, bitmap,
+			   vhd_sectors_to_bytes(vhd->bm_secs));
+		free(bitmap);
+	}
+
+	return 0;
+}
+
+/*
+ * For each of count logical sectors starting at sector, print whether the
+ * sector's bit is set in the bitmap of the block that contains it.
+ * Sectors in blocks whose BAT entry is DD_BLK_UNUSED are reported as 0.
+ * A block's bitmap is read once and reused for consecutive sectors of the
+ * same block.
+ *
+ * Returns 0, -ERANGE if the range extends past footer.curr_size, or the
+ * error from vhd_read_bitmap().
+ */
+static int
+vhd_test_bitmap(vhd_context_t *vhd, uint64_t sector, int count, int hex)
+{
+	char *buf;
+	uint64_t cur;
+	int i, err, bit;
+	uint32_t blk, bm_blk, sec;
+
+	if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) {
+		printf("sector %s past end of file\n", conv(hex, sector));
+		return -ERANGE;
+	}
+
+	err    = 0;
+	bm_blk = -1;	/* no bitmap cached yet */
+	buf    = NULL;
+
+	for (i = 0; i < count; i++) {
+		cur = sector + i;
+		blk = cur / vhd->spb;
+		sec = cur % vhd->spb;
+
+		if (blk != bm_blk) {
+			/* crossed into another block: drop the old bitmap */
+			bm_blk = blk;
+			free(buf);
+			buf = NULL;
+
+			if (vhd->bat.bat[blk] != DD_BLK_UNUSED) {
+				err = vhd_read_bitmap(vhd, blk, &buf);
+				if (err)
+					goto out;
+			}
+		}
+
+		if (vhd->bat.bat[blk] == DD_BLK_UNUSED)
+			bit = 0;
+		else
+			/* index the bitmap by the sector within the block,
+			 * not by the block number */
+			bit = vhd_bitmap_test(vhd, buf, sec);
+
+		printf("block %s: ", conv(hex, blk));
+		printf("sec: %s: %d\n", conv(hex, sec), bit);
+	}
+
+out:
+	free(buf);
+	return err;
+}
+
+/*
+ * Load the batmap and write its raw map to stdout; the length is
+ * header.batmap_size converted from sectors to bytes.
+ *
+ * Returns 0, or the error from vhd_get_batmap().
+ */
+static int
+vhd_print_batmap(vhd_context_t *vhd)
+{
+	int rc = vhd_get_batmap(vhd);
+
+	if (rc) {
+		printf("failed to read batmap: %d\n", rc);
+		return rc;
+	}
+
+	write_full(STDOUT_FILENO, vhd->batmap.map,
+		   (size_t)vhd_sectors_to_bytes(vhd->batmap.header.batmap_size));
+
+	return 0;
+}
+
+/*
+ * Report, on stderr, the vhd_batmap_test() result for each of count
+ * blocks starting at block.
+ *
+ * Returns 0, -ERANGE when check_block_range() rejects block + count, or
+ * the error from vhd_get_batmap().
+ */
+static int
+vhd_test_batmap(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+	int n, rc;
+	uint64_t idx;
+
+	if (check_block_range(vhd, block + count, hex) != 0)
+		return -ERANGE;
+
+	rc = vhd_get_batmap(vhd);
+	if (rc) {
+		fprintf(stderr, "failed to get batmap\n");
+		return rc;
+	}
+
+	for (n = 0, idx = block; n < count; n++, idx++)
+		fprintf(stderr, "batmap for block %s: %d\n", conv(hex, idx),
+			vhd_batmap_test(vhd, &vhd->batmap, idx));
+
+	return 0;
+}
+
+/*
+ * Write the raw contents of count blocks starting at block to stdout
+ * (header.block_size bytes per block).  Blocks whose BAT entry is
+ * DD_BLK_UNUSED only produce a "not allocated" line.
+ *
+ * Returns 0, -ERANGE when check_block_range() rejects block + count, or
+ * the error from vhd_read_block(), which also stops the dump.
+ */
+static int
+vhd_print_data(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+	char *payload;
+	int n, rc;
+	uint64_t idx;
+
+	if (check_block_range(vhd, block + count, hex) != 0)
+		return -ERANGE;
+
+	rc = 0;
+
+	for (n = 0; n < count; n++) {
+		idx = block + n;
+
+		if (vhd->bat.bat[idx] != DD_BLK_UNUSED) {
+			rc = vhd_read_block(vhd, idx, &payload);
+			if (rc)
+				break;
+
+			write_full(STDOUT_FILENO, payload,
+				   vhd->header.block_size);
+			free(payload);
+		} else {
+			printf("block %s not allocated\n", conv(hex, idx));
+		}
+	}
+
+	return rc;
+}
+
+/*
+ * Read count sectors starting at sec through vhd_io_read() and write them
+ * to stdout.  Data moves through a sector-aligned bounce buffer of at most
+ * VHD_BLOCK_SIZE bytes, so large requests are served in several rounds.
+ * hex is unused.
+ *
+ * Returns 0, -ERANGE if the range extends past footer.curr_size, the
+ * negated posix_memalign() error, or the error from vhd_io_read().
+ */
+static int
+vhd_read_data(vhd_context_t *vhd, uint64_t sec, int count, int hex)
+{
+	char *bounce;
+	uint64_t pos;
+	int rc, cap, chunk;
+
+	if (vhd_sectors_to_bytes(sec + count) > vhd->footer.curr_size)
+		return -ERANGE;
+
+	cap = MIN(vhd_sectors_to_bytes(count), VHD_BLOCK_SIZE);
+	rc  = posix_memalign((void **)&bounce, VHD_SECTOR_SIZE, cap);
+	if (rc)
+		return -rc;
+
+	for (pos = sec; count; pos += chunk, count -= chunk) {
+		chunk = MIN((cap >> VHD_SECTOR_SHIFT), count);
+
+		rc = vhd_io_read(vhd, bounce, pos, chunk);
+		if (rc)
+			break;
+
+		write_full(STDOUT_FILENO, bounce, vhd_sectors_to_bytes(chunk));
+	}
+
+	free(bounce);
+	return rc;
+}
+
+/*
+ * vhd-util "read" entry point: dumps metadata or data from a VHD.
+ *
+ * Options (parsed with getopt from argv; see the usage text below):
+ *   -n NAME  VHD file (required)       -p      print VHD headers
+ *   -t SEC   translate logical sector  -b BLK  print BAT entry
+ *   -m BLK   print bitmap              -i SEC  test bitmap for sector
+ *   -a       print batmap              -j BLK  test batmap for block
+ *   -d BLK   print data                -r SEC  read sectors at SEC
+ *   -c NUM   unit count for the above (default 1)
+ *   -x       print numbers in hex      -h      usage
+ *
+ * Selectors default to (uint64_t)-1, meaning "not requested".  Requested
+ * actions run in a fixed order and the first failure ends the command.
+ * If the file cannot be opened as a VHD, a best-effort header dump is
+ * printed before returning the open error.
+ *
+ * Returns 0 on success, -EINVAL on bad usage (matching the other vhd-util
+ * commands), otherwise the error of the failing step.
+ */
+int
+vhd_util_read(int argc, char **argv)
+{
+	char *name;
+	vhd_context_t vhd;
+	int c, err, headers, hex;
+	uint64_t bat, bitmap, tbitmap, batmap, tbatmap, data, lsec, count, read;
+
+	err     = 0;
+	hex     = 0;
+	headers = 0;
+	count   = 1;
+	bat     = -1;
+	bitmap  = -1;
+	tbitmap = -1;
+	batmap  = -1;
+	tbatmap = -1;
+	data    = -1;
+	lsec    = -1;
+	read    = -1;
+	name    = NULL;
+
+	if (!argc || !argv)
+		goto usage;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, "n:pt:b:m:i:aj:d:c:r:xh")) != -1) {
+		switch(c) {
+		case 'n':
+			name = optarg;
+			break;
+		case 'p':
+			headers = 1;
+			break;
+		case 't':
+			lsec = strtoul(optarg, NULL, 10);
+			break;
+		case 'b':
+			bat = strtoull(optarg, NULL, 10);
+			break;
+		case 'm':
+			bitmap = strtoull(optarg, NULL, 10);
+			break;
+		case 'i':
+			tbitmap = strtoul(optarg, NULL, 10);
+			break;
+		case 'a':
+			batmap = 1;
+			break;
+		case 'j':
+			tbatmap = strtoull(optarg, NULL, 10);
+			break;
+		case 'd':
+			data = strtoull(optarg, NULL, 10);
+			break;
+		case 'r':
+			read = strtoull(optarg, NULL, 10);
+			break;
+		case 'c':
+			count = strtoul(optarg, NULL, 10);
+			break;
+		case 'x':
+			hex = 1;
+			break;
+		case 'h':
+		default:
+			goto usage;
+		}
+	}
+
+	if (!name || optind != argc)
+		goto usage;
+
+	err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+	if (err) {
+		printf("Failed to open %s: %d\n", name, err);
+		vhd_dump_headers(name, hex);
+		return err;
+	}
+
+	err = vhd_get_bat(&vhd);
+	if (err) {
+		printf("Failed to get bat for %s: %d\n", name, err);
+		goto out;
+	}
+
+	if (headers)
+		vhd_print_headers(&vhd, hex);
+
+	if (lsec != -1) {
+		err = vhd_print_logical_to_physical(&vhd, lsec, count, hex);
+		if (err)
+			goto out;
+	}
+
+	if (bat != -1) {
+		err = vhd_print_bat(&vhd, bat, count, hex);
+		if (err)
+			goto out;
+	}
+
+	if (bitmap != -1) {
+		err = vhd_print_bitmap(&vhd, bitmap, count, hex);
+		if (err)
+			goto out;
+	}
+
+	if (tbitmap != -1) {
+		err = vhd_test_bitmap(&vhd, tbitmap, count, hex);
+		if (err)
+			goto out;
+	}
+
+	if (batmap != -1) {
+		err = vhd_print_batmap(&vhd);
+		if (err)
+			goto out;
+	}
+
+	if (tbatmap != -1) {
+		err = vhd_test_batmap(&vhd, tbatmap, count, hex);
+		if (err)
+			goto out;
+	}
+
+	if (data != -1) {
+		err = vhd_print_data(&vhd, data, count, hex);
+		if (err)
+			goto out;
+	}
+
+	if (read != -1) {
+		err = vhd_read_data(&vhd, read, count, hex);
+		if (err)
+			goto out;
+	}
+
+	err = 0;
+
+out:
+	vhd_close(&vhd);
+	return err;
+
+usage:
+	printf("options:\n"
+	       "-h help\n"
+	       "-n name\n"
+	       "-p print VHD headers\n"
+	       "-t sec translate logical sector to VHD location\n"
+	       "-b blk print bat entry\n"
+	       "-m blk print bitmap\n"
+	       "-i sec test bitmap for logical sector\n"
+	       "-a print batmap\n"
+	       "-j blk test batmap for block\n"
+	       "-d blk print data\n"
+	       "-c num num units\n"
+	       "-r sec read num sectors at sec\n"
+	       "-x print in hex\n");
+	/* negative errno, like the other vhd-util commands */
+	return -EINVAL;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * vhd-util "repair" entry point: rewrites the footer of a VHD at the
+ * position reported by vhd_end_of_data().
+ *
+ * Options: -n NAME (required), -h (usage).
+ *
+ * Returns 0 on success, -EINVAL on bad usage, otherwise the error from
+ * vhd_open(), vhd_end_of_data() or vhd_write_footer_at().
+ */
+int
+vhd_util_repair(int argc, char **argv)
+{
+	vhd_context_t vhd;
+	char *name = NULL;
+	off_t eof;
+	int err, c;
+
+	if (!argc || !argv)
+		goto usage;
+
+	optind = 0;
+	for (;;) {
+		c = getopt(argc, argv, "n:h");
+		if (c == -1)
+			break;
+
+		/* -n is the only option that does not lead to usage */
+		if (c != 'n')
+			goto usage;
+
+		name = optarg;
+	}
+
+	if (!name || optind != argc)
+		goto usage;
+
+	err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+	if (err) {
+		printf("error opening %s: %d\n", name, err);
+		return err;
+	}
+
+	err = vhd_end_of_data(&vhd, &eof);
+	if (err)
+		printf("error finding end of data: %d\n", err);
+	else
+		err = vhd_write_footer_at(&vhd, &vhd.footer, eof);
+
+	vhd_close(&vhd);
+	return err;
+
+usage:
+	printf("options: <-n name> [-h help]\n");
+	return -EINVAL;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syslog.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+
+#include "libvhd-journal.h"
+
+/*
+ * DFPRINTF: printf-style debug output.  With the "#if 1" below it is
+ * always enabled and writes to stdout; switch the condition to compile it
+ * out.
+ */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf(stdout, _f, ##_a)
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/*
+ * EPRINTF: report a message twice -- to syslog at LOG_INFO, prefixed with
+ * the calling function's name, and through DFPRINTF().
+ */
+#define EPRINTF(_f, _a...) \
+ do { \
+ syslog(LOG_INFO, "%s: " _f, __func__, ##_a); \
+ DFPRINTF(_f, _a); \
+ } while (0)
+
+/*
+ * Pairs a block number with its BAT entry so that lists of blocks can be
+ * sorted by BAT value while remembering which block each entry belongs to.
+ */
+typedef struct vhd_block {
+ uint32_t block; /* block number (index into the BAT) */
+ uint32_t offset; /* BAT entry recorded for that block */
+} vhd_block_t;
+
+TEST_FAIL_EXTERN_VARS;
+
+/*
+ * Convert a sector count to whole blocks, rounding down, using the
+ * context's sectors-per-block value (vhd->spb).
+ */
+static inline uint32_t
+secs_to_blocks_down(vhd_context_t *vhd, uint64_t secs)
+{
+	uint32_t whole;
+
+	whole = secs / vhd->spb;
+	return whole;
+}
+
+/*
+ * Convert a sector count to blocks, rounding up: a partially filled
+ * trailing block counts as one block (block size is vhd->spb sectors).
+ */
+static uint32_t
+secs_to_blocks_up(vhd_context_t *vhd, uint64_t secs)
+{
+	uint32_t whole = secs / vhd->spb;
+
+	return (secs % vhd->spb) ? whole + 1 : whole;
+}
+
+/*
+ * Shrink a fixed VHD by secs sectors: truncate the file to the reduced
+ * size, record that size in footer.curr_size and rewrite the footer.
+ *
+ * Returns 0 on success, -EINVAL if the new size would not exceed
+ * sizeof(vhd_footer_t), -errno if ftruncate() fails, or the error from
+ * vhd_write_footer().
+ *
+ * NOTE(review): the file is truncated to exactly the new data size before
+ * the footer is written; whether vhd_write_footer() appends the footer or
+ * overwrites the file's tail is not visible here -- confirm.
+ */
+static int
+vhd_fixed_shrink(vhd_journal_t *journal, uint64_t secs)
+{
+	int err;
+	uint64_t new_eof;
+	vhd_context_t *vhd;
+
+	vhd = &journal->vhd;
+
+	new_eof = vhd->footer.curr_size - vhd_sectors_to_bytes(secs);
+	if (new_eof <= sizeof(vhd_footer_t))
+		return -EINVAL;
+
+	err = ftruncate(vhd->fd, new_eof);
+	if (err)
+		/* negative errno, like every other error path here */
+		return -errno;
+
+	vhd->footer.curr_size = new_eof;
+	return vhd_write_footer(vhd, &vhd->footer);
+}
+
+/*
+ * Write size zero bytes to the VHD starting at file offset off.  The
+ * zeros come from a read-only anonymous mapping of at most VHD_BLOCK_SIZE
+ * bytes that is written repeatedly until size is exhausted.
+ *
+ * Returns 0 on success, the error from vhd_seek()/vhd_write(), or -errno
+ * if the mapping cannot be created.
+ */
+static int
+vhd_write_zeros(vhd_journal_t *journal, off_t off, uint64_t size)
+{
+	vhd_context_t *ctx = &journal->vhd;
+	const uint64_t window = MIN(size, VHD_BLOCK_SIZE);
+	uint64_t chunk;
+	char *zeros;
+	int rc;
+
+	rc = vhd_seek(ctx, off, SEEK_SET);
+	if (rc)
+		return rc;
+
+	/* a fresh anonymous mapping reads back as zeros */
+	zeros = mmap(0, window, PROT_READ, MAP_SHARED | MAP_ANON, -1, 0);
+	if (zeros == MAP_FAILED)
+		return -errno;
+
+	do {
+		chunk = MIN(size, window);
+
+		rc = vhd_write(ctx, zeros, chunk);
+		if (rc)
+			break;
+
+		size -= chunk;
+	} while (size);
+
+	munmap(zeros, window);
+
+	return rc;
+}
+
+/*
+ * Grow a fixed VHD by secs sectors: zero-fill that many bytes starting
+ * where the current footer begins (end of file minus
+ * sizeof(vhd_footer_t)), seek to the old end of file plus the growth,
+ * add the growth to footer.curr_size and rewrite the footer.
+ *
+ * Returns 0 on success or the first error encountered (-errno when the
+ * file position cannot be determined).
+ */
+static int
+vhd_fixed_grow(vhd_journal_t *journal, uint64_t secs)
+{
+	vhd_context_t *ctx = &journal->vhd;
+	const uint64_t growth = vhd_sectors_to_bytes(secs);
+	uint64_t eof;
+	int rc;
+
+	rc = vhd_seek(ctx, 0, SEEK_END);
+	if (rc)
+		return rc;
+
+	eof = vhd_position(ctx);
+	if (eof == (off_t)-1)
+		return -errno;
+
+	rc = vhd_write_zeros(journal, eof - sizeof(vhd_footer_t), growth);
+	if (rc)
+		return rc;
+
+	rc = vhd_seek(ctx, eof + growth, SEEK_SET);
+	if (rc)
+		return rc;
+
+	ctx->footer.curr_size += growth;
+
+	return vhd_write_footer(ctx, &ctx->footer);
+}
+
+/*
+ * Resize a fixed VHD to size megabytes.  The current size (from
+ * footer.curr_size) and the target are compared in sectors; the function
+ * then shrinks, grows, or does nothing.
+ *
+ * Returns 0 when no change is needed, otherwise the result of
+ * vhd_fixed_shrink() or vhd_fixed_grow().
+ */
+static int
+vhd_fixed_resize(vhd_journal_t *journal, uint64_t size)
+{
+	vhd_context_t *ctx = &journal->vhd;
+	const uint64_t have = ctx->footer.curr_size >> VHD_SECTOR_SHIFT;
+	const uint64_t want = size << (20 - VHD_SECTOR_SHIFT);
+
+	if (have > want)
+		return vhd_fixed_shrink(journal, have - want);
+
+	if (have < want)
+		return vhd_fixed_grow(journal, want - have);
+
+	return 0;
+}
+
+/* Exchange elements a and b of list (a no-op when a == b). */
+static inline void
+swap(vhd_block_t *list, int a, int b)
+{
+	vhd_block_t *first  = &list[a];
+	vhd_block_t *second = &list[b];
+	vhd_block_t saved   = *first;
+
+	*first  = *second;
+	*second = saved;
+}
+
+/*
+ * Quicksort partition step for a descending sort on .offset.  Using
+ * list[pidx] as the pivot, rearranges list[left..right] so that entries
+ * with offset >= pivot come before the pivot's final position and smaller
+ * ones after it.  Returns the pivot's final index.
+ */
+static int
+partition(vhd_block_t *list, int left, int right, int pidx)
+{
+	const long long pivot = list[pidx].offset;
+	int store = left;
+	int scan;
+
+	/* park the pivot at the right edge while the rest is split */
+	swap(list, pidx, right);
+
+	for (scan = left; scan < right; scan++) {
+		if (list[scan].offset < pivot)
+			continue;
+
+		swap(list, store, scan);
+		store++;
+	}
+
+	swap(list, right, store);
+	return store;
+}
+
+/*
+ * Sort list[left..right] in place, in descending order of .offset, using
+ * quicksort with the leftmost element as pivot.  The left part is handled
+ * by recursion; the right part by continuing the loop.
+ */
+static void
+quicksort(vhd_block_t *list, int left, int right)
+{
+	int split;
+
+	while (left <= right) {
+		split = partition(list, left, right, left);
+		quicksort(list, left, split - 1);
+		left = split + 1;
+	}
+}
+
+/*
+ * Relocate block src (bitmap followed by data) to byte offset `offset` in
+ * the file.
+ *
+ * Steps: journal the block, copy its bitmap to `offset`, copy its data
+ * right after the bitmap, point the in-memory BAT entry for src at the new
+ * location, then zero the old location.
+ *
+ * Returns 0 on success, -EINVAL if src has no allocation (BAT entry is
+ * DD_BLK_UNUSED), or the error of the failing libvhd call.  Only the
+ * in-memory BAT is modified here.
+ */
+static int
+vhd_move_block(vhd_journal_t *journal, uint32_t src, off_t offset)
+{
+ int err;
+ char *buf;
+ size_t size;
+ vhd_context_t *vhd;
+ off_t off, src_off;
+
+ buf = NULL;
+ vhd = &journal->vhd;
+ off = offset;
+ size = vhd_sectors_to_bytes(vhd->bm_secs);
+ src_off = vhd->bat.bat[src];
+
+ if (src_off == DD_BLK_UNUSED)
+ return -EINVAL;
+ /* BAT entries are in sectors; keep the old location in bytes */
+ src_off = vhd_sectors_to_bytes(src_off);
+
+ err = vhd_journal_add_block(journal, src,
+ VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA);
+ if (err)
+ goto out;
+
+ /* first the bitmap ... */
+ err = vhd_read_bitmap(vhd, src, &buf);
+ if (err)
+ goto out;
+
+ err = vhd_seek(vhd, off, SEEK_SET);
+ if (err)
+ goto out;
+
+ err = vhd_write(vhd, buf, size);
+ if (err)
+ goto out;
+
+ free(buf);
+ buf = NULL;
+ /* ... then the data, placed directly after the bitmap */
+ off += size;
+ size = vhd_sectors_to_bytes(vhd->spb);
+
+ err = vhd_read_block(vhd, src, &buf);
+ if (err)
+ goto out;
+
+ err = vhd_seek(vhd, off, SEEK_SET);
+ if (err)
+ goto out;
+
+ err = vhd_write(vhd, buf, size);
+ if (err)
+ goto out;
+
+ /* the block now lives at the new offset (stored in sectors) */
+ vhd->bat.bat[src] = offset >> VHD_SECTOR_SHIFT;
+
+ /* wipe the bitmap + data at the old location */
+ err = vhd_write_zeros(journal, src_off,
+ vhd_sectors_to_bytes(vhd->bm_secs + vhd->spb));
+
+out:
+ free(buf);
+ return err;
+}
+
+/*
+ * Overwrite block dest with block src: journal dest, move src into the
+ * file location dest currently occupies, then mark dest's in-memory BAT
+ * entry as DD_BLK_UNUSED.
+ *
+ * Returns 0 on success, or the error from vhd_journal_add_block() or
+ * vhd_move_block(); on error dest's BAT entry is left untouched.
+ */
+static int
+vhd_clobber_block(vhd_journal_t *journal, uint32_t src, uint32_t dest)
+{
+	vhd_context_t *ctx = &journal->vhd;
+	const off_t target = vhd_sectors_to_bytes(ctx->bat.bat[dest]);
+	int rc;
+
+	rc = vhd_journal_add_block(journal, dest,
+				   VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA);
+	if (!rc)
+		rc = vhd_move_block(journal, src, target);
+	if (!rc)
+		ctx->bat.bat[dest] = DD_BLK_UNUSED;
+
+	return rc;
+}
+
+/*
+ * remove a list of blocks from the vhd file
+ * if a block to be removed:
+ * - resides at the end of the file: simply clear its bat entry
+ * - resides elsewhere: move the last block in the file into its position
+ * and update the bat to reflect this
+ *
+ * original_free_list/free_cnt name the blocks to remove; the caller's list
+ * is copied, not modified.  Only the in-memory BAT is updated here.
+ *
+ * Returns 0 on success, -ENOMEM if a working list cannot be allocated, or
+ * the error from vhd_clobber_block().
+ */
+static int
+vhd_defrag_shrink(vhd_journal_t *journal,
+ vhd_block_t *original_free_list, int free_cnt)
+{
+ vhd_context_t *vhd;
+ int i, j, free_idx, err;
+ vhd_block_t *blocks, *free_list;
+
+ err = 0;
+ blocks = NULL;
+ free_list = NULL;
+ vhd = &journal->vhd;
+
+ /* working copy of the whole BAT as (block, offset) pairs */
+ blocks = malloc(vhd->bat.entries * sizeof(vhd_block_t));
+ if (!blocks) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* private, sortable copy of the caller's list */
+ free_list = malloc(free_cnt * sizeof(vhd_block_t));
+ if (!free_list) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < vhd->bat.entries; i++) {
+ blocks[i].block = i;
+ blocks[i].offset = vhd->bat.bat[i];
+ }
+
+ memcpy(free_list, original_free_list,
+ free_cnt * sizeof(vhd_block_t));
+
+ /* sort both the to-free list and the bat list
+ * in order of descending file offset */
+ quicksort(free_list, 0, free_cnt - 1);
+ quicksort(blocks, 0, vhd->bat.entries - 1);
+
+ /* walk allocated blocks from the end of the file backwards */
+ for (i = 0, free_idx = 0;
+ i < vhd->bat.entries && free_idx < free_cnt; i++) {
+ vhd_block_t *b = blocks + i;
+
+ if (b->offset == DD_BLK_UNUSED)
+ continue;
+
+ /* NOTE(review): the `continue` below resumes this inner loop,
+ * not the outer one, so vhd_clobber_block() still runs after a
+ * match (with free_idx already advanced).  This looks
+ * unintended -- confirm before relying on this path. */
+ for (j = free_idx; j < free_cnt; j++)
+ if (b->block == free_list[j].block) {
+ /* the last block in the file is in the list of
+ * blocks to remove; no need to shuffle the
+ * data -- just clear the bat entry */
+ vhd->bat.bat[free_list[j].block] = DD_BLK_UNUSED;
+ free_idx++;
+ continue;
+ }
+
+ err = vhd_clobber_block(journal, b->block,
+ free_list[free_idx++].block);
+ if (err)
+ goto out;
+ }
+
+ /* clear any bat entries for blocks we did not shuffle */
+ for (i = free_idx; i < free_cnt; i++)
+ vhd->bat.bat[free_list[i].block] = DD_BLK_UNUSED;
+
+out:
+ free(blocks);
+ free(free_list);
+
+ return err;
+}
+
+/*
+ * Shrink the bat by @entries entries.
+ *
+ * Rewrites the header (max_bat_size) and the footer (curr_size and
+ * geometry), zeroes the trailing bat entries on disk, and, when the vhd
+ * has a batmap, zeroes a stale batmap header and clears the batmap bits
+ * of the removed entries.  No file space is reclaimed here.
+ *
+ * Returns 0 on success or the first error reported by a helper.
+ */
+static int
+vhd_clear_bat_entries(vhd_journal_t *journal, uint32_t entries)
+{
+	vhd_context_t *ctx = &journal->vhd;
+	const uint32_t old_cnt = ctx->header.max_bat_size;
+	const uint32_t new_cnt = old_cnt - entries;
+	off_t old_map_off = 0, cur_map_off;
+	int idx, rc;
+
+	/* remember where the batmap header sits before max_bat_size changes */
+	if (vhd_has_batmap(ctx)) {
+		rc = vhd_batmap_header_offset(ctx, &old_map_off);
+		if (rc)
+			return rc;
+	}
+
+	/* header: new entry count */
+	ctx->header.max_bat_size = new_cnt;
+	rc = vhd_write_header(ctx, &ctx->header);
+	if (rc)
+		return rc;
+
+	/* footer: new virtual size and matching geometry */
+	ctx->footer.curr_size = (uint64_t)new_cnt * ctx->header.block_size;
+	ctx->footer.geometry  = vhd_chs(ctx->footer.curr_size);
+	rc = vhd_write_footer(ctx, &ctx->footer);
+	if (rc)
+		return rc;
+
+	/* bat: space is not reclaimed, the trailing entries are only zeroed */
+	for (idx = new_cnt; idx < old_cnt; idx++)
+		ctx->bat.bat[idx] = 0;
+
+	rc = vhd_write_bat(ctx, &ctx->bat);
+	if (rc)
+		return rc;
+
+	/* shrink the entry count only after vhd_write_bat() so that the
+	 * zeroed tail of the bat reaches the disk as well */
+	ctx->bat.entries = new_cnt;
+
+	if (!vhd_has_batmap(ctx))
+		return 0;
+
+	/* wipe the old batmap header if the header has moved */
+	rc = vhd_batmap_header_offset(ctx, &cur_map_off);
+	if (rc)
+		return rc;
+
+	if (old_map_off != cur_map_off) {
+		size_t hdr_bytes;
+
+		hdr_bytes = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+		rc = vhd_write_zeros(journal, old_map_off, hdr_bytes);
+		if (rc)
+			return rc;
+	}
+
+	/* batmap: clear the bits belonging to the removed entries */
+	for (idx = new_cnt; idx < old_cnt; idx++)
+		vhd_batmap_clear(ctx, &ctx->batmap, idx);
+
+	return vhd_write_batmap(ctx, &ctx->batmap);
+}
+
+/*
+ * Shrink a dynamic vhd by @secs sectors.
+ *
+ * Currently a stub: it prints a message and returns -ENOSYS
+ * unconditionally.  Every statement after that `return` is unreachable
+ * and is kept only as a sketch of the intended algorithm (collect the
+ * allocated blocks among the trailing bat entries, defragment them away,
+ * clear the bat entries, then truncate the file after the footer).
+ */
+static int
+vhd_dynamic_shrink(vhd_journal_t *journal, uint64_t secs)
+{
+ off_t eof;
+ uint32_t blocks;
+ vhd_context_t *vhd;
+ int i, j, err, free_cnt;
+ struct vhd_block *free_list;
+
+ printf("dynamic shrink not fully implemented\n");
+ return -ENOSYS;
+
+ /* ---- unreachable from here on ---- */
+
+ eof = 0;
+ free_cnt = 0;
+ free_list = NULL;
+ vhd = &journal->vhd;
+
+ /* number of whole blocks to drop */
+ blocks = secs_to_blocks_down(vhd, secs);
+ if (blocks == 0)
+ return 0;
+
+ if (vhd_has_batmap(vhd)) {
+ err = vhd_get_batmap(vhd);
+ if (err)
+ return err;
+ }
+
+ free_list = malloc(blocks * sizeof(struct vhd_block));
+ if (!free_list)
+ return -ENOMEM;
+
+ /* collect the allocated blocks among the last `blocks` bat entries */
+ for (i = vhd->bat.entries - 1, j = 0; i >= 0 && j < blocks; i--, j++) {
+ uint32_t blk = vhd->bat.bat[i];
+
+ if (blk != DD_BLK_UNUSED) {
+ free_list[free_cnt].block = i;
+ free_list[free_cnt].offset = blk;
+ free_cnt++;
+ }
+ }
+
+ if (free_cnt) {
+ err = vhd_defrag_shrink(journal, free_list, free_cnt);
+ if (err)
+ goto out;
+ }
+
+ err = vhd_clear_bat_entries(journal, blocks);
+ if (err)
+ goto out;
+
+ /* remove data beyond footer */
+ err = vhd_end_of_data(vhd, &eof);
+ if (err)
+ goto out;
+
+ err = ftruncate(vhd->fd, eof + sizeof(vhd_footer_t));
+ if (err) {
+ err = -errno;
+ goto out;
+ }
+
+ err = 0;
+
+out:
+ free(free_list);
+ return err;
+}
+
+/*
+ * Find the allocated block with the lowest file offset.
+ *
+ * @block is zeroed first; if at least one bat entry is in use it is
+ * filled with that entry's index and sector offset.  A zero
+ * block->offset on return therefore means no block was selected.
+ */
+static inline void
+vhd_first_data_block(vhd_context_t *vhd, vhd_block_t *block)
+{
+	int idx;
+
+	memset(block, 0, sizeof(vhd_block_t));
+
+	for (idx = 0; idx < vhd->bat.entries; idx++) {
+		uint32_t sector = vhd->bat.bat[idx];
+
+		if (sector == DD_BLK_UNUSED)
+			continue;
+
+		/* keep the current candidate unless this one is lower */
+		if (block->offset && sector >= block->offset)
+			continue;
+
+		block->block  = idx;
+		block->offset = sector;
+	}
+}
+
+/*
+ * Return the sector just past the end of the allocated block that lies
+ * furthest into the file (block start + data sectors + bitmap sectors),
+ * or 0 if no bat entry is in use.
+ */
+static inline uint32_t
+vhd_next_block_offset(vhd_context_t *vhd)
+{
+	int i;
+	uint32_t blk, end, next;
+
+	next = 0;
+
+	for (i = 0; i < vhd->bat.entries; i++) {
+		blk = vhd->bat.bat[i];
+
+		if (blk != DD_BLK_UNUSED) {
+			end  = blk + vhd->spb + vhd->bm_secs;
+			next = MAX(next, end);
+		}
+	}
+
+	return next;
+}
+
+/*
+ * Return non-zero iff @off lies strictly inside (@start, @start + @size);
+ * both ends are exclusive.
+ */
+static inline int
+in_range(off_t off, off_t start, off_t size)
+{
+	if (off <= start)
+		return 0;
+
+	return (off < start + size);
+}
+
+/*
+ * Bit flags for the @mode argument of vhd_check_for_clobber(): each set
+ * bit disables the overlap check against the corresponding region.
+ */
+#define SKIP_HEADER 0x01
+#define SKIP_BAT 0x02
+#define SKIP_BATMAP 0x04
+#define SKIP_PLOC 0x08
+#define SKIP_DATA 0x10
+
+/* Return the bits of @type that are set in @mode (non-zero means skip). */
+static inline int
+skip_check(int mode, int type)
+{
+	int selected = mode & type;
+
+	return selected;
+}
+
+/*
+ * Check whether byte offset @off falls inside a region of a dynamic vhd
+ * that must not be overwritten.
+ *
+ * Regions tested, unless masked by the matching SKIP_* bit in @mode:
+ * the header, the bat, the batmap (if present), every parent locator in
+ * use, and the first data block.  An offset inside the first sector
+ * (the backup footer) always fails.  Non-dynamic vhds always pass.
+ *
+ * Returns 0 if @off is safe, -EINVAL (after logging the region) if not.
+ */
+static int
+vhd_check_for_clobber(vhd_context_t *vhd, off_t off, int mode)
+{
+	int i, n;
+	char *msg;
+	size_t size;
+	vhd_block_t fb;
+	vhd_parent_locator_t *loc;
+
+	msg = NULL;
+
+	if (!vhd_type_dynamic(vhd))
+		return 0;
+
+	if (off < VHD_SECTOR_SIZE) {
+		msg = "backup footer";
+		goto fail;
+	}
+
+	if (!skip_check(mode, SKIP_HEADER)) {
+		if (in_range(off,
+			     vhd->footer.data_offset, sizeof(vhd_header_t))) {
+			msg = "header";
+			goto fail;
+		}
+	}
+
+	if (!skip_check(mode, SKIP_BAT)) {
+		if (in_range(off, vhd->header.table_offset,
+			     vhd_bytes_padded(vhd->header.max_bat_size *
+					      sizeof(uint32_t)))) {
+			msg = "bat";
+			goto fail;
+		}
+	}
+
+	if (!skip_check(mode, SKIP_BATMAP)) {
+		if (vhd_has_batmap(vhd) &&
+		    in_range(off, vhd->batmap.header.batmap_offset,
+			     vhd_bytes_padded(vhd->batmap.header.batmap_size))) {
+			msg = "batmap";
+			goto fail;
+		}
+	}
+
+	if (!skip_check(mode, SKIP_PLOC)) {
+		n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+		for (i = 0; i < n; i++) {
+			loc = vhd->header.loc + i;
+			if (loc->code == PLAT_CODE_NONE)
+				continue;
+
+			size = vhd_parent_locator_size(loc);
+			if (in_range(off, loc->data_offset, size)) {
+				msg = "parent locator";
+				goto fail;
+			}
+		}
+	}
+
+	if (!skip_check(mode, SKIP_DATA)) {
+		/* only the lowest allocated block is tested */
+		vhd_first_data_block(vhd, &fb);
+		if (fb.offset && in_range(off,
+					  vhd_sectors_to_bytes(fb.offset),
+					  VHD_BLOCK_SIZE)) {
+			msg = "data block";
+			goto fail;
+		}
+	}
+
+	return 0;
+
+fail:
+	/* off_t is not guaranteed to be 64 bits wide; cast so the argument
+	 * matches the PRIx64 conversion */
+	EPRINTF("write to 0x%08"PRIx64" would clobber %s\n",
+		(uint64_t)off, msg);
+	return -EINVAL;
+}
+
+/*
+ * take any metadata after the bat (@eob) and shift it
+ *
+ * Every parent locator whose data starts at or after @eob is read into
+ * memory and rewritten @bat_needed + @map_needed bytes further into the
+ * file; the in-memory locator offsets are updated to match.  If the vhd
+ * has a batmap located after @eob, its in-memory offset is advanced by
+ * @bat_needed.  The header and batmap themselves are not written here;
+ * the caller does that after writing the new bat.
+ *
+ * Returns 0 on success, -EINVAL if a shifted locator would overlap other
+ * metadata or data, or a negative error from allocation or I/O.  All
+ * temporary buffers are released on every path.
+ */
+static int
+vhd_shift_metadata(vhd_journal_t *journal, off_t eob,
+		   size_t bat_needed, size_t map_needed)
+{
+	int i, n, err;
+	vhd_context_t *vhd;
+	size_t size_needed;
+	char *buf, **locators;
+	vhd_parent_locator_t *loc;
+
+	vhd         = &journal->vhd;
+	size_needed = bat_needed + map_needed;
+
+	n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+
+	locators = calloc(n, sizeof(char *));
+	if (!locators)
+		return -ENOMEM;
+
+	/* pass 1: read every locator that lives after the bat */
+	for (i = 0; i < n; i++) {
+		size_t size;
+
+		loc = vhd->header.loc + i;
+		if (loc->code == PLAT_CODE_NONE)
+			continue;
+
+		if (loc->data_offset < eob)
+			continue;
+
+		size = vhd_parent_locator_size(loc);
+		buf  = NULL;
+		err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+		if (err) {
+			/* posix_memalign returns a positive errno value */
+			err = -err;
+			goto out;
+		}
+
+		/* store the buffer in the table immediately so the cleanup
+		 * at out: frees it if the seek or read below fails */
+		locators[i] = buf;
+
+		err = vhd_seek(vhd, loc->data_offset, SEEK_SET);
+		if (err)
+			goto out;
+
+		err = vhd_read(vhd, buf, size);
+		if (err)
+			goto out;
+	}
+
+	/* pass 2: write each saved locator at its shifted position */
+	for (i = 0; i < n; i++) {
+		off_t off;
+		size_t size;
+
+		if (!locators[i])
+			continue;
+
+		loc  = vhd->header.loc + i;
+		off  = loc->data_offset + size_needed;
+		size = vhd_parent_locator_size(loc);
+
+		if (vhd_check_for_clobber(vhd, off + size, SKIP_PLOC)) {
+			EPRINTF("%s: shifting locator %d would clobber data\n",
+				vhd->file, i);
+			/* go through out: so the buffers are released */
+			err = -EINVAL;
+			goto out;
+		}
+
+		err = vhd_seek(vhd, off, SEEK_SET);
+		if (err)
+			goto out;
+
+		err = vhd_write(vhd, locators[i], size);
+		if (err)
+			goto out;
+
+		free(locators[i]);
+		locators[i]      = NULL;
+		loc->data_offset = off;
+
+		/* write the new header after writing the new bat */
+	}
+
+	if (vhd_has_batmap(vhd) && vhd->batmap.header.batmap_offset > eob) {
+		vhd->batmap.header.batmap_offset += bat_needed;
+
+		/* write the new batmap after writing the new bat */
+	}
+
+	err = 0;
+
+out:
+	for (i = 0; i < n; i++)
+		free(locators[i]);
+	free(locators);
+
+	return err;
+}
+
+/*
+ * Extend the bat (and the batmap, if present) by @entries entries.
+ *
+ * Verifies that the enlarged bat and batmap would not overlap other
+ * regions, rewrites the header and footer for the new size, then writes
+ * a new bat whose added entries are DD_BLK_UNUSED and, for vhds with a
+ * batmap, a new zero-extended batmap.  The in-memory bat and batmap are
+ * replaced only after the corresponding write succeeds.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_add_bat_entries(vhd_journal_t *journal, int entries)
+{
+	int i, err;
+	off_t off;
+	vhd_bat_t new_bat;
+	vhd_context_t *vhd;
+	uint32_t new_entries;
+	vhd_batmap_t new_batmap;
+	uint64_t bat_size, new_bat_size, map_size, new_map_size;
+
+	vhd          = &journal->vhd;
+	new_entries  = vhd->header.max_bat_size + entries;
+
+	/* sizes are sampled before max_bat_size is updated below */
+	bat_size     = vhd_bytes_padded(vhd->header.max_bat_size *
+					sizeof(uint32_t));
+	new_bat_size = vhd_bytes_padded(new_entries * sizeof(uint32_t));
+
+	map_size     = vhd_bytes_padded((vhd->header.max_bat_size + 7) >> 3);
+	new_map_size = vhd_bytes_padded((new_entries + 7) >> 3);
+
+	off = vhd->header.table_offset + new_bat_size;
+	if (vhd_check_for_clobber(vhd, off, SKIP_BAT | SKIP_BATMAP)) {
+		EPRINTF("%s: writing new bat of 0x%"PRIx64" bytes "
+			"at 0x%08"PRIx64" would clobber data\n",
+			vhd->file, new_bat_size, vhd->header.table_offset);
+		return -EINVAL;
+	}
+
+	if (vhd_has_batmap(vhd)) {
+		off = vhd->batmap.header.batmap_offset + new_map_size;
+		if (vhd_check_for_clobber(vhd, off, 0)) {
+			EPRINTF("%s: writing new batmap of 0x%"PRIx64" bytes"
+				" at 0x%08"PRIx64" would clobber data\n", vhd->file,
+				new_map_size, vhd->batmap.header.batmap_offset);
+			return -EINVAL;
+		}
+	}
+
+	/* update header */
+	vhd->header.max_bat_size = new_entries;
+	err = vhd_write_header(vhd, &vhd->header);
+	if (err)
+		return err;
+
+	/* update footer */
+	vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size;
+	vhd->footer.geometry  = vhd_chs(vhd->footer.curr_size);
+	vhd->footer.checksum  = vhd_checksum_footer(&vhd->footer);
+	err = vhd_write_footer(vhd, &vhd->footer);
+	if (err)
+		return err;
+
+	/* allocate new bat */
+	err = posix_memalign((void **)&new_bat.bat, VHD_SECTOR_SIZE, new_bat_size);
+	if (err)
+		return -err;
+
+	new_bat.spb     = vhd->bat.spb;
+	new_bat.entries = new_entries;
+	memcpy(new_bat.bat, vhd->bat.bat, bat_size);
+	for (i = vhd->bat.entries; i < new_entries; i++)
+		new_bat.bat[i] = DD_BLK_UNUSED;
+
+	/* write new bat */
+	err = vhd_write_bat(vhd, &new_bat);
+	if (err) {
+		free(new_bat.bat);
+		return err;
+	}
+
+	/* update in-memory bat */
+	free(vhd->bat.bat);
+	vhd->bat = new_bat;
+
+	if (!vhd_has_batmap(vhd))
+		return 0;
+
+	/* allocate new batmap */
+	err = posix_memalign((void **)&new_batmap.map,
+			     VHD_SECTOR_SIZE, new_map_size);
+	if (err) {
+		/* posix_memalign returns a positive errno value; negate it
+		 * to match the bat allocation above and the other returns */
+		return -err;
+	}
+
+	new_batmap.header = vhd->batmap.header;
+	new_batmap.header.batmap_size = secs_round_up_no_zero(new_map_size);
+	memcpy(new_batmap.map, vhd->batmap.map, map_size);
+	memset(new_batmap.map + map_size, 0, new_map_size - map_size);
+
+	/* write new batmap */
+	err = vhd_write_batmap(vhd, &new_batmap);
+	if (err) {
+		free(new_batmap.map);
+		return err;
+	}
+
+	/* update in-memory batmap */
+	free(vhd->batmap.map);
+	vhd->batmap = new_batmap;
+
+	return 0;
+}
+
+/*
+ * Grow a dynamic vhd by @secs sectors.
+ *
+ * Computes how many bytes the bat and batmap need for the added blocks.
+ * If the padding already present after both structures is sufficient,
+ * only the bat entries are added.  Otherwise data blocks at the front
+ * of the file are moved to the end to open a gap, the metadata that
+ * follows the bat is shifted, and then the bat entries are added.
+ *
+ * Returns 0 on success or the first error reported by a helper.
+ *
+ * NOTE(review): the local `i` is declared but never used.
+ */
+static int
+vhd_dynamic_grow(vhd_journal_t *journal, uint64_t secs)
+{
+ int i, err;
+ off_t eob, eom;
+ vhd_context_t *vhd;
+ vhd_block_t first_block;
+ uint64_t blocks, size_needed;
+ uint64_t bat_needed, bat_size, bat_avail, bat_bytes, bat_secs;
+ uint64_t map_needed, map_size, map_avail, map_bytes, map_secs;
+
+ vhd = &journal->vhd;
+
+ size_needed = 0;
+ bat_needed = 0;
+ map_needed = 0;
+
+ /* number of vhd blocks to add */
+ blocks = secs_to_blocks_up(vhd, secs);
+
+ /* size in bytes needed for new bat entries */
+ bat_needed = blocks * sizeof(uint32_t);
+ map_needed = (blocks >> 3) + 1;
+
+ /* available bytes in current bat */
+ bat_bytes = vhd->header.max_bat_size * sizeof(uint32_t);
+ bat_secs = secs_round_up_no_zero(bat_bytes);
+ bat_size = vhd_sectors_to_bytes(bat_secs);
+ bat_avail = bat_size - bat_bytes;
+
+ if (vhd_has_batmap(vhd)) {
+ /* available bytes in current batmap */
+ map_bytes = (vhd->header.max_bat_size + 7) >> 3;
+ map_secs = vhd->batmap.header.batmap_size;
+ map_size = vhd_sectors_to_bytes(map_secs);
+ map_avail = map_size - map_bytes;
+ } else {
+ /* no batmap: nothing is needed for it */
+ map_needed = 0;
+ map_avail = 0;
+ }
+
+ /* we have enough space already; just extend the bat */
+ if (bat_needed <= bat_avail && map_needed <= map_avail)
+ goto add_entries;
+
+ /* we need to add new sectors to the bat */
+ if (bat_needed > bat_avail) {
+ bat_needed -= bat_avail;
+ bat_needed = vhd_bytes_padded(bat_needed);
+ } else
+ bat_needed = 0;
+
+ /* we need to add new sectors to the batmap */
+ if (map_needed > map_avail) {
+ map_needed -= map_avail;
+ map_needed = vhd_bytes_padded(map_needed);
+ } else
+ map_needed = 0;
+
+ /* how many additional bytes do we need? */
+ size_needed = bat_needed + map_needed;
+
+ /* calculate space between end of headers and beginning of data */
+ err = vhd_end_of_headers(vhd, &eom);
+ if (err)
+ return err;
+
+ /* eob: first byte past the current (sector-padded) bat */
+ eob = vhd->header.table_offset + vhd_sectors_to_bytes(bat_secs);
+ vhd_first_data_block(vhd, &first_block);
+
+ /* no blocks allocated; just shift post-bat metadata */
+ if (!first_block.offset)
+ goto shift_metadata;
+
+ /*
+ * not enough space --
+ * move vhd data blocks to the end of the file to make room
+ *
+ * NOTE(review): this is a do/while, so the first data block is moved
+ * at least once before the gap is compared with size_needed.
+ */
+ do {
+ off_t new_off, bm_size, gap_size;
+
+ new_off = vhd_sectors_to_bytes(vhd_next_block_offset(vhd));
+
+ /* data region of segment should begin on page boundary */
+ bm_size = vhd_sectors_to_bytes(vhd->bm_secs);
+ if ((new_off + bm_size) % 4096) {
+ gap_size = 4096 - ((new_off + bm_size) % 4096);
+
+ err = vhd_write_zeros(journal, new_off, gap_size);
+ if (err)
+ return err;
+
+ new_off += gap_size;
+ }
+
+ err = vhd_move_block(journal, first_block.block, new_off);
+ if (err)
+ return err;
+
+ /* re-evaluate which block is now first in the file */
+ vhd_first_data_block(vhd, &first_block);
+
+ } while (eom + size_needed >= vhd_sectors_to_bytes(first_block.offset));
+
+ TEST_FAIL_AT(FAIL_RESIZE_DATA_MOVED);
+
+shift_metadata:
+ /* shift any metadata after the bat to make room for new bat sectors */
+ err = vhd_shift_metadata(journal, eob, bat_needed, map_needed);
+ if (err)
+ return err;
+
+ TEST_FAIL_AT(FAIL_RESIZE_METADATA_MOVED);
+
+add_entries:
+ return vhd_add_bat_entries(journal, blocks);
+}
+
+/*
+ * Resize a dynamic vhd to @size, given in MB (it is shifted by
+ * 20 - VHD_SECTOR_SHIFT to obtain sectors).
+ *
+ * Loads the header, the bat and (if present) the batmap, then delegates
+ * to vhd_dynamic_shrink() or vhd_dynamic_grow() with the sector delta.
+ * Returns 0 immediately when the size is unchanged.
+ */
+static int
+vhd_dynamic_resize(vhd_journal_t *journal, uint64_t size)
+{
+	vhd_context_t *ctx = &journal->vhd;
+	uint64_t have_secs = ctx->footer.curr_size >> VHD_SECTOR_SHIFT;
+	uint64_t want_secs = size << (20 - VHD_SECTOR_SHIFT);
+	int rc;
+
+	if (have_secs == want_secs)
+		return 0;
+
+	rc = vhd_get_header(ctx);
+	if (rc)
+		return rc;
+
+	rc = vhd_get_bat(ctx);
+	if (rc)
+		return rc;
+
+	if (vhd_has_batmap(ctx)) {
+		rc = vhd_get_batmap(ctx);
+		if (rc)
+			return rc;
+	}
+
+	if (have_secs > want_secs)
+		return vhd_dynamic_shrink(journal, have_secs - want_secs);
+
+	return vhd_dynamic_grow(journal, want_secs - have_secs);
+}
+
+/*
+ * Open @name read-only in strict mode and check that
+ * vhd_creator_tapdisk() accepts it.
+ *
+ * Returns 0 if it does, -EINVAL if it does not, or the error returned
+ * by vhd_open().  A message is printed on either failure.
+ */
+static int
+vhd_util_resize_check_creator(const char *name)
+{
+	vhd_context_t ctx;
+	int rc;
+
+	rc = vhd_open(&ctx, name, VHD_OPEN_RDONLY | VHD_OPEN_STRICT);
+	if (rc) {
+		printf("error opening %s: %d\n", name, rc);
+		return rc;
+	}
+
+	/* rc is 0 here and stays 0 unless the creator check fails */
+	if (!vhd_creator_tapdisk(&ctx)) {
+		printf("%s not created by xen; resize not supported\n", name);
+		rc = -EINVAL;
+	}
+
+	vhd_close(&ctx);
+
+	return rc;
+}
+
+/*
+ * Entry point for the resize sub-command.
+ *
+ * Options: -n <vhd name>, -j <journal file>, -s <new size in MB>,
+ * -h (usage).  All of -n, -j and -s are required.
+ *
+ * A journal is created before anything is modified.  If the resize
+ * fails the journal is reverted, otherwise it is committed; the journal
+ * is removed only when that revert/commit itself succeeded.
+ *
+ * Returns 0 on success, -EINVAL on bad usage, otherwise the resize
+ * error or, if the resize succeeded, the journal error.
+ */
+int
+vhd_util_resize(int argc, char **argv)
+{
+ char *name, *jname;
+ uint64_t size;
+ int c, err, jerr;
+ vhd_journal_t journal;
+ vhd_context_t *vhd;
+
+ /* err stays -EINVAL until -s is seen, which makes -s mandatory */
+ err = -EINVAL;
+ size = 0;
+ name = NULL;
+ jname = NULL;
+
+ optind = 0;
+ while ((c = getopt(argc, argv, "n:j:s:h")) != -1) {
+ switch (c) {
+ case 'n':
+ name = optarg;
+ break;
+ case 'j':
+ jname = optarg;
+ break;
+ case 's':
+ err = 0;
+ /* NOTE(review): no end-pointer or range check on the value */
+ size = strtoull(optarg, NULL, 10);
+ break;
+ case 'h':
+ default:
+ goto usage;
+ }
+ }
+
+ if (err || !name || !jname || argc != optind)
+ goto usage;
+
+ err = vhd_util_resize_check_creator(name);
+ if (err)
+ return err;
+
+ libvhd_set_log_level(1);
+ err = vhd_journal_create(&journal, name, jname);
+ if (err) {
+ printf("creating journal failed: %d\n", err);
+ return err;
+ }
+
+ vhd = &journal.vhd;
+
+ err = vhd_get_footer(vhd);
+ if (err)
+ goto out;
+
+ TEST_FAIL_AT(FAIL_RESIZE_BEGIN);
+
+ if (vhd_type_dynamic(vhd))
+ err = vhd_dynamic_resize(&journal, size);
+ else
+ err = vhd_fixed_resize(&journal, size);
+
+ TEST_FAIL_AT(FAIL_RESIZE_END);
+
+out:
+ /* roll back on failure, commit on success */
+ if (err) {
+ printf("resize failed: %d\n", err);
+ jerr = vhd_journal_revert(&journal);
+ } else
+ jerr = vhd_journal_commit(&journal);
+
+ /* keep the journal file on disk if revert/commit failed */
+ if (jerr) {
+ printf("closing journal failed: %d\n", jerr);
+ vhd_journal_close(&journal);
+ } else
+ vhd_journal_remove(&journal);
+
+ /* GNU `?:` extension: err if non-zero, otherwise jerr */
+ return (err ? : jerr);
+
+usage:
+ printf("options: <-n name> <-j journal> <-s size (in MB)> [-h help]\n");
+ return -EINVAL;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Altering operations:
+ *
+ * 1. Change the parent pointer to another file.
+ * 2. Change the size of the file containing the VHD image. This does NOT
+ * affect the VHD disk capacity, only the physical size of the file containing
+ * the VHD. Naturally, it is not possible to set the file size to be less than
+ * what the VHD utilizes.
+ * The operation doesn't actually change the file size, but it writes the
+ * footer in the right location such that resizing the file (manually, as a
+ * separate step) will produce the correct results. If the new file size is
+ * greater than the current file size, the file must first be expanded and then
+ * altered with this operation. If the new size is smaller than the current
+ * size, the VHD must first be altered with this operation and then the file
+ * must be shrunk. Failing to resize the file will result in a corrupted VHD.
+*/
+
+#include <errno.h>
+//#include <fcntl.h>
+#include <stdio.h>
+//#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+#include "libvhd-journal.h"
+
+/*
+ * Entry point for the revert sub-command.
+ *
+ * Options: -n <vhd name>, -j <journal file>, -h (usage); -n and -j are
+ * both required.  Opens the journal, reverts it, then removes it.
+ *
+ * Returns 0 on success, -EINVAL on bad usage, or the error of the step
+ * that failed (the journal is closed, not removed, in that case).
+ */
+int
+vhd_util_revert(int argc, char **argv)
+{
+	vhd_journal_t journal;
+	char *vhd_path = NULL;
+	char *journal_path = NULL;
+	int opt, rc;
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "n:j:h")) != -1) {
+		if (opt == 'n')
+			vhd_path = optarg;
+		else if (opt == 'j')
+			journal_path = optarg;
+		else
+			goto usage;	/* 'h' and anything unrecognised */
+	}
+
+	if (!vhd_path || !journal_path || argc != optind)
+		goto usage;
+
+	libvhd_set_log_level(1);
+	rc = vhd_journal_open(&journal, vhd_path, journal_path);
+	if (rc) {
+		printf("opening journal failed: %d\n", rc);
+		return rc;
+	}
+
+	rc = vhd_journal_revert(&journal);
+	if (rc) {
+		printf("reverting journal failed: %d\n", rc);
+		goto close_journal;
+	}
+
+	rc = vhd_journal_remove(&journal);
+	if (rc) {
+		printf("removing journal failed: %d\n", rc);
+		goto close_journal;
+	}
+
+	return 0;
+
+close_journal:
+	vhd_journal_close(&journal);
+	return rc;
+
+usage:
+	printf("options: <-n name> <-j journal> [-h help]\n");
+	return -EINVAL;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <glob.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <libgen.h> /* for basename() */
+#include <sys/stat.h>
+
+#include "list.h"
+#include "libvhd.h"
+#include "lvm-util.h"
+
+/*
+ * Bits of the file-scope `flags` word.
+ * FAST: open with VHD_OPEN_FAST and decode the parent from the header.
+ * PRETTY: collect images and print them as a parent/child tree.
+ * VERBOSE: print full names instead of basename()s.
+ * VOLUME, NOFAIL, PARENTS: presumably select volume scanning, error
+ * tolerance and parent traversal -- TODO confirm against their users.
+ */
+#define VHD_SCAN_FAST 0x01
+#define VHD_SCAN_PRETTY 0x02
+#define VHD_SCAN_VOLUME 0x04
+#define VHD_SCAN_NOFAIL 0x08
+#define VHD_SCAN_VERBOSE 0x10
+#define VHD_SCAN_PARENTS 0x20
+
+/* values of struct target.type; see target_volume() and target_vhd() */
+#define VHD_TYPE_RAW_FILE 0x01
+#define VHD_TYPE_VHD_FILE 0x02
+#define VHD_TYPE_RAW_VOLUME 0x04
+#define VHD_TYPE_VHD_VOLUME 0x08
+
+/* Return 1 if @type is one of the two volume types, 0 otherwise. */
+static inline int
+target_volume(uint8_t type)
+{
+	switch (type) {
+	case VHD_TYPE_RAW_VOLUME:
+	case VHD_TYPE_VHD_VOLUME:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Return 1 if @type is one of the two vhd types, 0 otherwise. */
+static inline int
+target_vhd(uint8_t type)
+{
+	switch (type) {
+	case VHD_TYPE_VHD_FILE:
+	case VHD_TYPE_VHD_VOLUME:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* One object to be scanned (a file or a volume). */
+struct target {
+ /* presumably the name reported for the target -- TODO confirm */
+ char name[VHD_MAX_NAME_LEN];
+ /* presumably the device/path actually opened -- TODO confirm */
+ char device[VHD_MAX_NAME_LEN];
+ /* copied into vhd_image.size by vhd_util_scan_get_size() */
+ uint64_t size;
+ /* byte offset of the target's headers; added to locator offsets */
+ uint64_t start;
+ /* presumably the end offset of the target -- TODO confirm */
+ uint64_t end;
+ /* one of the VHD_TYPE_* values */
+ uint8_t type;
+};
+
+/*
+ * Cursor over an array of targets.
+ * NOTE(review): no user of this struct is visible in this part of the
+ * file; the field meanings below are inferred from the names only.
+ */
+struct iterator {
+ /* presumably the index of the next target to visit -- TODO confirm */
+ int cur;
+ /* presumably the number of valid entries in targets -- TODO confirm */
+ int cur_size;
+ /* presumably the allocated capacity of targets -- TODO confirm */
+ int max_size;
+ struct target *targets;
+};
+
+/* Scan result for one image, and its node in the pretty-print tree. */
+struct vhd_image {
+ /* image name; strdup()ed for entries held in the pretty list */
+ char *name;
+ /* parent name, or NULL when the image has none */
+ char *parent;
+ /* footer curr_size for vhds, otherwise equal to size */
+ uint64_t capacity;
+ /* taken from target->size */
+ off_t size;
+ /* non-zero if hidden; always set to 1 for non-vhd targets */
+ uint8_t hidden;
+ /* non-zero if scanning failed; printed as scan-error */
+ int error;
+ /* text describing the failed step (string literals in this file) */
+ char *message;
+
+ /* the scan target this image was read from */
+ struct target *target;
+
+ /* link in the parent's children list */
+ struct list_head sibling;
+ /* list of images whose parent_image is this image */
+ struct list_head children;
+ /* resolved parent, or NULL if none or not found in the scan */
+ struct vhd_image *parent_image;
+};
+
+/* Storage for the images collected while VHD_SCAN_PRETTY is set. */
+struct vhd_scan {
+ /* number of entries of images[] in use */
+ int cur;
+ /* number of entries of images[] that point at allocated slots */
+ int size;
+
+ /* number of slabs stored in lists[] */
+ int lists_cur;
+ /* allocated length of lists[] */
+ int lists_size;
+
+ /* one pointer per image slot; sorted by name before printing */
+ struct vhd_image **images;
+ /* the calloc()ed slabs that images[] entries point into */
+ struct vhd_image **lists;
+};
+
+/* VHD_SCAN_* option bits in effect for the current scan */
+static int flags;
+/* NOTE(review): not referenced in the visible part of the file */
+static struct vg vg;
+/* images collected for pretty printing */
+static struct vhd_scan scan;
+
+/*
+ * Reset the global scan state and allocate room for @cnt images:
+ * a 10-entry slab table, a first slab of @cnt images, and @cnt slot
+ * pointers referring to that slab.
+ *
+ * Returns 0 on success; on allocation failure everything is freed, the
+ * scan state is left zeroed and -ENOMEM is returned.
+ */
+static int
+vhd_util_scan_pretty_allocate_list(int cnt)
+{
+	const int slab_slots = 10;
+	struct vhd_image **slabs, **slots, *first;
+	int i;
+
+	slabs = NULL;
+	slots = NULL;
+	first = NULL;
+
+	memset(&scan, 0, sizeof(scan));
+
+	slabs = calloc(slab_slots, sizeof(struct vhd_image *));
+	if (!slabs)
+		goto fail;
+
+	first = calloc(cnt, sizeof(struct vhd_image));
+	if (!first)
+		goto fail;
+
+	slots = calloc(cnt, sizeof(struct vhd_image *));
+	if (!slots)
+		goto fail;
+
+	for (i = 0; i < cnt; i++)
+		slots[i] = &first[i];
+
+	/* publish the fully built state in one go */
+	slabs[0]        = first;
+	scan.lists      = slabs;
+	scan.lists_cur  = 1;
+	scan.lists_size = slab_slots;
+	scan.images     = slots;
+	scan.cur        = 0;
+	scan.size       = cnt;
+
+	return 0;
+
+fail:
+	free(slots);
+	free(first);
+	free(slabs);
+	memset(&scan, 0, sizeof(scan));
+	return -ENOMEM;
+}
+
+/*
+ * Free every slab, the slab table and the slot array, then zero the
+ * global scan state.  The name/parent strings of the images are not
+ * freed here.
+ */
+static void
+vhd_util_scan_pretty_free_list(void)
+{
+	if (scan.lists) {
+		int slab = 0;
+
+		while (slab < scan.lists_cur) {
+			free(scan.lists[slab]);
+			slab++;
+		}
+
+		free(scan.lists);
+	}
+
+	free(scan.images);
+	memset(&scan, 0, sizeof(scan));
+}
+
+/*
+ * Append a copy of @image to the pretty-print list.
+ *
+ * An image whose name is already in the list is silently ignored.  When
+ * the list is full a new slab is allocated and the slot array is grown.
+ * The name and parent strings are duplicated; `message` is copied by
+ * pointer.
+ *
+ * Returns 0 on success (or duplicate), -ENOMEM if an allocation fails.
+ * The global scan state stays consistent on every failure path.
+ */
+static int
+vhd_util_scan_pretty_add_image(struct vhd_image *image)
+{
+	int i;
+	struct vhd_image *img;
+
+	for (i = 0; i < scan.cur; i++) {
+		img = scan.images[i];
+		if (!strcmp(img->name, image->name))
+			return 0;
+	}
+
+	if (scan.cur >= scan.size) {
+		int new_size;
+		struct vhd_image *new, **list;
+
+		/* make sure the slab table has room for one more slab */
+		if (scan.lists_cur >= scan.lists_size) {
+			list = realloc(scan.lists, scan.lists_size * 2 *
+				       sizeof(struct vhd_image *));
+			if (!list)
+				return -ENOMEM;
+
+			scan.lists_size *= 2;
+			scan.lists       = list;
+		}
+
+		/*
+		 * Grow the slot array before committing the new size.  If
+		 * scan.size were doubled first and this realloc failed, the
+		 * next call would see scan.cur < scan.size and index
+		 * scan.images[] past its real end.
+		 */
+		new_size = scan.size * 2;
+		list = realloc(scan.images, new_size *
+			       sizeof(struct vhd_image *));
+		if (!list)
+			return -ENOMEM;
+
+		scan.images = list;
+
+		/* the new slab holds as many images as existed before */
+		new = calloc(scan.size, sizeof(struct vhd_image));
+		if (!new)
+			return -ENOMEM;
+
+		scan.lists[scan.lists_cur++] = new;
+		scan.size = new_size;
+
+		for (i = 0; i + scan.cur < scan.size; i++)
+			scan.images[i + scan.cur] = new + i;
+	}
+
+	img = scan.images[scan.cur];
+	INIT_LIST_HEAD(&img->sibling);
+	INIT_LIST_HEAD(&img->children);
+
+	img->capacity = image->capacity;
+	img->size     = image->size;
+	img->hidden   = image->hidden;
+	img->error    = image->error;
+	img->message  = image->message;
+
+	img->name = strdup(image->name);
+	if (!img->name)
+		goto fail;
+
+	if (image->parent) {
+		img->parent = strdup(image->parent);
+		if (!img->parent)
+			goto fail;
+	}
+
+	scan.cur++;
+	return 0;
+
+fail:
+	/* the slot is not consumed: scan.cur was not advanced */
+	free(img->name);
+	free(img->parent);
+	memset(img, 0, sizeof(*img));
+	return -ENOMEM;
+}
+
+/*
+ * qsort()/bsearch() comparator for arrays of `struct vhd_image *`:
+ * orders the pointed-to images by name using strcmp().
+ */
+static int
+vhd_util_scan_pretty_image_compare(const void *lhs, const void *rhs)
+{
+	const struct vhd_image *const *left  = lhs;
+	const struct vhd_image *const *right = rhs;
+
+	return strcmp((*left)->name, (*right)->name);
+}
+
+/*
+ * Print one line describing @image, indented by @tab columns.
+ *
+ * Images with a non-zero `error` are printed with their full name, the
+ * error code and the message.  Others are printed with capacity, size,
+ * hidden flag and parent; name and parent are reduced to their
+ * basename() unless VHD_SCAN_VERBOSE is set.  In pretty mode a parent
+ * that was not resolved is marked " (not found in scan)".
+ *
+ * NOTE(review): POSIX allows basename() to modify its argument --
+ * confirm which basename() this file gets from <libgen.h>.
+ */
+static void
+vhd_util_scan_print_image_indent(struct vhd_image *image, int tab)
+{
+	char *pad, *name, *pmsg, *parent;
+
+	pad    = (tab ? " " : "");
+	name   = image->name;
+	parent = (image->parent ? image->parent : "none");
+
+	if ((flags & VHD_SCAN_PRETTY) && image->parent && !image->parent_image)
+		pmsg = " (not found in scan)";
+	else
+		pmsg = "";
+
+	if (!(flags & VHD_SCAN_VERBOSE)) {
+		name = basename(image->name);
+		if (image->parent)
+			parent = basename(image->parent);
+	}
+
+	if (image->error)
+		printf("%*svhd=%s scan-error=%d error-message='%s'\n",
+		       tab, pad, image->name, image->error, image->message);
+	else
+		/* size is an off_t and hidden a uint8_t: cast both so the
+		 * arguments match the PRIu64 and %u conversions exactly */
+		printf("%*svhd=%s capacity=%"PRIu64" size=%"PRIu64" hidden=%u "
+		       "parent=%s%s\n", tab, pad, name, image->capacity,
+		       (uint64_t)image->size, (unsigned int)image->hidden,
+		       parent, pmsg);
+}
+
+/*
+ * Print @image and, recursively, its children, indenting three columns
+ * per level.  Children that are not hidden are printed before hidden
+ * ones.
+ *
+ * Side effect: the image's name and parent strings are freed and set to
+ * NULL once it has been printed, so a NULL name marks an image as done.
+ */
+static void
+vhd_util_scan_pretty_print_tree(struct vhd_image *image, int depth)
+{
+	struct vhd_image *child, *next;
+	const int child_depth = depth + 1;
+
+	vhd_util_scan_print_image_indent(image, depth * 3);
+
+	/* first pass: visible children */
+	list_for_each_entry_safe(child, next, &image->children, sibling) {
+		if (child->hidden)
+			continue;
+		vhd_util_scan_pretty_print_tree(child, child_depth);
+	}
+
+	/* second pass: hidden children */
+	list_for_each_entry_safe(child, next, &image->children, sibling) {
+		if (!child->hidden)
+			continue;
+		vhd_util_scan_pretty_print_tree(child, child_depth);
+	}
+
+	free(image->name);
+	image->name = NULL;
+
+	free(image->parent);
+	image->parent = NULL;
+}
+
+/*
+ * Print every collected image as a forest of parent/child trees.
+ *
+ * The slot array is sorted by name, each image is linked to its parent
+ * by binary search on the parent name, and the trees are then printed
+ * in three passes.  vhd_util_scan_pretty_print_tree() NULLs the name of
+ * every image it prints, which is how the later passes skip images
+ * that have already been printed.
+ */
+static void
+vhd_util_scan_pretty_print_images(void)
+{
+ int i;
+ struct vhd_image *image, **parentp, *parent, *keyp, key;
+
+ qsort(scan.images, scan.cur, sizeof(scan.images[0]),
+ vhd_util_scan_pretty_image_compare);
+
+ /* resolve parent_image and build the children lists */
+ for (i = 0; i < scan.cur; i++) {
+ image = scan.images[i];
+
+ if (!image->parent) {
+ image->parent_image = NULL;
+ continue;
+ }
+
+ /* the comparator takes pointers to image pointers, so search
+ * with the address of a pointer to a key holding the name */
+ memset(&key, 0, sizeof(key));
+ key.name = image->parent;
+ keyp = &key;
+
+ parentp = bsearch(&keyp, scan.images, scan.cur,
+ sizeof(scan.images[0]),
+ vhd_util_scan_pretty_image_compare);
+ if (!parentp) {
+ image->parent_image = NULL;
+ continue;
+ }
+
+ parent = *parentp;
+ image->parent_image = parent;
+ list_add_tail(&image->sibling, &parent->children);
+ }
+
+ /* pass 1: trees rooted at hidden images without a resolved parent */
+ for (i = 0; i < scan.cur; i++) {
+ image = scan.images[i];
+
+ if (image->parent_image || !image->hidden)
+ continue;
+
+ vhd_util_scan_pretty_print_tree(image, 0);
+ }
+
+ /* pass 2: remaining unprinted images without a resolved parent */
+ for (i = 0; i < scan.cur; i++) {
+ image = scan.images[i];
+
+ if (!image->name || image->parent_image)
+ continue;
+
+ vhd_util_scan_pretty_print_tree(image, 0);
+ }
+
+ /* pass 3: anything still unprinted */
+ for (i = 0; i < scan.cur; i++) {
+ image = scan.images[i];
+
+ if (!image->name)
+ continue;
+
+ vhd_util_scan_pretty_print_tree(image, 0);
+ }
+}
+
+/*
+ * Report one scanned image.
+ *
+ * In pretty mode an error-free image is queued for the tree printout
+ * instead of being printed.  If queueing fails, the failure is recorded
+ * in the image and it is printed immediately like any other image.
+ */
+static void
+vhd_util_scan_print_image(struct vhd_image *image)
+{
+	int queue_err;
+
+	if ((flags & VHD_SCAN_PRETTY) && !image->error) {
+		queue_err = vhd_util_scan_pretty_add_image(image);
+		if (queue_err == 0)
+			return;
+
+		/* keep the original's guard even though image->error was
+		 * already seen to be zero when this branch was entered */
+		if (image->error == 0) {
+			image->error   = queue_err;
+			image->message = "allocating memory";
+		}
+	}
+
+	vhd_util_scan_print_image_indent(image, 0);
+}
+
+/*
+ * Report a scan failure for @file with error code @err and return @err.
+ *
+ * The check that would return 0 under VHD_SCAN_NOFAIL is disabled in
+ * this function, so the error is always passed back to the caller.
+ */
+static int
+vhd_util_scan_error(const char *file, int err)
+{
+	struct vhd_image failed;
+
+	memset(&failed, 0, sizeof(failed));
+	failed.message = "failure scanning target";
+	failed.error   = err;
+	failed.name    = (char *)file;
+
+	vhd_util_scan_print_image(&failed);
+
+	/*
+	if (flags & VHD_SCAN_NOFAIL)
+		return 0;
+	*/
+
+	return err;
+}
+
+/*
+ * Choose a parent locator from the first 8 header locator slots.
+ *
+ * A PLAT_CODE_MACX locator is returned as soon as it is seen.
+ * Otherwise the last PLAT_CODE_W2RU locator is preferred, falling back
+ * to the first locator whose code is not PLAT_CODE_NONE.  Returns NULL
+ * if every slot is empty.
+ */
+static vhd_parent_locator_t *
+vhd_util_scan_get_parent_locator(vhd_context_t *vhd)
+{
+	vhd_parent_locator_t *cur, *best;
+
+	best = NULL;
+
+	for (cur = vhd->header.loc; cur < vhd->header.loc + 8; cur++) {
+		if (cur->code == PLAT_CODE_MACX)
+			return cur;
+
+		if (cur->code == PLAT_CODE_W2RU ||
+		    (!best && cur->code != PLAT_CODE_NONE))
+			best = cur;
+	}
+
+	return best;
+}
+
+/*
+ * Copy @src into @dst, writing at most VHD_MAX_NAME_LEN bytes including
+ * the terminator.  Returns 0 if the whole string fit, -ENAMETOOLONG if
+ * it was truncated.
+ */
+static inline int
+copy_name(char *dst, const char *src)
+{
+	int written;
+
+	written = snprintf(dst, VHD_MAX_NAME_LEN, "%s", src);
+
+	return (written < VHD_MAX_NAME_LEN) ? 0 : -ENAMETOOLONG;
+}
+
+/*
+ * LVHD stores realpath(parent) in parent locators, so
+ * /dev/<vol-group>/<lv-name> becomes /dev/mapper/<vol--group>-<lv--name>
+ *
+ * Recover the logical volume name from @src: take the last path
+ * component (including its leading '/'), turn single dashes into
+ * slashes and double dashes into single dashes, and copy whatever
+ * follows the last slash of the result into @dst.
+ *
+ * @dst is assumed to provide at least VHD_MAX_NAME_LEN bytes, as the
+ * caller's buffer does.
+ *
+ * Returns 0 on success.  If the converted name contains no slash,
+ * returns -EINVAL; if it does not fit in VHD_MAX_NAME_LEN bytes,
+ * returns -ENAMETOOLONG.  In both error cases @dst receives a (possibly
+ * truncated) copy of @src.
+ */
+static int
+vhd_util_scan_extract_volume_name(char *dst, const char *src)
+{
+	char copy[VHD_MAX_NAME_LEN], *c;
+	const char *name, *s;
+
+	name = strrchr(src, '/');
+	if (!name)
+		name = src;
+
+	/* convert single dashes to slashes, double dashes to single dashes */
+	for (c = copy, s = name; *s != '\0'; s++, c++) {
+		/* always leave room for the terminator written below */
+		if ((size_t)(c - copy) >= sizeof(copy) - 1) {
+			snprintf(dst, VHD_MAX_NAME_LEN, "%s", src);
+			return -ENAMETOOLONG;
+		}
+
+		if (*s == '-') {
+			if (s[1] != '-')
+				*c = '/';
+			else {
+				s++;
+				*c = '-';
+			}
+		} else
+			*c = *s;
+	}
+
+	*c = '\0';
+
+	/*
+	 * The original tested `c == name`, which compares a pointer into
+	 * `copy` with a pointer into `src` and so could never be true.  The
+	 * case that does need handling is a name without any slash, where
+	 * strrchr() returns NULL and the strcpy() below would dereference
+	 * NULL + 1.
+	 */
+	c = strrchr(copy, '/');
+	if (!c) {
+		/* unrecognized format */
+		snprintf(dst, VHD_MAX_NAME_LEN, "%s", src);
+		return -EINVAL;
+	}
+
+	/* fits: the source is a suffix of copy[VHD_MAX_NAME_LEN] */
+	strcpy(dst, c + 1);
+	return 0;
+}
+
+/*
+ * Determine the parent name of a vhd stored on a volume and store it in
+ * image->parent.
+ *
+ * With VHD_SCAN_FAST the parent is decoded from the header when
+ * possible.  Otherwise, or if that fails, a parent locator is read; its
+ * data offset is shifted by the target's start offset first, using a
+ * local copy so the header is not modified.  The resulting path is then
+ * reduced to a volume name when it has the expected format; if not, the
+ * string is left as read.
+ *
+ * Returns 0 on success, -EINVAL if no locator is available, the error
+ * from reading the locator, or -ENAMETOOLONG from copy_name().
+ */
+static int
+vhd_util_scan_get_volume_parent(vhd_context_t *vhd, struct vhd_image *image)
+{
+	int err;
+	char name[VHD_MAX_NAME_LEN];
+	vhd_parent_locator_t *loc, copy;
+
+	if (flags & VHD_SCAN_FAST) {
+		err = vhd_header_decode_parent(vhd,
+					       &vhd->header, &image->parent);
+		if (!err)
+			goto found;
+	}
+
+	loc = vhd_util_scan_get_parent_locator(vhd);
+	if (!loc)
+		return -EINVAL;
+
+	copy = *loc;
+	copy.data_offset += image->target->start;
+	/* the source had a mangled "(c)" character here in place of the
+	 * address-of expression; the locator copy is what must be passed */
+	err = vhd_parent_locator_read(vhd, &copy, &image->parent);
+	if (err)
+		return err;
+
+found:
+	err = vhd_util_scan_extract_volume_name(name, image->parent);
+	if (!err)
+		return copy_name(image->parent, name);
+
+	return 0;
+}
+
+/*
+ * Fill in image->parent.
+ *
+ * Non-vhd targets have no parent (NULL, success).  Volume targets are
+ * delegated to vhd_util_scan_get_volume_parent().  For vhd files the
+ * parent is decoded from the header in fast mode, or looked up with
+ * vhd_parent_locator_get() otherwise; if that fails, the raw string of
+ * a parent locator is read instead.
+ *
+ * Returns 0 on success, -EINVAL if no locator exists, or the error from
+ * reading the locator.
+ */
+static int
+vhd_util_scan_get_parent(vhd_context_t *vhd, struct vhd_image *image)
+{
+	vhd_parent_locator_t *fallback;
+	int rc;
+
+	if (!target_vhd(image->target->type)) {
+		image->parent = NULL;
+		return 0;
+	}
+
+	if (target_volume(image->target->type))
+		return vhd_util_scan_get_volume_parent(vhd, image);
+
+	/*
+	 * vhd_parent_locator_get checks for the existence of the
+	 * parent file. if this call succeeds, all is well; if not,
+	 * we'll try to return whatever string we have before failing
+	 * outright.
+	 */
+	rc = (flags & VHD_SCAN_FAST)
+		? vhd_header_decode_parent(vhd, &vhd->header, &image->parent)
+		: vhd_parent_locator_get(vhd, &image->parent);
+	if (!rc)
+		return 0;
+
+	fallback = vhd_util_scan_get_parent_locator(vhd);
+	if (!fallback)
+		return -EINVAL;
+
+	return vhd_parent_locator_read(vhd, fallback, &image->parent);
+}
+
+/*
+ * Record the 'hidden' state of the image.  VHD targets are queried through
+ * vhd_hidden(); every other target type is reported as hidden (1).
+ * Returns 0 on success or the error from vhd_hidden(), in which case
+ * image->hidden is left untouched.
+ */
+static int
+vhd_util_scan_get_hidden(vhd_context_t *vhd, struct vhd_image *image)
+{
+	int rc, value;
+
+	if (!target_vhd(image->target->type)) {
+		image->hidden = 1;
+		return 0;
+	}
+
+	value = 0;
+	rc    = vhd_hidden(vhd, &value);
+	if (rc)
+		return rc;
+
+	image->hidden = value;
+	return 0;
+}
+
+/*
+ * Fill in the size fields of the image: 'size' is copied from the target,
+ * 'capacity' is the footer's curr_size for VHD targets and equal to 'size'
+ * for everything else.  Always returns 0.
+ */
+static int
+vhd_util_scan_get_size(vhd_context_t *vhd, struct vhd_image *image)
+{
+	image->size     = image->target->size;
+	image->capacity = image->size;
+
+	if (target_vhd(image->target->type))
+		image->capacity = vhd->footer.curr_size;
+
+	return 0;
+}
+
+/*
+ * Open a file-backed image read-only (ignoring the 'disabled' state, and
+ * in fast mode when VHD_SCAN_FAST is set).  Non-VHD targets are not opened.
+ * On failure vhd->file is cleared, the error is recorded in the image and
+ * returned; otherwise 0 is returned.
+ */
+static int
+vhd_util_scan_open_file(vhd_context_t *vhd, struct vhd_image *image)
+{
+	int rc, oflags;
+
+	if (!target_vhd(image->target->type))
+		return 0;
+
+	oflags = VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED;
+	if (flags & VHD_SCAN_FAST)
+		oflags |= VHD_OPEN_FAST;
+
+	rc = vhd_open(vhd, image->name, oflags);
+	if (!rc)
+		return 0;
+
+	vhd->file      = NULL;
+	image->message = "opening file";
+	image->error   = rc;
+	return image->error;
+}
+
+/*
+ * Read and validate the VHD footer (and, for dynamic images, the header)
+ * of an image stored on a volume.
+ *
+ * Footer and header are fetched with a single aligned read starting at
+ * target->start.  Every failure records a message and an error code in
+ * the image; the function returns image->error, which the visible code
+ * only assigns on failure paths.
+ */
+static int
+vhd_util_scan_read_volume_headers(vhd_context_t *vhd, struct vhd_image *image)
+{
+ int err;
+ char *buf;
+ size_t size;
+ struct target *target;
+
+ buf = NULL;
+ target = image->target;
+ size = sizeof(vhd_footer_t) + sizeof(vhd_header_t);
+
+ /* posix_memalign returns a positive errno value, hence the negation */
+ err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+ if (err) {
+ buf = NULL;
+ image->message = "allocating image";
+ image->error = -err;
+ goto out;
+ }
+
+ err = vhd_seek(vhd, target->start, SEEK_SET);
+ if (err) {
+ image->message = "seeking to headers";
+ image->error = err;
+ goto out;
+ }
+
+ err = vhd_read(vhd, buf, size);
+ if (err) {
+ image->message = "reading headers";
+ image->error = err;
+ goto out;
+ }
+
+ /* the footer copy sits at the very start of the buffer */
+ memcpy(&vhd->footer, buf, sizeof(vhd_footer_t));
+ vhd_footer_in(&vhd->footer);
+ err = vhd_validate_footer(&vhd->footer);
+ if (err) {
+ image->message = "invalid footer";
+ image->error = err;
+ goto out;
+ }
+
+ /* lvhd vhds should always be dynamic */
+ if (vhd_type_dynamic(vhd)) {
+ /*
+ * if the header does not directly follow the footer it is read
+ * from its own offset (relative to the start of the volume);
+ * otherwise it is already in the buffer
+ */
+ if (vhd->footer.data_offset != sizeof(vhd_footer_t))
+ err = vhd_read_header_at(vhd, &vhd->header,
+ vhd->footer.data_offset +
+ target->start);
+ else {
+ memcpy(&vhd->header,
+ buf + sizeof(vhd_footer_t),
+ sizeof(vhd_header_t));
+ vhd_header_in(&vhd->header);
+ err = vhd_validate_header(&vhd->header);
+ }
+
+ if (err) {
+ image->message = "reading header";
+ image->error = err;
+ goto out;
+ }
+
+ /* derive sectors-per-block and bitmap size from the block size */
+ vhd->spb = vhd->header.block_size >> VHD_SECTOR_SHIFT;
+ vhd->bm_secs = secs_round_up_no_zero(vhd->spb >> 3);
+ }
+
+out:
+ free(buf);
+ return image->error;
+}
+
+/*
+ * Prepare a vhd context for an image stored on a volume: reset the
+ * context, reject extents smaller than 4096, duplicate the image name,
+ * open the backing device read-only with O_DIRECT and, for VHD targets,
+ * read the on-disk headers.
+ *
+ * Failures are recorded in image->message / image->error and returned.
+ */
+static int
+vhd_util_scan_open_volume(vhd_context_t *vhd, struct vhd_image *image)
+{
+	struct target *tgt = image->target;
+
+	memset(vhd, 0, sizeof(*vhd));
+	vhd->oflags = VHD_OPEN_RDONLY | VHD_OPEN_FAST;
+
+	if (tgt->end - tgt->start < 4096) {
+		image->message = "device too small";
+		image->error   = -EINVAL;
+		return image->error;
+	}
+
+	vhd->file = strdup(image->name);
+	if (vhd->file == NULL) {
+		image->message = "allocating device";
+		image->error   = -ENOMEM;
+		return image->error;
+	}
+
+	vhd->fd = open(tgt->device, O_RDONLY | O_DIRECT | O_LARGEFILE);
+	if (vhd->fd == -1) {
+		free(vhd->file);
+		vhd->file = NULL;
+
+		image->message = "opening device";
+		image->error   = -errno;
+		return image->error;
+	}
+
+	if (!target_vhd(tgt->type))
+		return 0;
+
+	return vhd_util_scan_read_volume_headers(vhd, image);
+}
+
+/*
+ * Choose the display name of the image and open it.
+ *
+ * File targets scanned in pretty mode are shown under their realpath()
+ * (a heap string); in every other case the target's own name is used.
+ * The actual open is dispatched on the target kind (volume vs. file).
+ */
+static int
+vhd_util_scan_open(vhd_context_t *vhd, struct vhd_image *image)
+{
+	struct target *tgt = image->target;
+	int on_volume      = target_volume(tgt->type);
+
+	if (!on_volume && (flags & VHD_SCAN_PRETTY)) {
+		image->name = realpath(tgt->name, NULL);
+		if (image->name == NULL) {
+			image->name    = tgt->name;
+			image->message = "resolving name";
+			image->error   = -errno;
+			return image->error;
+		}
+	} else {
+		image->name = tgt->name;
+	}
+
+	if (on_volume)
+		return vhd_util_scan_open_volume(vhd, image);
+
+	return vhd_util_scan_open_file(vhd, image);
+}
+
+/*
+ * Initialise a scan target describing a regular file.  The file must be
+ * stat()-able; its path is used both as name and as device, and the
+ * extent covers the whole file (start 0, size/end = st_size).
+ * Returns 0, -errno from stat(), or the error from copy_name().
+ */
+static int
+vhd_util_scan_init_file_target(struct target *target,
+			       const char *file, uint8_t type)
+{
+	struct stat st;
+	int rc;
+
+	if (stat(file, &st) == -1)
+		return -errno;
+
+	rc = copy_name(target->name, file);
+	if (rc)
+		return rc;
+
+	rc = copy_name(target->device, file);
+	if (rc)
+		return rc;
+
+	target->type  = type;
+	target->start = 0;
+	target->size  = st.st_size;
+	target->end   = st.st_size;
+
+	return 0;
+}
+
+/*
+ * Initialise a scan target describing a logical volume.  Only volumes
+ * whose first segment is linear are accepted (-ENOSYS otherwise).  The
+ * extent is taken from that first segment: start = pe_start,
+ * end = start + pe_size.
+ */
+static int
+vhd_util_scan_init_volume_target(struct target *target,
+				 struct lv *lv, uint8_t type)
+{
+	int rc;
+
+	if (lv->first_segment.type != LVM_SEG_TYPE_LINEAR)
+		return -ENOSYS;
+
+	rc = copy_name(target->name, lv->name);
+	if (rc)
+		return rc;
+
+	rc = copy_name(target->device, lv->first_segment.device);
+	if (rc)
+		return rc;
+
+	target->type  = type;
+	target->size  = lv->size;
+	target->start = lv->first_segment.pe_start;
+	target->end   = target->start + lv->first_segment.pe_size;
+
+	return 0;
+}
+
+/*
+ * Set up an iterator over a private copy of @cnt targets.  The iterator
+ * starts at the first element with size and capacity both equal to @cnt.
+ * Returns 0 or -ENOMEM.
+ */
+static int
+iterator_init(struct iterator *itr, int cnt, struct target *targets)
+{
+	struct target *dup;
+
+	memset(itr, 0, sizeof(*itr));
+
+	dup = malloc(sizeof(struct target) * cnt);
+	itr->targets = dup;
+	if (dup == NULL)
+		return -ENOMEM;
+
+	memcpy(dup, targets, sizeof(struct target) * cnt);
+
+	itr->cur      = 0;
+	itr->cur_size = cnt;
+	itr->max_size = cnt;
+
+	return 0;
+}
+
+/*
+ * Return the next target and advance the cursor, or NULL once every
+ * target currently in the iterator has been handed out.
+ */
+static struct target *
+iterator_next(struct iterator *itr)
+{
+	struct target *next;
+
+	if (itr->cur == itr->cur_size)
+		return NULL;
+
+	next = &itr->targets[itr->cur];
+	itr->cur++;
+	return next;
+}
+
+/*
+ * Initialise @target as a file target for @parent unless a target with
+ * the same basename is already queued, in which case -EEXIST is returned.
+ */
+static int
+iterator_add_file(struct iterator *itr,
+		  struct target *target, const char *parent, uint8_t type)
+{
+	int idx = 0;
+
+	while (idx < itr->cur_size) {
+		struct target *queued = &itr->targets[idx];
+		char *have = basename((char *)queued->name);
+		char *want = basename((char *)parent);
+
+		if (strcmp(have, want) == 0)
+			return -EEXIST;
+
+		idx++;
+	}
+
+	return vhd_util_scan_init_file_target(target, parent, type);
+}
+
+/*
+ * Initialise @target as a volume target for @parent.
+ *
+ * Returns -EEXIST if a target of that name is already queued, -ENOENT if
+ * no logical volume in the scanned volume group matches @parent, the
+ * fnmatch() error if matching itself failed, or the result of
+ * vhd_util_scan_init_volume_target().
+ */
+static int
+iterator_add_volume(struct iterator *itr,
+		    struct target *target, const char *parent, uint8_t type)
+{
+	int i, err;
+	struct lv *lv;
+
+	lv  = NULL;
+	err = -ENOENT;
+
+	for (i = 0; i < itr->cur_size; i++)
+		if (!strcmp(parent, itr->targets[i].name))
+			return -EEXIST;
+
+	/* stop at the first volume that either matches or fails to match
+	 * with a real error */
+	for (i = 0; i < vg.lv_cnt; i++) {
+		err = fnmatch(parent, vg.lvs[i].name, FNM_PATHNAME);
+		if (err != FNM_NOMATCH) {
+			lv = vg.lvs + i;
+			break;
+		}
+	}
+
+	/*
+	 * "no match" is not an error here.  The result must be compared
+	 * with the FNM_NOMATCH return code; the previous comparison with
+	 * the FNM_PATHNAME flag only worked where both happen to share
+	 * the same numeric value.
+	 */
+	if (err && err != FNM_NOMATCH)
+		return err;
+
+	if (!lv)
+		return -ENOENT;
+
+	return vhd_util_scan_init_volume_target(target, lv, type);
+}
+
+/*
+ * Append a target for @parent to the iterator, doubling the backing array
+ * first when it is full.  An already-queued parent (-EEXIST from the
+ * helpers) is not treated as an error.  On any failure the unused slot is
+ * zeroed and the iterator size is left unchanged.
+ */
+static int
+iterator_add(struct iterator *itr, const char *parent, uint8_t type)
+{
+	struct target *slot;
+	int rc;
+
+	if (itr->cur_size == itr->max_size) {
+		struct target *grown;
+
+		grown = realloc(itr->targets,
+				sizeof(struct target) *
+				itr->max_size * 2);
+		if (grown == NULL)
+			return -ENOMEM;
+
+		itr->targets   = grown;
+		itr->max_size *= 2;
+	}
+
+	slot = &itr->targets[itr->cur_size];
+
+	rc = target_volume(type)
+		? iterator_add_volume(itr, slot, parent, type)
+		: iterator_add_file(itr, slot, parent, type);
+
+	if (rc == 0) {
+		itr->cur_size++;
+		return 0;
+	}
+
+	memset(slot, 0, sizeof(*slot));
+	return (rc == -EEXIST) ? 0 : rc;
+}
+
+/*
+ * Release the iterator's target array and reset every field to zero.
+ */
+static void
+iterator_free(struct iterator *itr)
+{
+	struct target *owned = itr->targets;
+
+	free(owned);
+	memset(itr, 0, sizeof(*itr));
+}
+
+/*
+ * Queue the parent of @image for scanning.  The parent's target type
+ * mirrors the child's storage kind (volume or file) and is RAW or VHD
+ * depending on vhd_parent_raw().  Failures are reported through
+ * vhd_util_scan_error() and otherwise ignored.
+ */
+static void
+vhd_util_scan_add_parent(struct iterator *itr,
+			 vhd_context_t *vhd, struct vhd_image *image)
+{
+	uint8_t kind;
+	int rc;
+
+	if (vhd_parent_raw(vhd)) {
+		if (target_volume(image->target->type))
+			kind = VHD_TYPE_RAW_VOLUME;
+		else
+			kind = VHD_TYPE_RAW_FILE;
+	} else {
+		if (target_volume(image->target->type))
+			kind = VHD_TYPE_VHD_VOLUME;
+		else
+			kind = VHD_TYPE_VHD_FILE;
+	}
+
+	rc = iterator_add(itr, image->parent, kind);
+	if (rc)
+		vhd_util_scan_error(image->parent, rc);
+}
+
+/*
+ * Scan every target in @targets (plus, with VHD_SCAN_PARENTS, the parents
+ * discovered along the way) and print one record per image.
+ *
+ * For each target the image is opened and its size, hidden flag and (for
+ * differencing disks) parent are collected; the first failing step jumps
+ * to 'end' so the image is still printed and cleaned up.
+ *
+ * Returns the last step's error, or, with VHD_SCAN_NOFAIL, -EAGAIN if any
+ * image failed and 0 otherwise.
+ */
+static int
+vhd_util_scan_targets(int cnt, struct target *targets)
+{
+ int ret, err;
+ vhd_context_t vhd;
+ struct iterator itr;
+ struct target *target;
+ struct vhd_image image;
+
+ ret = 0;
+ err = 0;
+
+ err = iterator_init(&itr, cnt, targets);
+ if (err)
+ return err;
+
+ while ((target = iterator_next(&itr))) {
+ memset(&vhd, 0, sizeof(vhd));
+ memset(&image, 0, sizeof(image));
+
+ image.target = target;
+
+ /* vhd_util_scan_open records message/error in the image itself */
+ err = vhd_util_scan_open(&vhd, &image);
+ if (err) {
+ ret = -EAGAIN;
+ goto end;
+ }
+
+ err = vhd_util_scan_get_size(&vhd, &image);
+ if (err) {
+ ret = -EAGAIN;
+ image.message = "getting physical size";
+ image.error = err;
+ goto end;
+ }
+
+ err = vhd_util_scan_get_hidden(&vhd, &image);
+ if (err) {
+ ret = -EAGAIN;
+ image.message = "checking 'hidden' field";
+ image.error = err;
+ goto end;
+ }
+
+ /* only differencing disks have a parent to look up */
+ if (vhd.footer.type == HD_TYPE_DIFF) {
+ err = vhd_util_scan_get_parent(&vhd, &image);
+ if (err) {
+ ret = -EAGAIN;
+ image.message = "getting parent";
+ image.error = err;
+ goto end;
+ }
+ }
+
+ end:
+ vhd_util_scan_print_image(&image);
+
+ /*
+ * NOTE(review): adding a parent can grow itr.targets via realloc
+ * (see iterator_add), after which 'target' may refer to the old
+ * array.  Below only the address of target->name is compared --
+ * confirm 'name' is an array member so nothing is dereferenced.
+ */
+ if (flags & VHD_SCAN_PARENTS && image.parent)
+ vhd_util_scan_add_parent(&itr, &vhd, &image);
+
+ if (vhd.file)
+ vhd_close(&vhd);
+ /* image.name is heap memory only when it is not the target's name */
+ if (image.name != target->name)
+ free(image.name);
+ free(image.parent);
+
+ /* without VHD_SCAN_NOFAIL the first error ends the scan */
+ if (err && !(flags & VHD_SCAN_NOFAIL))
+ break;
+ }
+
+ iterator_free(&itr);
+
+ if (flags & VHD_SCAN_NOFAIL)
+ return ret;
+
+ return err;
+}
+
+/*
+ * Pretty-print variant of the scan: allocate the image list, run the
+ * regular scan, then print and release the collected list.
+ * Returns -ENOMEM if the list cannot be allocated; otherwise the scan
+ * result, forced to 0 when VHD_SCAN_NOFAIL is set.
+ */
+static int
+vhd_util_scan_targets_pretty(int cnt, struct target *targets)
+{
+	int rc;
+
+	if (vhd_util_scan_pretty_allocate_list(cnt)) {
+		printf("scan failed: no memory\n");
+		return -ENOMEM;
+	}
+
+	rc = vhd_util_scan_targets(cnt, targets);
+
+	vhd_util_scan_pretty_print_images();
+	vhd_util_scan_pretty_free_list();
+
+	if (flags & VHD_SCAN_NOFAIL)
+		return 0;
+
+	return rc;
+}
+
+/*
+ * Build the list of file targets to scan: first every path matching the
+ * glob pattern @filter (if given), then the @cnt explicit @names.
+ *
+ * On success *_targets receives a calloc'ed array of *_total entries that
+ * the caller must free; when there is nothing to scan both stay NULL / 0
+ * and 0 is returned.  With VHD_SCAN_NOFAIL, entries that fail to
+ * initialise are reported and left zeroed instead of aborting.
+ *
+ * Returns 0 or a negative error code.
+ */
+static int
+vhd_util_scan_find_file_targets(int cnt, char **names,
+				const char *filter,
+				struct target **_targets, int *_total)
+{
+	glob_t g;
+	struct target *targets;
+	int i, globs, err, total;
+
+	total     = cnt;
+	globs     = 0;
+	targets   = NULL;
+	*_total   = 0;
+	*_targets = NULL;
+
+	memset(&g, 0, sizeof(g));
+
+	if (filter) {
+		int gflags = ((flags & VHD_SCAN_FAST) ? GLOB_NOSORT : 0);
+
+		errno = 0;
+		err   = glob(filter, gflags, vhd_util_scan_error, &g);
+
+		switch (err) {
+		case GLOB_NOSPACE:
+			err = -ENOMEM;
+			break;
+		case GLOB_ABORTED:
+			err = -EIO;
+			break;
+		case GLOB_NOMATCH:
+			/* an empty match without errno is not an error */
+			err = -errno;
+			break;
+		}
+
+		if (err) {
+			vhd_util_scan_error(filter, err);
+			/* leave through 'out' so partial glob results
+			 * are released */
+			goto out;
+		}
+
+		globs  = g.gl_pathc;
+		total += globs;
+	}
+
+	/* calloc(0, ...) may legitimately return NULL; do not mistake an
+	 * empty target list for an allocation failure */
+	if (!total) {
+		err = 0;
+		goto out;
+	}
+
+	targets = calloc(total, sizeof(struct target));
+	if (!targets) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < globs; i++) {
+		err = vhd_util_scan_init_file_target(targets + i,
+						     g.gl_pathv[i],
+						     VHD_TYPE_VHD_FILE);
+		if (err) {
+			vhd_util_scan_error(g.gl_pathv[i], err);
+			if (!(flags & VHD_SCAN_NOFAIL))
+				goto out;
+		}
+	}
+
+	for (i = 0; i + globs < total; i++) {
+		err = vhd_util_scan_init_file_target(targets + i + globs,
+						     names[i],
+						     VHD_TYPE_VHD_FILE);
+		if (err) {
+			vhd_util_scan_error(names[i], err);
+			if (!(flags & VHD_SCAN_NOFAIL))
+				goto out;
+		}
+	}
+
+	err       = 0;
+	*_total   = total;
+	*_targets = targets;
+
+out:
+	if (err)
+		free(targets);
+	if (filter)
+		globfree(&g);
+
+	return err;
+}
+
+/*
+ * Exchange the entries at indices @dst and @src of the @lvs array.
+ * Does nothing when both indices are equal.
+ */
+static inline void
+swap_volume(struct lv *lvs, int dst, int src)
+{
+	struct lv copy, *ldst, *lsrc;
+
+	if (dst == src)
+		return;
+
+	lsrc = lvs + src;
+	ldst = lvs + dst;
+
+	/* three-way swap through a temporary (the "&copy" arguments had
+	 * been mangled into a copyright sign) */
+	memcpy(&copy, ldst, sizeof(copy));
+	memcpy(ldst, lsrc, sizeof(*ldst));
+	memcpy(lsrc, &copy, sizeof(copy));
+}
+
+/*
+ * Move every volume whose name matches @filter to the front of @lvs and
+ * report how many matched in *_matches.  Without a filter nothing is
+ * moved and 0 matches are reported.
+ *
+ * A matching error (anything but "no match") is reported; unless
+ * VHD_SCAN_NOFAIL is set it is also returned immediately, leaving
+ * *_matches at 0.
+ */
+static int
+vhd_util_scan_sort_volumes(struct lv *lvs, int cnt,
+			   const char *filter, int *_matches)
+{
+	int idx, rc, found;
+
+	found     = 0;
+	*_matches = 0;
+
+	if (filter == NULL)
+		return 0;
+
+	for (idx = 0; idx < cnt; idx++) {
+		struct lv *cur = &lvs[idx];
+
+		rc = fnmatch(filter, cur->name, FNM_PATHNAME);
+		if (rc == 0) {
+			swap_volume(lvs, found, idx);
+			found++;
+			continue;
+		}
+
+		if (rc == FNM_NOMATCH)
+			continue;
+
+		vhd_util_scan_error(cur->name, rc);
+		if (!(flags & VHD_SCAN_NOFAIL))
+			return rc;
+	}
+
+	*_matches = found;
+	return 0;
+}
+
+/*
+ * Build the list of volume targets to scan inside volume group @volume.
+ *
+ * The group's volumes are partitioned in place: those matching @filter
+ * come first, followed by those matching each of the @cnt patterns in
+ * @names.  One target is created for every selected volume.
+ *
+ * On success *_targets receives a calloc'ed array of *_total entries that
+ * the caller must free; when nothing matches both stay NULL / 0 and 0 is
+ * returned.  With VHD_SCAN_NOFAIL, volumes that cannot be turned into a
+ * target are reported and left zeroed instead of aborting.
+ */
+static int
+vhd_util_scan_find_volume_targets(int cnt, char **names,
+				  const char *volume, const char *filter,
+				  struct target **_targets, int *_total)
+{
+	struct target *targets;
+	int i, err, total, matches;
+
+	*_total   = 0;
+	*_targets = NULL;
+	targets   = NULL;
+
+	err = lvm_scan_vg(volume, &vg);
+	if (err)
+		return err;
+
+	err = vhd_util_scan_sort_volumes(vg.lvs, vg.lv_cnt,
+					 filter, &matches);
+	if (err)
+		goto out;
+
+	/* each further pattern only looks at the not-yet-selected tail */
+	total = matches;
+	for (i = 0; i < cnt; i++) {
+		err = vhd_util_scan_sort_volumes(vg.lvs + total,
+						 vg.lv_cnt - total,
+						 names[i], &matches);
+		if (err)
+			goto out;
+
+		total += matches;
+	}
+
+	/* calloc(0, ...) may legitimately return NULL; do not mistake an
+	 * empty selection for an allocation failure */
+	if (!total) {
+		err = 0;
+		goto out;
+	}
+
+	targets = calloc(total, sizeof(struct target));
+	if (!targets) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < total; i++) {
+		err = vhd_util_scan_init_volume_target(targets + i,
+						       vg.lvs + i,
+						       VHD_TYPE_VHD_VOLUME);
+		if (err) {
+			vhd_util_scan_error(vg.lvs[i].name, err);
+			if (!(flags & VHD_SCAN_NOFAIL))
+				goto out;
+		}
+	}
+
+	err       = 0;
+	*_total   = total;
+	*_targets = targets;
+
+out:
+	if (err)
+		free(targets);
+	return err;
+}
+
+/*
+ * Collect the scan targets, from a volume group when VHD_SCAN_VOLUME is
+ * set and from the file system otherwise.
+ */
+static int
+vhd_util_scan_find_targets(int cnt, char **names,
+			   const char *volume, const char *filter,
+			   struct target **targets, int *total)
+{
+	if (!(flags & VHD_SCAN_VOLUME))
+		return vhd_util_scan_find_file_targets(cnt, names,
+						       filter, targets, total);
+
+	return vhd_util_scan_find_volume_targets(cnt, names,
+						 volume, filter,
+						 targets, total);
+}
+
+/*
+ * Entry point of the 'scan' command.
+ *
+ * Options: -m <filter>, -f (fast), -c (continue on failure), -l <volume
+ * group>, -p (pretty print), -a (scan parents), -v (verbose), -h (help).
+ * At least a filter or one name argument is required.  Pretty printing
+ * disables fast mode.
+ *
+ * Returns 0 on success (always 0 after a completed scan when -c is
+ * given), or a negative error code.
+ */
+int
+vhd_util_scan(int argc, char **argv)
+{
+	int c, err, cnt;
+	char *filter, *volume;
+	struct target *targets;
+
+	cnt     = 0;
+	err     = 0;
+	flags   = 0;
+	filter  = NULL;
+	volume  = NULL;
+	targets = NULL;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, "m:fcl:pavh")) != -1) {
+		switch (c) {
+		case 'm':
+			filter = optarg;
+			break;
+		case 'f':
+			flags |= VHD_SCAN_FAST;
+			break;
+		case 'c':
+			flags |= VHD_SCAN_NOFAIL;
+			break;
+		case 'l':
+			volume = optarg;
+			flags |= VHD_SCAN_VOLUME;
+			break;
+		case 'p':
+			flags |= VHD_SCAN_PRETTY;
+			break;
+		case 'a':
+			flags |= VHD_SCAN_PARENTS;
+			break;
+		case 'v':
+			flags |= VHD_SCAN_VERBOSE;
+			break;
+		case 'h':
+			goto usage;
+		default:
+			err = -EINVAL;
+			goto usage;
+		}
+	}
+
+	if (!filter && argc - optind == 0) {
+		err = -EINVAL;
+		goto usage;
+	}
+
+	if (flags & VHD_SCAN_PRETTY)
+		flags &= ~VHD_SCAN_FAST;
+
+	err = vhd_util_scan_find_targets(argc - optind, argv + optind,
+					 volume, filter, &targets, &cnt);
+	if (err) {
+		printf("scan failed: %d\n", err);
+		return err;
+	}
+
+	/*
+	 * An empty target list is not an error, but it must still go
+	 * through the common cleanup below; the early return used here
+	 * before leaked the target array and the scanned volume group.
+	 */
+	if (cnt) {
+		if (flags & VHD_SCAN_PRETTY)
+			err = vhd_util_scan_targets_pretty(cnt, targets);
+		else
+			err = vhd_util_scan_targets(cnt, targets);
+	}
+
+	free(targets);
+	lvm_free_vg(&vg);
+
+	return ((flags & VHD_SCAN_NOFAIL) ? 0 : err);
+
+usage:
+	printf("usage: [OPTIONS] FILES\n"
+	       "options: [-m match filter] [-f fast] [-c continue on failure] "
+	       "[-l LVM volume] [-p pretty print] [-a scan parents] "
+	       "[-v verbose] [-h help]\n");
+	return err;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Entry point of the 'set' command: set a footer field of a VHD.
+ *
+ * Required options: -n <name>, -f <field>, -v <value>.  The only field
+ * accepted is "hidden", with a decimal value between 0 and 255.
+ *
+ * Returns 0 on success, -EINVAL on bad usage, or the error from opening
+ * the VHD / writing its footer.
+ */
+int
+vhd_util_set_field(int argc, char **argv)
+{
+	long value;
+	int err, c;
+	vhd_context_t vhd;
+	char *name, *field, *end;
+
+	/* err stays -EINVAL until a -v option has been parsed */
+	err   = -EINVAL;
+	value = 0;
+	name  = NULL;
+	field = NULL;
+
+	if (!argc || !argv)
+		goto usage;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, "n:f:v:h")) != -1) {
+		switch (c) {
+		case 'n':
+			name = optarg;
+			break;
+		case 'f':
+			field = optarg;
+			break;
+		case 'v':
+			/* reject empty, non-numeric and trailing-garbage
+			 * input instead of silently treating it as 0 */
+			errno = 0;
+			value = strtol(optarg, &end, 10);
+			if (errno || end == optarg || *end != '\0') {
+				printf("invalid value %s\n", optarg);
+				goto usage;
+			}
+			err = 0;
+			break;
+		case 'h':
+		default:
+			goto usage;
+		}
+	}
+
+	if (!name || !field || optind != argc || err)
+		goto usage;
+
+	if (strnlen(field, 25) >= 25) {
+		printf("invalid field\n");
+		goto usage;
+	}
+
+	if (strcmp(field, "hidden")) {
+		printf("invalid field %s\n", field);
+		goto usage;
+	}
+
+	if (value < 0 || value > 255) {
+		printf("invalid value %ld\n", value);
+		goto usage;
+	}
+
+	err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+	if (err) {
+		printf("error opening %s: %d\n", name, err);
+		return err;
+	}
+
+	vhd.footer.hidden = (char)value;
+
+	err = vhd_write_footer(&vhd, &vhd.footer);
+
+	vhd_close(&vhd);
+	return err;
+
+usage:
+	printf("options: <-n name> <-f field> <-v value> [-h help]\n");
+	return -EINVAL;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Find the image a new snapshot of @name should actually be based on.
+ *
+ * Starting at @name, differencing disks whose BAT has no allocated block
+ * are skipped in favour of their parent, until an image with data, a
+ * non-differencing image, or a raw parent is reached.
+ *
+ * On success *result receives a heap-allocated path the caller must free
+ * and *parent_raw is set to 1 if that path refers to a raw parent.
+ * Returns 0 or a negative error code (*result is NULL on error).
+ */
+static int
+vhd_util_find_snapshot_target(const char *name, char **result, int *parent_raw)
+{
+	int i, err;
+	char *target;
+	vhd_context_t vhd;
+
+	*parent_raw = 0;
+	*result     = NULL;
+
+	target = strdup(name);
+	if (!target)
+		return -ENOMEM;
+
+	for (;;) {
+		err = vhd_open(&vhd, target, VHD_OPEN_RDONLY);
+		if (err) {
+			/* nothing is open yet, but the path is ours to free */
+			free(target);
+			return err;
+		}
+
+		if (vhd.footer.type != HD_TYPE_DIFF)
+			goto out;
+
+		err = vhd_get_bat(&vhd);
+		if (err)
+			goto out;
+
+		/* any allocated block means this image carries data */
+		for (i = 0; i < vhd.bat.entries; i++)
+			if (vhd.bat.bat[i] != DD_BLK_UNUSED)
+				goto out;
+
+		/* empty differencing disk: continue with its parent.  Clear
+		 * the pointer so a failed lookup cannot lead to a second
+		 * free() of the old path at 'out'. */
+		free(target);
+		target = NULL;
+		err = vhd_parent_locator_get(&vhd, &target);
+		if (err)
+			goto out;
+
+		if (vhd_parent_raw(&vhd)) {
+			*parent_raw = 1;
+			goto out;
+		}
+
+		vhd_close(&vhd);
+	}
+
+out:
+	vhd_close(&vhd);
+	if (err)
+		free(target);
+	else
+		*result = target;
+
+	return err;
+}
+
+/*
+ * Open @name read-only and store the result of vhd_chain_depth() in
+ * *depth.  Returns 0 or the error from opening / querying the image.
+ */
+static int
+vhd_util_check_depth(const char *name, int *depth)
+{
+	vhd_context_t ctx;
+	int rc;
+
+	rc = vhd_open(&ctx, name, VHD_OPEN_RDONLY);
+	if (rc == 0) {
+		rc = vhd_chain_depth(&ctx, depth);
+		vhd_close(&ctx);
+	}
+
+	return rc;
+}
+
+/*
+ * Entry point of the 'snapshot' command: create a new VHD named by -n
+ * whose parent is the image given by -p.
+ *
+ * Options: -n <name>, -p <parent>, -l <depth limit>, -m (parent is raw),
+ * -h (help).  Returns 0 on success or a negative error code.
+ */
+int
+vhd_util_snapshot(int argc, char **argv)
+{
+ vhd_flag_creat_t flags;
+ int c, err, prt_raw, limit;
+ char *name, *pname, *ppath, *backing;
+ uint64_t size;
+ vhd_context_t vhd;
+
+ name = NULL;
+ pname = NULL;
+ ppath = NULL;
+ backing = NULL;
+ size = 0;
+ flags = 0;
+ limit = 0;
+
+ if (!argc || !argv) {
+ err = -EINVAL;
+ goto usage;
+ }
+
+ optind = 0;
+ while ((c = getopt(argc, argv, "n:p:l:mh")) != -1) {
+ switch (c) {
+ case 'n':
+ name = optarg;
+ break;
+ case 'p':
+ pname = optarg;
+ break;
+ case 'l':
+ limit = strtol(optarg, NULL, 10);
+ break;
+ case 'm':
+ vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW);
+ break;
+ case 'h':
+ err = 0;
+ goto usage;
+ default:
+ err = -EINVAL;
+ goto usage;
+ }
+ }
+
+ if (!name || !pname || optind != argc) {
+ err = -EINVAL;
+ goto usage;
+ }
+
+ /* resolve the parent to an absolute path; ppath is heap memory */
+ ppath = realpath(pname, NULL);
+ if (!ppath)
+ return -errno;
+
+ if (vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW)) {
+ /* raw parents are used as given */
+ backing = strdup(ppath);
+ if (!backing) {
+ err = -ENOMEM;
+ goto out;
+ }
+ } else {
+ /* the image actually used as parent may differ from ppath */
+ err = vhd_util_find_snapshot_target(ppath, &backing, &prt_raw);
+ if (err) {
+ backing = NULL;
+ goto out;
+ }
+
+ /*
+ * if the sizes of the parent chain are non-uniform, we need to
+ * pick the right size: that of the supplied parent
+ */
+ if (strcmp(ppath, backing)) {
+ err = vhd_open(&vhd, ppath, VHD_OPEN_RDONLY);
+ if (err)
+ goto out;
+ size = vhd.footer.curr_size;
+ vhd_close(&vhd);
+ }
+
+ if (prt_raw)
+ vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW);
+ }
+
+ /* the depth limit is only checked when the parent is not raw */
+ if (limit && !vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW)) {
+ int depth;
+
+ err = vhd_util_check_depth(backing, &depth);
+ if (err)
+ printf("error checking snapshot depth: %d\n", err);
+ else if (depth + 1 > limit) {
+ err = -ENOSPC;
+ printf("snapshot depth exceeded: "
+ "current depth: %d, limit: %d\n", depth, limit);
+ }
+
+ if (err)
+ goto out;
+ }
+
+ /* size stays 0 unless it was taken from the supplied parent above */
+ err = vhd_snapshot(name, size, backing, flags);
+
+out:
+ free(ppath);
+ free(backing);
+
+ return err;
+
+usage:
+ printf("options: <-n name> <-p parent name> [-l snapshot depth limit]"
+ " [-m parent_is_raw] [-h help]\n");
+ return err;
+}
--- /dev/null
+ /* Copyright (c) 2008, XenSource Inc.
+ * Copyright (c) 2011, Citrix
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#if defined(__linux__)
+
+#include <uuid/uuid.h>
+
+/* Linux: thin wrappers around libuuid; the uuid_t is wrapped in a struct. */
+typedef struct {
+	uuid_t uuid;
+} vhd_uuid_t;
+
+/* Returns the result of uuid_is_null() for @uuid. */
+int vhd_uuid_is_nil(vhd_uuid_t *uuid)
+{
+	return uuid_is_null(uuid->uuid);
+}
+
+/* Fill @uuid with a newly generated UUID. */
+void vhd_uuid_generate(vhd_uuid_t *uuid)
+{
+	uuid_generate(uuid->uuid);
+}
+
+/*
+ * Write the textual form of @uuid to @out, which holds @size bytes.
+ *
+ * uuid_unparse() always produces 36 characters plus the terminator, so
+ * the text is built in a local buffer and copied out with truncation
+ * instead of being written straight into a buffer of unchecked size.
+ * Nothing is written when @out is NULL or @size is 0.
+ */
+void vhd_uuid_to_string(vhd_uuid_t *uuid, char *out, size_t size)
+{
+	char text[37];
+	size_t i;
+
+	if (!out || !size)
+		return;
+
+	uuid_unparse(uuid->uuid, text);
+
+	for (i = 0; i + 1 < size && text[i] != '\0'; i++)
+		out[i] = text[i];
+	out[i] = '\0';
+}
+
+/* Parse the string @in into @uuid; a parse failure is not reported. */
+void vhd_uuid_from_string(vhd_uuid_t *uuid, const char *in)
+{
+	uuid_parse(in, uuid->uuid);
+}
+
+/* Copy @src into @dst. */
+void vhd_uuid_copy(vhd_uuid_t *dst, vhd_uuid_t *src)
+{
+	uuid_copy(dst->uuid, src->uuid);
+}
+
+/* Reset @uuid via uuid_clear(). */
+void vhd_uuid_clear(vhd_uuid_t *uuid)
+{
+	uuid_clear(uuid->uuid);
+}
+
+/* Returns the result of uuid_compare() for the two UUIDs. */
+int vhd_uuid_compare(vhd_uuid_t *uuid1, vhd_uuid_t *uuid2)
+{
+	return uuid_compare(uuid1->uuid, uuid2->uuid);
+}
+
+#elif defined(__NetBSD__)
+
+#include <uuid.h>
+#include <string.h>
+#include <stdlib.h>
+
+/* NetBSD: thin wrappers around the libc uuid(3) interface. */
+typedef uuid_t vhd_uuid_t;
+
+/* Returns the result of uuid_is_nil() for @uuid; the status is ignored. */
+int vhd_uuid_is_nil(vhd_uuid_t *uuid)
+{
+	uint32_t status;
+	return uuid_is_nil((uuid_t *)uuid, &status);
+}
+
+/* Fill @uuid via uuid_create(); the status is ignored. */
+void vhd_uuid_generate(vhd_uuid_t *uuid)
+{
+	uint32_t status;
+	uuid_create((uuid_t *)uuid, &status);
+}
+
+/*
+ * Write the textual form of @uuid to @out, which holds @size bytes.
+ * If the conversion yields no string, @out is set to the empty string
+ * instead of passing a NULL pointer to strlcpy().
+ */
+void vhd_uuid_to_string(vhd_uuid_t *uuid, char *out, size_t size)
+{
+	uint32_t status;
+	char *_out = NULL;
+
+	uuid_to_string((uuid_t *)uuid, &_out, &status);
+	if (!_out) {
+		if (size)
+			out[0] = '\0';
+		return;
+	}
+
+	strlcpy(out, _out, size);
+	free(_out);
+}
+
+/* Parse the string @in into @uuid; the status is ignored. */
+void vhd_uuid_from_string(vhd_uuid_t *uuid, const char *in)
+{
+	uint32_t status;
+	uuid_from_string(in, (uuid_t *)uuid, &status);
+}
+
+/* Copy @src into @dst byte for byte. */
+void vhd_uuid_copy(vhd_uuid_t *dst, vhd_uuid_t *src)
+{
+	memcpy((uuid_t *)dst, (uuid_t *)src, sizeof(uuid_t));
+}
+
+/* Zero every byte of @uuid. */
+void vhd_uuid_clear(vhd_uuid_t *uuid)
+{
+	memset((uuid_t *)uuid, 0, sizeof(uuid_t));
+}
+
+/* Returns the result of uuid_compare(); the status is ignored. */
+int vhd_uuid_compare(vhd_uuid_t *uuid1, vhd_uuid_t *uuid2)
+{
+	uint32_t status;
+	return uuid_compare((uuid_t *)uuid1, (uuid_t *)uuid2, &status);
+}
+
+#else
+
+#error "Please update vhd-util-uuid.c for your OS"
+
+#endif
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Before updating a VHD file, we create a journal consisting of:
+ * - all data at the beginning of the file, up to and including the BAT
+ * - each allocated bitmap (existing at the same offset in the journal as
+ * its corresponding bitmap in the original file)
+ * Updates are performed in place by writing appropriately
+ * transformed versions of journaled bitmaps to the original file.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "atomicio.h"
+#include "libvhd.h"
+#include "libvhd-journal.h"
+
+/*
+ * Print the command-line synopsis to stdout and terminate the process
+ * with exit status EINVAL.
+ */
+static void
+usage(void)
+{
+	fputs("usage: vhd-update <-n name> [-j existing journal] [-h]\n",
+	      stdout);
+	exit(EINVAL);
+}
+
+/*
+ * update vhd creator version to reflect its new bitmap ordering
+ */
+/*
+ * Stamp the footer with creator version 1.1 and write it back to the
+ * file.  Returns the result of vhd_write_footer().
+ */
+static inline int
+update_creator_version(vhd_journal_t *journal)
+{
+	vhd_context_t *ctx = &journal->vhd;
+
+	ctx->footer.crtr_ver = VHD_VERSION(1, 1);
+	return vhd_write_footer(ctx, &ctx->footer);
+}
+
+/*
+ * Add the metadata of every BAT entry to the journal.  Stops at, and
+ * returns, the first error from vhd_journal_add_block(); returns 0 when
+ * all entries were added.
+ */
+static int
+journal_bitmaps(vhd_journal_t *journal)
+{
+	int blk = 0;
+
+	while (blk < journal->vhd.bat.entries) {
+		int rc = vhd_journal_add_block(journal, blk,
+					       VHD_JOURNAL_METADATA);
+		if (rc)
+			return rc;
+		blk++;
+	}
+
+	return 0;
+}
+
+/*
+ * older VHD bitmaps were little endian
+ * and bits within a word were set from right to left
+ */
+/*
+ * Test bit @nr of the old-format bitmap at @addr.  The bitmap is read as
+ * an array of unsigned long words; within a word, bit 0 is the least
+ * significant bit.  Returns 1 if the bit is set, 0 otherwise.
+ */
+static inline int
+old_test_bit(int nr, volatile void * addr)
+{
+	const size_t word_bits = sizeof(unsigned long) * 8;
+	unsigned long word     = ((unsigned long *)addr)[nr / word_bits];
+
+	return (word >> (nr % word_bits)) & 1;
+}
+
+/*
+ * new VHD bitmaps are big endian
+ * and bits within a word are set from left to right
+ */
+#define BIT_MASK 0x80
+/*
+ * Set bit @nr of the new-format bitmap at @addr.  Bits are addressed per
+ * byte, with bit 0 being the most significant bit of the byte.
+ */
+static inline void
+new_set_bit (int nr, volatile char *addr)
+{
+	int byte  = nr >> 3;
+	int shift = nr & 7;
+
+	addr[byte] |= (BIT_MASK >> shift);
+}
+
+/*
+ * Translate an old-format bitmap of @bytes bytes at @in into the new
+ * format at @out: @out is cleared, then every bit set in @in (old bit
+ * order) is set at the same bit number in @out (new bit order).
+ */
+static void
+convert_bitmap(char *in, char *out, int bytes)
+{
+	int bit;
+
+	memset(out, 0, bytes);
+
+	for (bit = 0; bit < bytes << 3; bit++) {
+		if (!old_test_bit(bit, (void *)in))
+			continue;
+
+		new_set_bit(bit, out);
+	}
+}
+
+/*
+ * Rewrite the bitmap of every allocated block of the journalled VHD.
+ *
+ * Each bitmap is read, converted to the new bit order (or, when @rollback
+ * is non-zero, copied unchanged) and written back in place.
+ *
+ * Returns 0 on success or a negative error code; processing stops at the
+ * first failure.
+ */
+static int
+update_vhd(vhd_journal_t *journal, int rollback)
+{
+	int i, err;
+	size_t size;
+	char *buf, *converted;
+
+	buf       = NULL;
+	converted = NULL;
+
+	size = vhd_bytes_padded(journal->vhd.spb / 8);
+	err  = posix_memalign((void **)&converted, 512, size);
+	if (err) {
+		/* posix_memalign reports a positive errno value; negate it
+		 * so this path agrees with the negative codes used by the
+		 * rest of the tool */
+		converted = NULL;
+		err       = -err;
+		goto out;
+	}
+
+	for (i = 0; i < journal->vhd.bat.entries; i++) {
+		if (journal->vhd.bat.bat[i] == DD_BLK_UNUSED)
+			continue;
+
+		err = vhd_read_bitmap(&journal->vhd, i, &buf);
+		if (err)
+			goto out;
+
+		if (rollback)
+			memcpy(converted, buf, size);
+		else
+			convert_bitmap(buf, converted, size);
+
+		/* do not keep a stale pointer around between iterations */
+		free(buf);
+		buf = NULL;
+
+		err = vhd_write_bitmap(&journal->vhd, i, converted);
+		if (err)
+			goto out;
+	}
+
+	err = 0;
+ out:
+	free(converted);
+	return err;
+}
+
+/*
+ * Create the journal for @file through vhd_journal_create(), passing
+ * @jfile along.  A failure is reported on stdout and returned.
+ */
+static int
+open_journal(vhd_journal_t *journal, const char *file, const char *jfile)
+{
+	int rc = vhd_journal_create(journal, file, jfile);
+
+	if (rc)
+		printf("error creating journal for %s: %d\n", file, rc);
+
+	return rc;
+}
+
+/*
+ * Finish the journal: revert when @err is non-zero, commit otherwise.
+ * If that step fails vhd_journal_close() is called, if it succeeds
+ * vhd_journal_remove() is called, and the result of that final call is
+ * returned.
+ *
+ * NOTE(review): the incoming @err itself is never returned, so a failed
+ * update followed by a clean revert is reported as success -- confirm
+ * this is intended.
+ */
+static int
+close_journal(vhd_journal_t *journal, int err)
+{
+	int rc;
+
+	rc = err ? vhd_journal_revert(journal)
+		 : vhd_journal_commit(journal);
+
+	return rc ? vhd_journal_close(journal)
+		  : vhd_journal_remove(journal);
+}
+
+/*
+ * vhd-update: convert the block bitmaps of a VHD to the new bit order.
+ *
+ * Options: -n <file> (required), -j <existing journal>, -r (rollback,
+ * requires -j).  Any other option, including -h, prints the usage text
+ * and exits.
+ *
+ * Returns the result of closing the journal, or an earlier error.
+ */
+int
+main(int argc, char **argv)
+{
+ char *file, *jfile;
+ int c, err, rollback;
+ vhd_journal_t journal;
+
+ file = NULL;
+ jfile = NULL;
+ rollback = 0;
+
+ while ((c = getopt(argc, argv, "n:j:rh")) != -1) {
+ switch(c) {
+ case 'n':
+ file = optarg;
+ break;
+ case 'j':
+ /* an explicitly named journal must be readable */
+ jfile = optarg;
+ err = access(jfile, R_OK);
+ if (err == -1) {
+ printf("invalid journal arg %s\n", jfile);
+ return -errno;
+ }
+ break;
+ case 'r':
+ /* add a rollback option for debugging which
+ * pushes journalled bitmaps to original file
+ * without transforming them */
+ rollback = 1;
+ break;
+ default:
+ usage();
+ }
+ }
+
+ if (!file)
+ usage();
+
+ if (rollback && !jfile) {
+ printf("rollback requires a journal argument\n");
+ usage();
+ }
+
+ err = open_journal(&journal, file, jfile);
+ if (err)
+ return err;
+
+ /*
+ * only images created by tapdisk with creator version 0.1 that are
+ * not fixed-size are converted; anything else is left untouched
+ */
+ if (!vhd_creator_tapdisk(&journal.vhd) ||
+ journal.vhd.footer.crtr_ver != VHD_VERSION(0, 1) ||
+ journal.vhd.footer.type == HD_TYPE_FIXED) {
+ err = 0;
+ goto out;
+ }
+
+ err = journal_bitmaps(&journal);
+ if (err) {
+ /* no changes to vhd file yet,
+ * so close the journal and bail */
+ vhd_journal_close(&journal);
+ return err;
+ }
+
+ err = update_vhd(&journal, rollback);
+ if (err) {
+ printf("update failed: %d; saving journal\n", err);
+ goto out;
+ }
+
+ err = update_creator_version(&journal);
+ if (err) {
+ printf("failed to udpate creator version: %d\n", err);
+ goto out;
+ }
+
+ err = 0;
+
+out:
+ /* close_journal reverts when err is set and commits otherwise */
+ err = close_journal(&journal, err);
+ return err;
+}
--- /dev/null
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <langinfo.h>
+#include <locale.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+/*
+ * Debug print helper.  With the "#if 1" branch active every DFPRINTF()
+ * call is forwarded to fprintf(stdout, ...); switching the condition
+ * to 0 turns each call into a no-op expression.
+ */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf(stdout, _f , ##_a)
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/*
+ * Signature shared by every sub-command entry point: it receives an
+ * (argc, argv) pair and returns an int that main() turns into the
+ * process exit status.
+ */
+typedef int (*vhd_util_func_t) (int, char **);
+
+/* One row of the sub-command table: command-line name plus handler. */
+struct command {
+ char *name;
+ vhd_util_func_t func;
+};
+
+/*
+ * Table of supported sub-commands.  get_command() searches it linearly
+ * by name and print_commands() lists it in this order.
+ */
+struct command commands[] = {
+ { .name = "create", .func = vhd_util_create },
+ { .name = "snapshot", .func = vhd_util_snapshot },
+ { .name = "query", .func = vhd_util_query },
+ { .name = "read", .func = vhd_util_read },
+ { .name = "set", .func = vhd_util_set_field },
+ { .name = "repair", .func = vhd_util_repair },
+ { .name = "resize", .func = vhd_util_resize },
+ { .name = "fill", .func = vhd_util_fill },
+ { .name = "coalesce", .func = vhd_util_coalesce },
+ { .name = "modify", .func = vhd_util_modify },
+ { .name = "scan", .func = vhd_util_scan },
+ { .name = "check", .func = vhd_util_check },
+ { .name = "revert", .func = vhd_util_revert },
+};
+
+/*
+ * Print the names in commands[] on a single line of stdout, in the
+ * form "COMMAND := { a | b | ... }".
+ */
+#define print_commands()						\
+	do {								\
+		int idx;						\
+		const int total =					\
+			sizeof(commands) / sizeof(commands[0]);		\
+		printf("COMMAND := { ");				\
+		for (idx = 0; idx < total; idx++)			\
+			printf("%s%s", idx ? " | " : "",		\
+			       commands[idx].name);			\
+		printf(" }\n");						\
+	} while (0)
+
+/* NOTE(review): presumably expands to the declarations of the TEST_FAIL /
+ * ENV_VAR_FAIL tables that main() touches under ENABLE_FAILURE_TESTING --
+ * confirm against the header that defines this macro. */
+TEST_FAIL_EXTERN_VARS;
+
+/*
+ * Print the top-level usage line followed by the list of known
+ * sub-commands, then terminate the process with exit status 0.
+ * Does not return.
+ */
+void
+help(void)
+{
+	fputs("usage: vhd-util COMMAND [OPTIONS]\n", stdout);
+	print_commands();
+	exit(0);
+}
+
+/*
+ * Look up a sub-command by name.
+ *
+ * @command: NUL-terminated name taken from the command line
+ *
+ * Returns the matching entry of commands[], or NULL when the name is
+ * 25 characters or longer or matches no entry.
+ */
+struct command *
+get_command(char *command)
+{
+	struct command *entry;
+	struct command *const end =
+		commands + sizeof(commands) / sizeof(commands[0]);
+
+	/* reject over-long input before comparing against the table */
+	if (strnlen(command, 25) >= 25)
+		return NULL;
+
+	for (entry = commands; entry != end; entry++) {
+		if (strcmp(command, entry->name) == 0)
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * vhd-util entry point: dispatch to the sub-command named by argv[1].
+ *
+ * With no arguments, or with an unknown command name, help() is called
+ * (which prints usage and exits).  Otherwise a private argument vector
+ * is built for the handler: element 0 is the command name, followed by
+ * the remaining command-line arguments with every "--debug" removed
+ * (each occurrence raises the libvhd log level to 1 instead).  The
+ * vector is NULL-terminated, as argv-style consumers conventionally
+ * expect argv[argc] == NULL.
+ *
+ * Returns the absolute value of the handler's result as exit status.
+ */
+int
+main(int argc, char *argv[])
+{
+	char **cargv;
+	struct command *cmd;
+	int cargc, i, cnt, ret;
+
+#ifdef CORE_DUMP
+	/* NOTE(review): header included at block scope; left in place
+	 * to keep the file's dependency block untouched. */
+	#include <sys/resource.h>
+	struct rlimit rlim;
+	rlim.rlim_cur = RLIM_INFINITY;
+	rlim.rlim_max = RLIM_INFINITY;
+	if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+		fprintf(stderr, "setrlimit failed: %d\n", errno);
+#endif
+	setlocale(LC_CTYPE, "");
+
+	if (argc < 2)
+		help();
+
+	cargc = argc - 1;
+	cmd = get_command(argv[1]);
+	if (!cmd) {
+		fprintf(stderr, "invalid COMMAND %s\n", argv[1]);
+		help();
+	}
+
+	/* cargc slots at most, plus one for the terminating NULL */
+	cargv = malloc(sizeof(*cargv) * ((size_t)cargc + 1));
+	if (!cargv)
+		exit(ENOMEM);
+
+	cnt = 1;
+	cargv[0] = cmd->name;
+	for (i = 1; i < cargc; i++) {
+		/* argv[0] is the program and argv[1] the command name,
+		 * so sub-command argument i lives at argv[i + 1] */
+		char *arg = argv[i + 1];
+
+		if (!strcmp(arg, "--debug")) {
+			libvhd_set_log_level(1);
+			continue;
+		}
+
+		cargv[cnt++] = arg;
+	}
+	cargv[cnt] = NULL;
+
+#ifdef ENABLE_FAILURE_TESTING
+	/* arm each failure-injection point whose environment variable
+	 * is present */
+	for (i = 0; i < NUM_FAIL_TESTS; i++) {
+		TEST_FAIL[i] = 0;
+		if (getenv(ENV_VAR_FAIL[i]))
+			TEST_FAIL[i] = 1;
+	}
+#endif // ENABLE_FAILURE_TESTING
+
+	ret = cmd->func(cnt, cargv);
+
+	free(cargv);
+
+	return (ret >= 0 ? ret : -ret);
+}